├── .gitignore ├── PCA_plot_for_dataset.py ├── README.md ├── additional_methods ├── Deep-SVDD │ ├── LICENSE │ ├── README.md │ ├── data │ │ ├── .gitignore │ │ ├── .gitkeep │ │ └── cardio.npz │ ├── imgs │ │ ├── .gitkeep │ │ ├── cifar10.png │ │ └── mnist.png │ ├── log │ │ └── .gitkeep │ ├── requirements.txt │ ├── src │ │ ├── __init__.py │ │ ├── base │ │ │ ├── __init__.py │ │ │ ├── base_dataset.py │ │ │ ├── base_net.py │ │ │ ├── base_trainer.py │ │ │ └── torchvision_dataset.py │ │ ├── datasets │ │ │ ├── OD_dataset.py │ │ │ ├── __init__.py │ │ │ ├── cifar10.py │ │ │ ├── main.py │ │ │ ├── mnist.py │ │ │ └── preprocessing.py │ │ ├── deepSVDD.py │ │ ├── main.py │ │ ├── networks │ │ │ ├── __init__.py │ │ │ ├── cifar10_LeNet.py │ │ │ ├── cifar10_LeNet_elu.py │ │ │ ├── mnist_LeNet.py │ │ │ └── networks.py │ │ ├── optim │ │ │ ├── __init__.py │ │ │ ├── ae_trainer.py │ │ │ └── deepSVDD_trainer.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── collect_results.py │ │ │ ├── config.py │ │ │ └── visualization │ │ │ └── plot_images_grid.py │ └── test_instruction.txt ├── HBOS │ ├── hbos.py │ └── hbos_LICENSE.txt ├── ODIN.py ├── SVDD │ ├── .gitattributes │ ├── .spyproject │ │ └── config │ │ │ ├── backups │ │ │ ├── codestyle.ini.bak │ │ │ ├── encoding.ini.bak │ │ │ ├── vcs.ini.bak │ │ │ └── workspace.ini.bak │ │ │ ├── codestyle.ini │ │ │ ├── defaults │ │ │ ├── defaults-codestyle-0.2.0.ini │ │ │ ├── defaults-encoding-0.2.0.ini │ │ │ ├── defaults-vcs-0.2.0.ini │ │ │ └── defaults-workspace-0.2.0.ini │ │ │ ├── encoding.ini │ │ │ ├── vcs.ini │ │ │ └── workspace.ini │ ├── LICENSE │ ├── README.md │ ├── SECURITY.md │ ├── examples │ │ ├── svdd_example_KPCA.py │ │ ├── svdd_example_PSO.py │ │ ├── svdd_example_confusion_matrix.py │ │ ├── svdd_example_cross_validation.py │ │ ├── svdd_example_grid_search.py │ │ ├── svdd_example_hybrid_data.py │ │ ├── svdd_example_kernel.py │ │ └── svdd_example_unlabeled_data.py │ ├── requirements.txt │ └── src │ │ └── BaseSVDD.py ├── abod.py ├── cof.py ├── ensemble.py ├── gen2out │ ├── bagging.py │ ├── gen2out.py │ ├── iforest.py │ ├── main.py │ └── utils.py ├── lmdd.py ├── sod.py └── wrappers │ ├── AE.py │ ├── ALAD.py │ ├── AnoGAN.py │ ├── ExtendedIForest.py │ ├── HBOS.py │ ├── VAE.py │ └── rrcf.py ├── environment.yml ├── evaluation_metrics.py ├── figures └── .gitignore ├── formatted_data ├── aloi.npz ├── annthyroid.npz ├── arrhythmia.npz ├── breastw.npz ├── campaign.npz ├── cardio.npz ├── cover.npz ├── donors.npz ├── fault.npz ├── glass.npz ├── hepatitis.npz ├── hrss_anomalous_optimized.npz ├── hrss_anomalous_standard.npz ├── http.npz ├── internetads.npz ├── ionosphere.npz ├── landsat.npz ├── letter.npz ├── magic.gamma.npz ├── mammography.npz ├── mi-f.npz ├── mi-v.npz ├── mnist.npz ├── musk.npz ├── nasa.npz ├── optdigits.npz ├── pageblocks.npz ├── parkinson.npz ├── pen-global.npz ├── pen-local.npz ├── pendigits.npz ├── pima.npz ├── satellite.npz ├── satimage-2.npz ├── seismic-bumps.npz ├── shuttle.npz ├── skin.npz ├── smtp.npz ├── spambase.npz ├── speech.npz ├── stamps.npz ├── thyroid.npz ├── vertebral.npz ├── vowels.npz ├── waveform.npz ├── wbc.npz ├── wbc2.npz ├── wilt.npz ├── wine.npz ├── wpbc.npz ├── yeast.npz └── yeast6.npz ├── generate_and_plot_types_of_anomalies.py ├── invert_labels_calculate_metrics.py ├── method_example.py ├── minimal_environment.yml ├── preprocess_detect_outliers.py ├── produce_figures.py ├── raw_data ├── .gitkeep ├── ADBench_data_raw │ └── .gitkeep ├── ELKI_data_raw │ ├── .gitkeep │ ├── Annthyroid │ │ └── .gitkeep │ ├── Arrhythmia │ │ └── .gitkeep │ ├── 
Cardiotocography │ │ └── .gitkeep │ ├── HeartDisease │ │ └── .gitkeep │ ├── Hepatitis │ │ └── .gitkeep │ ├── InternetAds │ │ └── .gitkeep │ ├── PageBlocks │ │ └── .gitkeep │ ├── Parkinson │ │ └── .gitkeep │ ├── Pima │ │ └── .gitkeep │ ├── SpamBase │ │ └── .gitkeep │ ├── Stamps │ │ └── .gitkeep │ └── Wilt │ │ └── .gitkeep ├── GAAL_data_raw │ └── .gitkeep ├── Goldstein_data_raw │ └── .gitkeep ├── ODDS_data_raw │ ├── .gitkeep │ ├── categorical_variables_per_dataset.json │ ├── matfile_data │ │ └── .gitkeep │ └── other_data │ │ └── .gitkeep └── extended_AE_data_raw │ ├── .gitkeep │ └── CNC-kaggle │ └── .gitkeep ├── read_raw_write_in_format.py ├── run_all_methods.py ├── tables └── .gitignore └── testnewmethods.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | results/ 3 | results_temp/ 4 | logs/ 5 | formatted_data_old/ 6 | 7 | raw_data/ADBench_data_raw/*.npz 8 | 9 | 10 | *.pyc 11 | *.log 12 | *.tex 13 | *.pickle 14 | *.arff 15 | *.mat 16 | *.csv 17 | 18 | additional_methods/Deep-SVDD/log/mnist_test/ 19 | 20 | additional_methods/Deep-SVDD/log/musk/ 21 | /raw_data/GAAL_data_raw/Annthyroid 22 | /raw_data/GAAL_data_raw/SpamBase 23 | /raw_data/GAAL_data_raw/WDBC 24 | /raw_data/GAAL_data_raw/Waveform 25 | /raw_data/ODDS_data_raw/other_data/yeast.data 26 | /raw_data/ODDS_data_raw/other_data/yeast.names 27 | /raw_data/extended_AE_data_raw/CNC-kaggle/README.txt 28 | /raw_data/extended_AE_data_raw/CNC-kaggle/test_artifact.jpg 29 | /results.zip 30 | -------------------------------------------------------------------------------- /PCA_plot_for_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Sep 23 08:56:39 2022 5 | 6 | @author: rbouman 7 | """ 8 | 9 | import os 10 | from numpy.linalg import svd 11 | from sklearn.preprocessing import StandardScaler 12 | import pickle 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | import pandas as pd 16 | 17 | pickle_dir = "formatted_data" 18 | base_result_dir = "results" 19 | result_dir = "result_dir" 20 | csvresult_dir = "csvresult_dir" 21 | score_dir = "score_dir" 22 | log_dir = "logs" 23 | 24 | method_name = "EIF" 25 | dataset_name = "musk" 26 | 27 | 28 | 29 | picklefile_name = dataset_name + ".pickle" 30 | 31 | 32 | full_path_filename = os.path.join(pickle_dir, picklefile_name) 33 | 34 | data = pickle.load(open(full_path_filename, 'rb')) 35 | X, y = data["X"], np.squeeze(data["y"]) 36 | 37 | 38 | 39 | score_folder_path = os.path.join(base_result_dir, score_dir, dataset_name, method_name) 40 | 41 | hyperparameter_scores = os.listdir(score_folder_path) 42 | 43 | n_scores = len(hyperparameter_scores) 44 | 45 | score_sums = np.zeros(y.shape) 46 | 47 | for hyperparameter_score in hyperparameter_scores: 48 | full_path_filename = os.path.join(score_folder_path, hyperparameter_score) 49 | 50 | score_sums += pd.read_csv(full_path_filename, names=["scores"])["scores"] 51 | 52 | scores = score_sums/n_scores 53 | 54 | 55 | scaler = StandardScaler() 56 | 57 | X_scaled = scaler.fit_transform(X) 58 | 59 | _, S, Vt = svd(X_scaled, full_matrices=False) 60 | V = Vt.T 61 | 62 | var_explained = S**2 / np.sum(S**2) 63 | 64 | X_PCA = X.dot(V) 65 | #%% make plots 66 | plt.figure() 67 | plt.title("class colored plot: ") 68 | 69 | plt.scatter(X_PCA[y==0,0], X_PCA[y==0,1], label="normal") 70 | plt.scatter(X_PCA[y==1,0], X_PCA[y==1,1], label="outlier") 71 | 72 | plt.xlabel("PC1 " + 
str(var_explained[0]*100) + "% var explained")
73 | plt.ylabel("PC2 " + str(var_explained[1]*100) + "% var explained")
74 | 
75 | plt.legend()
76 | 
77 | plt.show()
78 | 
79 | plt.figure()
80 | plt.title("score colored plot")
81 | 
82 | plt.scatter(X_PCA[:,0], X_PCA[:,1], c=scores)
83 | 
84 | plt.xlabel("PC1 " + str(var_explained[0]*100) + "% var explained")
85 | plt.ylabel("PC2 " + str(var_explained[1]*100) + "% var explained")
86 | 
87 | plt.colorbar()
88 | 
89 | plt.show()
90 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Unsupervised anomaly detection algorithms on real-world data: how many do we need?
2 | This is the repository supplementing our [JMLR paper](https://jmlr.org/papers/v25/23-0570.html).
3 | Currently, this is the largest benchmark of unsupervised anomaly detection algorithms, with 33 algorithms applied to 52 datasets.
4 | 
5 | You can cite our paper as follows:
6 | 
7 | ```
8 | @article{Bouman2024UnsupervisedADComparison,
9 | author = {Roel Bouman and Zaharah Bukhsh and Tom Heskes},
10 | title = {Unsupervised Anomaly Detection Algorithms on Real-world Data: How Many Do We Need?},
11 | journal = {Journal of Machine Learning Research},
12 | year = {2024},
13 | volume = {25},
14 | number = {105},
15 | pages = {1--34},
16 | url = {http://jmlr.org/papers/v25/23-0570.html}
17 | }
18 | ```
19 | 
20 | ## Running the full benchmark
21 | In order to run the full benchmark, you will need to install all dependencies. The easiest way to do this is to create an Anaconda environment from the supplied `.yml` file:
22 | ```
23 | conda env create -f environment.yml
24 | ```
25 | 
26 | Then, activate the environment:
27 | ```
28 | conda activate OD_benchmark
29 | ```
30 | 
31 | Due to permission (read/write) errors, the pip packages in the environment.yml file might not install correctly. In that case, activate the OD_benchmark environment and install these packages manually using `pip install` from within the environment.
32 | 
33 | If you want to run the DeepSVDD benchmark, or use the method in any other way, you also need to install a separate environment for DeepSVDD:
34 | 
35 | ```
36 | cd additional_methods/Deep-SVDD
37 | conda create --name myenv
38 | source activate myenv
39 | while read requirement; do conda install -n myenv --yes $requirement; done < requirements.txt
40 | cd ../..
41 | ```
42 | 
43 | You can replace the Conda environment name `myenv` with any name of your choice, but you will then have to change the name accordingly in the `run_all_methods.py` script.
44 | 
45 | The current installation instructions do not include GPU acceleration for the Tensorflow/PyTorch libraries. Should you wish to use it nonetheless, please follow the installation instructions for your specific system. Make sure to install these in the correct OD_benchmark conda environment.
46 | 
47 | When all dependencies are successfully installed, you can either re-run the preprocessing or make use of the existing preprocessed `.npz` files.
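Each preprocessed dataset is a single `.npz` archive in the `formatted_data` folder. As a quick sanity check you can load one directly with NumPy; this is a minimal sketch, assuming the arrays are stored under the `X` (samples × features) and `y` (0/1 labels) keys described under "Adding datasets" below:
```
import numpy as np

data = np.load("formatted_data/cardio.npz")
X, y = data["X"], np.squeeze(data["y"])

print(X.shape)          # (n_samples, n_features)
print((y == 1).mean())  # fraction of anomalies
```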
48 | 49 | If you want to download all raw data, you can download them from the following sources: 50 | 51 | | **Name** | **Source URL** | **Datasets** | 52 | |-------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 53 | | ADBench | https://github.com/Minqi824/ADBench/tree/main/datasets | 11_donors.npz, 12_fault.npz, 19_landsat.npz, 22_magic.gamma.npz, 33_skin.npz, 42_WBC.npz, 46_WPBC.npz, 47_yeast.npz, 4_breastw.npz, 5_campaign.npz | 54 | | ELKI | https://www.dbs.ifi.lmu.de/research/outlier-evaluation/DAMI/ | Hepatitis_withoutdupl_norm_16.arff, InternetAds_withoutdupl_norm_19.arff, PageBlocks_withoutdupl_norm_09.arff, Parkinson_withoutdupl_norm_75.arff, Stamps_withoutdupl_norm_09.arff, Wilt_withoutdupl_norm_05.arff | 55 | | extended AE | https://www.kaggle.com/datasets/shasun/tool-wear-detection-in-cnc-mill, https://www.kaggle.com/datasets/inIT-OWL/high-storage-system-data-for-energy-optimization, https://www.kaggle.com/datasets/shrutimehta/nasa-asteroids-classification | HRSS_anomalous_optimized.csv, HRSS_anomalous_standard.csv, nasa.csv, and the entire folder: "CNC-kaggle" | 56 | | GAAL | https://github.com/leibinghe/GAAL-based-outlier-detection/blob/master/Data/ | Spambase, Waveform | 57 | | Goldstein | http://dx.doi.org/10.7910/DVN/OPQMVF | aloi-unsupervised-ad.csv, pen-global-unsupervised-ad.csv, pen-local-unsupervised-ad.csv | 58 | | ODDS | http://odds.cs.stonybrook.edu/ | annthyroid.mat, arrhythmia.mat, cardio.mat, cover.mat, glass.mat, http.mat, ionosphere.mat, letter.mat, mammography.mat, mnist.mat, musk.mat, optdigits.mat, pendigits.mat, pima.mat, satellite.mat, satimage-2.mat, shuttle.mat, smtp.mat, speech.mat, thyroid.mat, vertebral.mat, vowels.mat, wbc.mat, wine.mat, and non ".mat" data: seismic-bumps.arff, yeast.data, yeast.names | 59 | 60 | Ensure each of the datasets is put into the correct folder in the `raw_data` folder. 61 | 62 | The raw data can then be processed using the `read_raw_write_in_format.py` script. 63 | 64 | ``` 65 | python3 read_raw_write_in_format.py 66 | ``` 67 | 68 | All methods can then be run using a nice CLI: 69 | 70 | ``` 71 | python3 run_all_methods.py 72 | ``` 73 | 74 | Or alternatively, you can add additional arguments to run only subsets. For example, you only want to run the kNN method on the wine dataset: 75 | 76 | ``` 77 | python3 run_all_methods.py --method kNN --dataset wine 78 | ``` 79 | 80 | As noted in the paper, we've inverted the labels for the `skin` and `vertebral` datasets post-hoc. This can be reproduced by executing the following script: 81 | 82 | ``` 83 | python3 invert_labels_calculate_metrics.py 84 | ``` 85 | 86 | Finally, reproducing the figures and analysis from the paper is then easily done using the following command: 87 | 88 | 89 | ``` 90 | python3 produce_figures.py 91 | ``` 92 | 93 | ## Extending the benchmark 94 | Extending the benchmark is easy! 
95 | You won't need to install all dependencies for this; a minimal set will do:
96 | ```
97 | conda env create -f minimal_environment.yml
98 | ```
99 | 
100 | Then, activate the environment:
101 | ```
102 | conda activate OD_benchmark_minimal
103 | ```
104 | 
105 | ### Adding datasets
106 | Datasets can be added by placing processed data files in the `formatted_data` folder in either `.npz` or `.pickle` format. You can look at the `read_raw_write_in_format.py` script for inspiration. Most importantly, the data can't contain duplicates, and must include the following attributes: `X`, with samples as rows and features as columns, and `y`, a 1-dimensional array with label `0` for each normal sample and `1` for each anomaly.
107 | 
108 | ### Adding methods
109 | Methods are even easier to add: they only need to produce outlier scores according to the PyOD standard. If your implementation follows the scikit-learn API, you can simply modify the following example (also found in `method_example.py`):
110 | ```
111 | from preprocess_detect_outliers import preprocess_detect_outliers
112 | 
113 | from pyod.models.knn import KNN
114 | 
115 | #uninstantiated method class
116 | methods = {
117 | "kNN":KNN
118 | }
119 | 
120 | #dict of methods and parameters
121 | method_parameters = {
122 | "kNN":{"n_neighbors":range(5,31), "method":["mean"]}
123 | }
124 | 
125 | preprocess_detect_outliers(methods, method_parameters)
126 | ```
127 | If you want to add your own method, the class needs to at least possess a `.fit()` method (like the scikit-learn API), and after fitting it must have a `.decision_scores_` attribute that gives an outlier score for each sample in `X`. According to the PyOD standard, a higher outlier score indicates a higher likelihood that a sample is an outlier.
128 | 
129 | 
-------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/LICENSE: --------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2018 Lukas Ruff
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Implementation of Deep SVDD 2 | This repository provides a [PyTorch](https://pytorch.org/) implementation of the *Deep SVDD* method presented in our 3 | ICML 2018 paper ”Deep One-Class Classification”. 4 | 5 | 6 | ## Citation and Contact 7 | You find a PDF of the Deep One-Class Classification ICML 2018 paper at 8 | [http://proceedings.mlr.press/v80/ruff18a.html](http://proceedings.mlr.press/v80/ruff18a.html). 9 | 10 | If you use our work, please also cite the paper: 11 | ``` 12 | @InProceedings{pmlr-v80-ruff18a, 13 | title = {Deep One-Class Classification}, 14 | author = {Ruff, Lukas and Vandermeulen, Robert A. and G{\"o}rnitz, Nico and Deecke, Lucas and Siddiqui, Shoaib A. and Binder, Alexander and M{\"u}ller, Emmanuel and Kloft, Marius}, 15 | booktitle = {Proceedings of the 35th International Conference on Machine Learning}, 16 | pages = {4393--4402}, 17 | year = {2018}, 18 | volume = {80}, 19 | } 20 | ``` 21 | 22 | If you would like to get in touch, please contact [contact@lukasruff.com](mailto:contact@lukasruff.com). 23 | 24 | 25 | ## Abstract 26 | > > Despite the great advances made by deep learning in many machine learning problems, there is a relative dearth of 27 | > > deep learning approaches for anomaly detection. Those approaches which do exist involve networks trained to perform 28 | > > a task other than anomaly detection, namely generative models or compression, which are in turn adapted for use in 29 | > > anomaly detection; they are not trained on an anomaly detection based objective. In this paper we introduce a new 30 | > > anomaly detection method—Deep Support Vector Data Description—, which is trained on an anomaly detection based 31 | > > objective. The adaptation to the deep regime necessitates that our neural network and training procedure satisfy 32 | > > certain properties, which we demonstrate theoretically. We show the effectiveness of our method on MNIST and 33 | > > CIFAR-10 image benchmark datasets as well as on the detection of adversarial examples of GTSRB stop signs. 34 | 35 | 36 | ## Installation 37 | This code is written in `Python 3.7` and requires the packages listed in `requirements.txt`. 38 | 39 | Clone the repository to your local machine and directory of choice: 40 | ``` 41 | git clone https://github.com/lukasruff/Deep-SVDD-PyTorch.git 42 | ``` 43 | 44 | To run the code, we recommend setting up a virtual environment, e.g. using `virtualenv` or `conda`: 45 | 46 | ### `virtualenv` 47 | ``` 48 | # pip install virtualenv 49 | cd 50 | virtualenv myenv 51 | source myenv/bin/activate 52 | pip install -r requirements.txt 53 | ``` 54 | 55 | ### `conda` 56 | ``` 57 | cd 58 | conda create --name myenv 59 | source activate myenv 60 | while read requirement; do conda install -n myenv --yes $requirement; done < requirements.txt 61 | ``` 62 | 63 | 64 | ## Running experiments 65 | 66 | We currently have implemented the MNIST ([http://yann.lecun.com/exdb/mnist/](http://yann.lecun.com/exdb/mnist/)) and 67 | CIFAR-10 ([https://www.cs.toronto.edu/~kriz/cifar.html](https://www.cs.toronto.edu/~kriz/cifar.html)) datasets and 68 | simple LeNet-type networks. 69 | 70 | Have a look into `main.py` for all possible arguments and options. 
71 | 72 | ### MNIST example 73 | ``` 74 | cd 75 | 76 | # activate virtual environment 77 | source myenv/bin/activate # or 'source activate myenv' for conda 78 | 79 | # create folder for experimental output 80 | mkdir log/mnist_test 81 | 82 | # change to source directory 83 | cd src 84 | 85 | # run experiment 86 | python main.py mnist mnist_LeNet ../log/mnist_test ../data --objective one-class --lr 0.0001 --n_epochs 150 --lr_milestone 50 --batch_size 200 --weight_decay 0.5e-6 --pretrain True --ae_lr 0.0001 --ae_n_epochs 150 --ae_lr_milestone 50 --ae_batch_size 200 --ae_weight_decay 0.5e-3 --normal_class 3; 87 | ``` 88 | This example trains a One-Class Deep SVDD model where digit 3 (`--normal_class 3`) is considered to be the normal class. Autoencoder 89 | pretraining is used for parameter initialization. 90 | 91 | ### CIFAR-10 example 92 | ``` 93 | cd 94 | 95 | # activate virtual environment 96 | source myenv/bin/activate # or 'source activate myenv' for conda 97 | 98 | # create folder for experimental output 99 | mkdir log/cifar10_test 100 | 101 | # change to source directory 102 | cd src 103 | 104 | # run experiment 105 | python main.py cifar10 cifar10_LeNet ../log/cifar10_test ../data --objective one-class --lr 0.0001 --n_epochs 150 --lr_milestone 50 --batch_size 200 --weight_decay 0.5e-6 --pretrain True --ae_lr 0.0001 --ae_n_epochs 350 --ae_lr_milestone 250 --ae_batch_size 200 --ae_weight_decay 0.5e-6 --normal_class 3; 106 | ``` 107 | This example trains a One-Class Deep SVDD model where cats (`--normal_class 3`) is considered to be the normal class. 108 | Autoencoder pretraining is used for parameter initialization. 109 | 110 | 111 | ## Examples 112 | 113 | ### MNIST 114 | Example of the 32 most normal (left) and 32 most anomalous (right) test set examples per class on MNIST according to 115 | Deep SVDD anomaly scores. 116 | 117 | ![MNIST](imgs/mnist.png?raw=true "MNIST") 118 | 119 | ### CIFAR-10 120 | Example of the 32 most normal (left) and 32 most anomalous (right) test set examples per class on CIFAR-10 according to 121 | Deep SVDD anomaly scores. 
122 | 123 | ![CIFAR-10](imgs/cifar10.png?raw=true "CIFAR-10") 124 | 125 | 126 | ## License 127 | MIT -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/data/.gitignore: -------------------------------------------------------------------------------- 1 | *.pickle 2 | *.npz 3 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/additional_methods/Deep-SVDD/data/.gitkeep -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/data/cardio.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/additional_methods/Deep-SVDD/data/cardio.npz -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/imgs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/additional_methods/Deep-SVDD/imgs/.gitkeep -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/imgs/cifar10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/additional_methods/Deep-SVDD/imgs/cifar10.png -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/imgs/mnist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/additional_methods/Deep-SVDD/imgs/mnist.png -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/log/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/additional_methods/Deep-SVDD/log/.gitkeep -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/requirements.txt: -------------------------------------------------------------------------------- 1 | requests==2.20.1 2 | numpy==1.15.2 3 | scipy==1.1.0 4 | scikit-learn==0.20.0 5 | certifi==2018.10.15 6 | chardet==3.0.4 7 | Click==7.0 8 | cloudpickle==0.5.6 9 | cycler==0.10.0 10 | idna==2.7 11 | kiwisolver==1.0.1 12 | matplotlib==3.0.1 13 | pandas==0.23.4 14 | Pillow==5.3.0 15 | pyparsing==2.3.0 16 | python-dateutil==2.7.5 17 | pytz==2018.7 18 | six==1.11.0 19 | torch==0.4.1 20 | torchvision==0.2.1 21 | tqdm==4.28.1 22 | ujson==1.35 23 | urllib3==1.24.1 24 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/additional_methods/Deep-SVDD/src/__init__.py 
-------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_dataset import * 2 | from .torchvision_dataset import * 3 | from .base_net import * 4 | from .base_trainer import * 5 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/base/base_dataset.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from torch.utils.data import DataLoader 3 | 4 | 5 | class BaseADDataset(ABC): 6 | """Anomaly detection dataset base class.""" 7 | 8 | def __init__(self, root: str): 9 | super().__init__() 10 | self.root = root # root path to data 11 | 12 | self.n_classes = 2 # 0: normal, 1: outlier 13 | self.normal_classes = None # tuple with original class labels that define the normal class 14 | self.outlier_classes = None # tuple with original class labels that define the outlier class 15 | 16 | self.train_set = None # must be of type torch.utils.data.Dataset 17 | self.test_set = None # must be of type torch.utils.data.Dataset 18 | 19 | @abstractmethod 20 | def loaders(self, batch_size: int, shuffle_train=True, shuffle_test=False, num_workers: int = 0) -> ( 21 | DataLoader, DataLoader): 22 | """Implement data loaders of type torch.utils.data.DataLoader for train_set and test_set.""" 23 | pass 24 | 25 | def __repr__(self): 26 | return self.__class__.__name__ 27 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/base/base_net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | 6 | class BaseNet(nn.Module): 7 | """Base class for all neural networks.""" 8 | 9 | def __init__(self): 10 | super().__init__() 11 | self.logger = logging.getLogger(self.__class__.__name__) 12 | self.rep_dim = None # representation dimensionality, i.e. 
dim of the last layer 13 | 14 | def forward(self, *input): 15 | """ 16 | Forward pass logic 17 | :return: Network output 18 | """ 19 | raise NotImplementedError 20 | 21 | def summary(self): 22 | """Network summary.""" 23 | net_parameters = filter(lambda p: p.requires_grad, self.parameters()) 24 | params = sum([np.prod(p.size()) for p in net_parameters]) 25 | self.logger.info('Trainable parameters: {}'.format(params)) 26 | self.logger.info(self) 27 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/base/base_trainer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from .base_dataset import BaseADDataset 3 | from .base_net import BaseNet 4 | 5 | 6 | class BaseTrainer(ABC): 7 | """Trainer base class.""" 8 | 9 | def __init__(self, optimizer_name: str, lr: float, n_epochs: int, lr_milestones: tuple, batch_size: int, 10 | weight_decay: float, device: str, n_jobs_dataloader: int): 11 | super().__init__() 12 | self.optimizer_name = optimizer_name 13 | self.lr = lr 14 | self.n_epochs = n_epochs 15 | self.lr_milestones = lr_milestones 16 | self.batch_size = batch_size 17 | self.weight_decay = weight_decay 18 | self.device = device 19 | self.n_jobs_dataloader = n_jobs_dataloader 20 | 21 | @abstractmethod 22 | def train(self, dataset: BaseADDataset, net: BaseNet) -> BaseNet: 23 | """ 24 | Implement train method that trains the given network using the train_set of dataset. 25 | :return: Trained net 26 | """ 27 | pass 28 | 29 | @abstractmethod 30 | def test(self, dataset: BaseADDataset, net: BaseNet): 31 | """ 32 | Implement test method that evaluates the test_set of dataset on the given network. 33 | """ 34 | pass 35 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/base/torchvision_dataset.py: -------------------------------------------------------------------------------- 1 | from .base_dataset import BaseADDataset 2 | from torch.utils.data import DataLoader 3 | 4 | 5 | class TorchvisionDataset(BaseADDataset): 6 | """TorchvisionDataset class for datasets already implemented in torchvision.datasets.""" 7 | 8 | def __init__(self, root: str): 9 | super().__init__(root) 10 | 11 | def loaders(self, batch_size: int, shuffle_train=True, shuffle_test=False, num_workers: int = 0) -> ( 12 | DataLoader, DataLoader): 13 | train_loader = DataLoader(dataset=self.train_set, batch_size=batch_size, shuffle=shuffle_train, 14 | num_workers=num_workers) 15 | test_loader = DataLoader(dataset=self.test_set, batch_size=batch_size, shuffle=shuffle_test, 16 | num_workers=num_workers) 17 | return train_loader, test_loader 18 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/datasets/OD_dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import pickle 3 | import numpy as np 4 | from sklearn.preprocessing import RobustScaler 5 | 6 | from base.torchvision_dataset import TorchvisionDataset 7 | 8 | class OD_Dataset(TorchvisionDataset): 9 | 10 | def __init__(self, root: str, normal_class=0): 11 | super().__init__(root) 12 | 13 | 14 | self.root = root 15 | 16 | self.n_classes = 2 # 0: normal, 1: outlier 17 | self.normal_classes = tuple([normal_class]) 18 | self.outlier_classes = [0,1] 19 | self.outlier_classes.remove(normal_class) 20 | 21 | # Subset train set to normal 
class 22 | self.train_set = OD_Base_Dataset(dataset_name=root) 23 | 24 | self.test_set = OD_Base_Dataset(dataset_name=root) 25 | 26 | class OD_Base_Dataset(Dataset): 27 | def __init__(self, dataset_name): 28 | 29 | data = pickle.load(open(dataset_name, 'rb')) 30 | self.X, self.y = data["X"].astype(np.float32), np.squeeze(data["y"]).astype(np.float32) 31 | 32 | scaler = RobustScaler() 33 | 34 | self.X_scaled = scaler.fit_transform(self.X) 35 | 36 | def __len__(self): 37 | return self.X.shape[0] 38 | 39 | def __getitem__(self, idx): 40 | 41 | 42 | return self.X_scaled[idx,:], self.y[idx], idx 43 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import load_dataset 2 | from .mnist import MNIST_Dataset 3 | from .cifar10 import CIFAR10_Dataset 4 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/datasets/cifar10.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Subset 2 | from PIL import Image 3 | from torchvision.datasets import CIFAR10 4 | from base.torchvision_dataset import TorchvisionDataset 5 | from .preprocessing import get_target_label_idx, global_contrast_normalization 6 | 7 | import torchvision.transforms as transforms 8 | 9 | 10 | class CIFAR10_Dataset(TorchvisionDataset): 11 | 12 | def __init__(self, root: str, normal_class=5): 13 | super().__init__(root) 14 | 15 | self.n_classes = 2 # 0: normal, 1: outlier 16 | self.normal_classes = tuple([normal_class]) 17 | self.outlier_classes = list(range(0, 10)) 18 | self.outlier_classes.remove(normal_class) 19 | 20 | # Pre-computed min and max values (after applying GCN) from train data per class 21 | min_max = [(-28.94083453598571, 13.802961825439636), 22 | (-6.681770233365245, 9.158067708230273), 23 | (-34.924463588638204, 14.419298165027628), 24 | (-10.599172931391799, 11.093187820377565), 25 | (-11.945022995801637, 10.628045447867583), 26 | (-9.691969487694928, 8.948326776180823), 27 | (-9.174940012342555, 13.847014686472365), 28 | (-6.876682005899029, 12.282371383343161), 29 | (-15.603507135507172, 15.2464923804279), 30 | (-6.132882973622672, 8.046098172351265)] 31 | 32 | # CIFAR-10 preprocessing: GCN (with L1 norm) and min-max feature scaling to [0,1] 33 | transform = transforms.Compose([transforms.ToTensor(), 34 | transforms.Lambda(lambda x: global_contrast_normalization(x, scale='l1')), 35 | transforms.Normalize([min_max[normal_class][0]] * 3, 36 | [min_max[normal_class][1] - min_max[normal_class][0]] * 3)]) 37 | 38 | target_transform = transforms.Lambda(lambda x: int(x in self.outlier_classes)) 39 | 40 | train_set = MyCIFAR10(root=self.root, train=True, download=True, 41 | transform=transform, target_transform=target_transform) 42 | # Subset train set to normal class 43 | train_idx_normal = get_target_label_idx(train_set.train_labels, self.normal_classes) 44 | self.train_set = Subset(train_set, train_idx_normal) 45 | 46 | self.test_set = MyCIFAR10(root=self.root, train=False, download=True, 47 | transform=transform, target_transform=target_transform) 48 | 49 | 50 | class MyCIFAR10(CIFAR10): 51 | """Torchvision CIFAR10 class with patch of __getitem__ method to also return the index of a data sample.""" 52 | 53 | def __init__(self, *args, **kwargs): 54 | super(MyCIFAR10, self).__init__(*args, **kwargs) 55 | 56 | def 
__getitem__(self, index): 57 | """Override the original method of the CIFAR10 class. 58 | Args: 59 | index (int): Index 60 | Returns: 61 | triple: (image, target, index) where target is index of the target class. 62 | """ 63 | if self.train: 64 | img, target = self.train_data[index], self.train_labels[index] 65 | else: 66 | img, target = self.test_data[index], self.test_labels[index] 67 | 68 | # doing this so that it is consistent with all other datasets 69 | # to return a PIL Image 70 | img = Image.fromarray(img) 71 | 72 | if self.transform is not None: 73 | img = self.transform(img) 74 | 75 | if self.target_transform is not None: 76 | target = self.target_transform(target) 77 | 78 | return img, target, index # only line changed 79 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/datasets/main.py: -------------------------------------------------------------------------------- 1 | from .OD_dataset import OD_Dataset 2 | import os 3 | 4 | def load_dataset(dataset_name, data_path, normal_class): 5 | """Loads the dataset.""" 6 | 7 | dataset_path = os.path.join(data_path, dataset_name) 8 | dataset = OD_Dataset(root=dataset_path, normal_class=normal_class) 9 | 10 | return dataset 11 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/datasets/mnist.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Subset 2 | from PIL import Image 3 | from torchvision.datasets import MNIST 4 | from base.torchvision_dataset import TorchvisionDataset 5 | from .preprocessing import get_target_label_idx, global_contrast_normalization 6 | 7 | import torchvision.transforms as transforms 8 | 9 | 10 | class MNIST_Dataset(TorchvisionDataset): 11 | 12 | def __init__(self, root: str, normal_class=0): 13 | super().__init__(root) 14 | 15 | self.n_classes = 2 # 0: normal, 1: outlier 16 | self.normal_classes = tuple([normal_class]) 17 | self.outlier_classes = list(range(0, 10)) 18 | self.outlier_classes.remove(normal_class) 19 | 20 | # Pre-computed min and max values (after applying GCN) from train data per class 21 | min_max = [(-0.8826567065619495, 9.001545489292527), 22 | (-0.6661464580883915, 20.108062262467364), 23 | (-0.7820454743183202, 11.665100841080346), 24 | (-0.7645772083211267, 12.895051191467457), 25 | (-0.7253923114302238, 12.683235701611533), 26 | (-0.7698501867861425, 13.103278415430502), 27 | (-0.778418217980696, 10.457837397569108), 28 | (-0.7129780970522351, 12.057777597673047), 29 | (-0.8280402650205075, 10.581538445782988), 30 | (-0.7369959242164307, 10.697039838804978)] 31 | 32 | # MNIST preprocessing: GCN (with L1 norm) and min-max feature scaling to [0,1] 33 | transform = transforms.Compose([transforms.ToTensor(), 34 | transforms.Lambda(lambda x: global_contrast_normalization(x, scale='l1')), 35 | transforms.Normalize([min_max[normal_class][0]], 36 | [min_max[normal_class][1] - min_max[normal_class][0]])]) 37 | 38 | target_transform = transforms.Lambda(lambda x: int(x in self.outlier_classes)) 39 | 40 | train_set = MyMNIST(root=self.root, train=True, download=True, 41 | transform=transform, target_transform=target_transform) 42 | # Subset train_set to normal class 43 | train_idx_normal = get_target_label_idx(train_set.train_labels.clone().data.cpu().numpy(), self.normal_classes) 44 | self.train_set = Subset(train_set, train_idx_normal) 45 | 46 | self.test_set = MyMNIST(root=self.root, train=False, 
download=True, 47 | transform=transform, target_transform=target_transform) 48 | 49 | 50 | class MyMNIST(MNIST): 51 | """Torchvision MNIST class with patch of __getitem__ method to also return the index of a data sample.""" 52 | 53 | def __init__(self, *args, **kwargs): 54 | super(MyMNIST, self).__init__(*args, **kwargs) 55 | 56 | def __getitem__(self, index): 57 | """Override the original method of the MNIST class. 58 | Args: 59 | index (int): Index 60 | Returns: 61 | triple: (image, target, index) where target is index of the target class. 62 | """ 63 | if self.train: 64 | img, target = self.train_data[index], self.train_labels[index] 65 | else: 66 | img, target = self.test_data[index], self.test_labels[index] 67 | 68 | # doing this so that it is consistent with all other datasets 69 | # to return a PIL Image 70 | img = Image.fromarray(img.numpy(), mode='L') 71 | 72 | if self.transform is not None: 73 | img = self.transform(img) 74 | 75 | if self.target_transform is not None: 76 | target = self.target_transform(target) 77 | 78 | return img, target, index # only line changed 79 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/datasets/preprocessing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def get_target_label_idx(labels, targets): 6 | """ 7 | Get the indices of labels that are included in targets. 8 | :param labels: array of labels 9 | :param targets: list/tuple of target labels 10 | :return: list with indices of target labels 11 | """ 12 | return np.argwhere(np.isin(labels, targets)).flatten().tolist() 13 | 14 | 15 | def global_contrast_normalization(x: torch.tensor, scale='l2'): 16 | """ 17 | Apply global contrast normalization to tensor, i.e. subtract mean across features (pixels) and normalize by scale, 18 | which is either the standard deviation, L1- or L2-norm across features (pixels). 19 | Note this is a *per sample* normalization globally across features (and not across the dataset). 20 | """ 21 | 22 | assert scale in ('l1', 'l2') 23 | 24 | n_features = int(np.prod(x.shape)) 25 | 26 | mean = torch.mean(x) # mean over all features (pixels) per sample 27 | x -= mean 28 | 29 | if scale == 'l1': 30 | x_scale = torch.mean(torch.abs(x)) 31 | 32 | if scale == 'l2': 33 | x_scale = torch.sqrt(torch.sum(x ** 2)) / n_features 34 | 35 | x /= x_scale 36 | 37 | return x 38 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/deepSVDD.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | 4 | from base.base_dataset import BaseADDataset 5 | from networks.networks import network, auto_encoder 6 | from optim.deepSVDD_trainer import DeepSVDDTrainer 7 | from optim.ae_trainer import AETrainer 8 | 9 | 10 | class DeepSVDD(object): 11 | """A class for the Deep SVDD method. 12 | 13 | Attributes: 14 | objective: A string specifying the Deep SVDD objective (either 'one-class' or 'soft-boundary'). 15 | nu: Deep SVDD hyperparameter nu (must be 0 < nu <= 1). 16 | R: Hypersphere radius R. 17 | c: Hypersphere center c. 18 | net_name: A string indicating the name of the neural network to use. 19 | net: The neural network \phi. 20 | ae_net: The autoencoder network corresponding to \phi for network weights pretraining. 21 | trainer: DeepSVDDTrainer to train a Deep SVDD model. 
22 | optimizer_name: A string indicating the optimizer to use for training the Deep SVDD network. 23 | ae_trainer: AETrainer to train an autoencoder in pretraining. 24 | ae_optimizer_name: A string indicating the optimizer to use for pretraining the autoencoder. 25 | results: A dictionary to save the results. 26 | """ 27 | 28 | def __init__(self, n_vars, objective: str = 'one-class', nu: float = 0.1): 29 | """Inits DeepSVDD with one of the two objectives and hyperparameter nu.""" 30 | 31 | 32 | self.n_vars = n_vars 33 | 34 | assert objective in ('one-class', 'soft-boundary'), "Objective must be either 'one-class' or 'soft-boundary'." 35 | self.objective = objective 36 | assert (0 < nu) & (nu <= 1), "For hyperparameter nu, it must hold: 0 < nu <= 1." 37 | self.nu = nu 38 | self.R = 0.0 # hypersphere radius R 39 | self.c = None # hypersphere center c 40 | 41 | self.net_name = None 42 | self.net = None # neural network \phi 43 | 44 | self.trainer = None 45 | self.optimizer_name = None 46 | 47 | self.ae_net = None # autoencoder network for pretraining 48 | self.ae_trainer = None 49 | self.ae_optimizer_name = None 50 | 51 | self.results = { 52 | 'train_time': None, 53 | 'test_auc': None, 54 | 'test_time': None, 55 | 'test_scores': None, 56 | } 57 | 58 | def set_networks(self, pretrain, n_layers, shrinkage_factor): 59 | """Builds the neural network \phi.""" 60 | self.net, self.ae_net = self.build_networks(pretrain, n_layers, shrinkage_factor) 61 | 62 | def build_networks(self, pretrain, n_layers, shrinkage_factor): 63 | """Builds the neural networks.""" 64 | 65 | net = network(self.n_vars, n_layers, shrinkage_factor) 66 | 67 | if pretrain: 68 | ae_net = auto_encoder(self.n_vars, n_layers, shrinkage_factor) 69 | 70 | else: 71 | ae_net = None 72 | 73 | return net, ae_net 74 | 75 | 76 | 77 | def train(self, dataset: BaseADDataset, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 50, 78 | lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', 79 | n_jobs_dataloader: int = 0): 80 | """Trains the Deep SVDD model on the training data.""" 81 | 82 | self.optimizer_name = optimizer_name 83 | self.trainer = DeepSVDDTrainer(self.objective, self.R, self.c, self.nu, optimizer_name, lr=lr, 84 | n_epochs=n_epochs, lr_milestones=lr_milestones, batch_size=batch_size, 85 | weight_decay=weight_decay, device=device, n_jobs_dataloader=n_jobs_dataloader) 86 | # Get the model 87 | self.net = self.trainer.train(dataset, self.net) 88 | self.R = float(self.trainer.R.cpu().data.numpy()) # get float 89 | self.c = self.trainer.c.cpu().data.numpy().tolist() # get list 90 | self.results['train_time'] = self.trainer.train_time 91 | 92 | def test(self, dataset: BaseADDataset, device: str = 'cuda', n_jobs_dataloader: int = 0): 93 | """Tests the Deep SVDD model on the test data.""" 94 | 95 | if self.trainer is None: 96 | self.trainer = DeepSVDDTrainer(self.objective, self.R, self.c, self.nu, 97 | device=device, n_jobs_dataloader=n_jobs_dataloader) 98 | 99 | self.trainer.test(dataset, self.net) 100 | # Get results 101 | self.results['test_auc'] = self.trainer.test_auc 102 | self.results['test_time'] = self.trainer.test_time 103 | self.results['test_scores'] = self.trainer.test_scores 104 | 105 | def pretrain(self, dataset: BaseADDataset, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 100, 106 | lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', 107 | n_jobs_dataloader: int = 0): 108 | """Pretrains the 
weights for the Deep SVDD network \phi via autoencoder.""" 109 | 110 | self.ae_optimizer_name = optimizer_name 111 | self.ae_trainer = AETrainer(optimizer_name, lr=lr, n_epochs=n_epochs, lr_milestones=lr_milestones, 112 | batch_size=batch_size, weight_decay=weight_decay, device=device, 113 | n_jobs_dataloader=n_jobs_dataloader) 114 | self.ae_net = self.ae_trainer.train(dataset, self.ae_net) 115 | self.ae_trainer.test(dataset, self.ae_net) 116 | self.init_network_weights_from_pretraining() 117 | 118 | def init_network_weights_from_pretraining(self): 119 | """Initialize the Deep SVDD network weights from the encoder weights of the pretraining autoencoder.""" 120 | 121 | net_dict = self.net.state_dict() 122 | ae_net_dict = self.ae_net.state_dict() 123 | 124 | # Filter out decoder network keys 125 | ae_net_dict = {k: v for k, v in ae_net_dict.items() if k in net_dict} 126 | # Overwrite values in the existing state_dict 127 | net_dict.update(ae_net_dict) 128 | # Load the new state_dict 129 | self.net.load_state_dict(net_dict) 130 | 131 | def save_model(self, export_model, save_ae=True): 132 | """Save Deep SVDD model to export_model.""" 133 | 134 | net_dict = self.net.state_dict() 135 | ae_net_dict = self.ae_net.state_dict() if save_ae and self.ae_net is not None else None 136 | 137 | torch.save({'R': self.R, 138 | 'c': self.c, 139 | 'net_dict': net_dict, 140 | 'ae_net_dict': ae_net_dict}, export_model) 141 | 142 | def load_model(self, model_path, load_ae=False): 143 | """Load Deep SVDD model from model_path.""" 144 | 145 | model_dict = torch.load(model_path) 146 | 147 | self.R = model_dict['R'] 148 | self.c = model_dict['c'] 149 | self.net.load_state_dict(model_dict['net_dict']) 150 | if load_ae: 151 | if self.ae_net is None: 152 | self.ae_net = build_autoencoder(self.net_name) 153 | self.ae_net.load_state_dict(model_dict['ae_net_dict']) 154 | 155 | def save_results(self, export_json): 156 | """Save results dict to a JSON-file.""" 157 | with open(export_json, 'w') as fp: 158 | json.dump(self.results, fp) 159 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/main.py: -------------------------------------------------------------------------------- 1 | import click 2 | import torch 3 | import logging 4 | import random 5 | import numpy as np 6 | import os 7 | 8 | from utils.config import Config 9 | from deepSVDD import DeepSVDD 10 | from datasets.main import load_dataset 11 | 12 | 13 | ################################################################################ 14 | # Settings 15 | ################################################################################ 16 | @click.command() 17 | @click.argument('dataset_name', type=str) 18 | #@click.argument('net_name', type=click.Choice(['mnist_LeNet', 'cifar10_LeNet', 'cifar10_LeNet_ELU'])) 19 | @click.argument('n_layers', type=int) 20 | @click.argument('shrinkage_factor', type=float) 21 | @click.argument('xp_path', type=click.Path(exists=False)) 22 | @click.argument('data_path', type=click.Path(exists=False)) 23 | @click.argument("target_scorefile_path", type=click.Path(exists=False)) 24 | @click.option('--load_config', type=click.Path(exists=True), default=None, 25 | help='Config JSON-file path (default: None).') 26 | @click.option('--load_model', type=click.Path(exists=True), default=None, 27 | help='Model file path (default: None).') 28 | @click.option('--objective', type=click.Choice(['one-class', 'soft-boundary']), default='one-class', 29 | help='Specify Deep SVDD objective 
("one-class" or "soft-boundary").') 30 | @click.option('--nu', type=float, default=0.1, help='Deep SVDD hyperparameter nu (must be 0 < nu <= 1).') 31 | @click.option('--device', type=str, default='cuda', help='Computation device to use ("cpu", "cuda", "cuda:2", etc.).') 32 | @click.option('--seed', type=int, default=-1, help='Set seed. If -1, use randomization.') 33 | @click.option('--optimizer_name', type=click.Choice(['adam', 'amsgrad']), default='adam', 34 | help='Name of the optimizer to use for Deep SVDD network training.') 35 | @click.option('--lr', type=float, default=0.001, 36 | help='Initial learning rate for Deep SVDD network training. Default=0.001') 37 | @click.option('--n_epochs', type=int, default=50, help='Number of epochs to train.') 38 | @click.option('--lr_milestone', type=int, default=0, multiple=True, 39 | help='Lr scheduler milestones at which lr is multiplied by 0.1. Can be multiple and must be increasing.') 40 | @click.option('--batch_size', type=int, default=128, help='Batch size for mini-batch training.') 41 | @click.option('--weight_decay', type=float, default=1e-6, 42 | help='Weight decay (L2 penalty) hyperparameter for Deep SVDD objective.') 43 | @click.option('--pretrain', type=bool, default=True, 44 | help='Pretrain neural network parameters via autoencoder.') 45 | @click.option('--ae_optimizer_name', type=click.Choice(['adam', 'amsgrad']), default='adam', 46 | help='Name of the optimizer to use for autoencoder pretraining.') 47 | @click.option('--ae_lr', type=float, default=0.001, 48 | help='Initial learning rate for autoencoder pretraining. Default=0.001') 49 | @click.option('--ae_n_epochs', type=int, default=100, help='Number of epochs to train autoencoder.') 50 | @click.option('--ae_lr_milestone', type=int, default=0, multiple=True, 51 | help='Lr scheduler milestones at which lr is multiplied by 0.1. Can be multiple and must be increasing.') 52 | @click.option('--ae_batch_size', type=int, default=128, help='Batch size for mini-batch autoencoder training.') 53 | @click.option('--ae_weight_decay', type=float, default=1e-6, 54 | help='Weight decay (L2 penalty) hyperparameter for autoencoder objective.') 55 | @click.option('--n_jobs_dataloader', type=int, default=0, 56 | help='Number of workers for data loading. 0 means that the data will be loaded in the main process.') 57 | @click.option('--normal_class', type=int, default=0, 58 | help='Specify the normal class of the dataset (all other classes are considered anomalous).') 59 | def main(dataset_name, n_layers, shrinkage_factor, xp_path, data_path, target_scorefile_path, load_config, load_model, objective, nu, device, seed, 60 | optimizer_name, lr, n_epochs, lr_milestone, batch_size, weight_decay, pretrain, ae_optimizer_name, ae_lr, 61 | ae_n_epochs, ae_lr_milestone, ae_batch_size, ae_weight_decay, n_jobs_dataloader, normal_class): 62 | """ 63 | Deep SVDD, a fully deep method for anomaly detection. 64 | 65 | :arg DATASET_NAME: Name of the dataset to load. 66 | :arg N_LAYERS: Number of hidden layers used for network. If auto-encoder pretraining is used, the auto-encoder will have n_layers*2-1 hidden layers. 67 | :arg SHRINKAGE_FACTOR: Factor by which the neurons per layer will decay between each size. Must be between 0 and 1. Shrinkage is reversed for the auto-encoder after the bottleneck layer 68 | :arg XP_PATH: Export path for logging the experiment. 69 | :arg DATA_PATH: Root path of data. 
70 | """ 71 | 72 | 73 | # Get configuration 74 | cfg = Config(locals().copy()) 75 | 76 | # Set up logging 77 | logging.basicConfig(level=logging.INFO) 78 | logger = logging.getLogger() 79 | logger.setLevel(logging.INFO) 80 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 81 | if not os.path.exists(xp_path): 82 | os.makedirs(xp_path) 83 | 84 | log_file = xp_path + '/log.txt' 85 | file_handler = logging.FileHandler(log_file) 86 | file_handler.setLevel(logging.INFO) 87 | file_handler.setFormatter(formatter) 88 | logger.addHandler(file_handler) 89 | 90 | # Print arguments 91 | logger.info('Log file is %s.' % log_file) 92 | logger.info('Data path is %s.' % data_path) 93 | logger.info('Export path is %s.' % xp_path) 94 | 95 | logger.info('Dataset: %s' % dataset_name) 96 | logger.info('Normal class: %d' % normal_class) 97 | logger.info('Network (n_layers, shrinkage_factor): %s, %s' % (n_layers, shrinkage_factor)) 98 | 99 | # If specified, load experiment config from JSON-file 100 | if load_config: 101 | cfg.load_config(import_json=load_config) 102 | logger.info('Loaded configuration from %s.' % load_config) 103 | 104 | # Print configuration 105 | logger.info('Deep SVDD objective: %s' % cfg.settings['objective']) 106 | logger.info('Nu-parameter: %.2f' % cfg.settings['nu']) 107 | 108 | # Set seed 109 | if cfg.settings['seed'] != -1: 110 | random.seed(cfg.settings['seed']) 111 | np.random.seed(cfg.settings['seed']) 112 | torch.manual_seed(cfg.settings['seed']) 113 | logger.info('Set seed to %d.' % cfg.settings['seed']) 114 | 115 | # Default device to 'cpu' if cuda is not available 116 | if not torch.cuda.is_available(): 117 | device = 'cpu' 118 | logger.info('Computation device: %s' % device) 119 | logger.info('Number of dataloader workers: %d' % n_jobs_dataloader) 120 | 121 | # Load data 122 | dataset = load_dataset(dataset_name, data_path, normal_class) 123 | n_vars = dataset.train_set.X.shape[1] 124 | 125 | # Initialize DeepSVDD model and set neural network \phi 126 | deep_SVDD = DeepSVDD(n_vars, cfg.settings['objective'], cfg.settings['nu']) 127 | deep_SVDD.set_networks(cfg.settings['pretrain'], n_layers, shrinkage_factor) 128 | # If specified, load Deep SVDD model (radius R, center c, network weights, and possibly autoencoder weights) 129 | if load_model: 130 | deep_SVDD.load_model(model_path=load_model, load_ae=True) 131 | logger.info('Loading model from %s.' 
% load_model) 132 | 133 | logger.info('Pretraining: %s' % pretrain) 134 | if pretrain: 135 | # Log pretraining details 136 | logger.info('Pretraining optimizer: %s' % cfg.settings['ae_optimizer_name']) 137 | logger.info('Pretraining learning rate: %g' % cfg.settings['ae_lr']) 138 | logger.info('Pretraining epochs: %d' % cfg.settings['ae_n_epochs']) 139 | logger.info('Pretraining learning rate scheduler milestones: %s' % (cfg.settings['ae_lr_milestone'],)) 140 | logger.info('Pretraining batch size: %d' % cfg.settings['ae_batch_size']) 141 | logger.info('Pretraining weight decay: %g' % cfg.settings['ae_weight_decay']) 142 | 143 | # Pretrain model on dataset (via autoencoder) 144 | deep_SVDD.pretrain(dataset, 145 | optimizer_name=cfg.settings['ae_optimizer_name'], 146 | lr=cfg.settings['ae_lr'], 147 | n_epochs=cfg.settings['ae_n_epochs'], 148 | lr_milestones=cfg.settings['ae_lr_milestone'], 149 | batch_size=cfg.settings['ae_batch_size'], 150 | weight_decay=cfg.settings['ae_weight_decay'], 151 | device=device, 152 | n_jobs_dataloader=n_jobs_dataloader) 153 | 154 | # Log training details 155 | logger.info('Training optimizer: %s' % cfg.settings['optimizer_name']) 156 | logger.info('Training learning rate: %g' % cfg.settings['lr']) 157 | logger.info('Training epochs: %d' % cfg.settings['n_epochs']) 158 | logger.info('Training learning rate scheduler milestones: %s' % (cfg.settings['lr_milestone'],)) 159 | logger.info('Training batch size: %d' % cfg.settings['batch_size']) 160 | logger.info('Training weight decay: %g' % cfg.settings['weight_decay']) 161 | 162 | # Train model on dataset 163 | deep_SVDD.train(dataset, 164 | optimizer_name=cfg.settings['optimizer_name'], 165 | lr=cfg.settings['lr'], 166 | n_epochs=cfg.settings['n_epochs'], 167 | lr_milestones=cfg.settings['lr_milestone'], 168 | batch_size=cfg.settings['batch_size'], 169 | weight_decay=cfg.settings['weight_decay'], 170 | device=device, 171 | n_jobs_dataloader=n_jobs_dataloader) 172 | 173 | # Test model 174 | deep_SVDD.test(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader) 175 | 176 | # Plot most anomalous and most normal (within-class) test samples 177 | indices, labels, scores = zip(*deep_SVDD.results['test_scores']) 178 | indices, labels, scores = np.array(indices), np.array(labels), np.array(scores) 179 | idx_sorted = indices[labels == 0][np.argsort(scores[labels == 0])] # sorted from lowest to highest anomaly score 180 | 181 | # Save results, model, and configuration 182 | deep_SVDD.save_results(export_json=xp_path + '/results.json') 183 | deep_SVDD.save_model(export_model=xp_path + '/model.tar') 184 | cfg.save_config(export_json=xp_path + '/config.json') 185 | 186 | test_scores_index_ordered = scores[indices] 187 | 188 | np.savetxt(target_scorefile_path, test_scores_index_ordered) 189 | 190 | 191 | 192 | if __name__ == '__main__': 193 | main() 194 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/networks/__init__.py: -------------------------------------------------------------------------------- 1 | from .networks import network, auto_encoder 2 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/networks/cifar10_LeNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from base.base_net import BaseNet 6 | 7 | 8 | class CIFAR10_LeNet(BaseNet): 9 | 10 | def 
__init__(self): 11 | super().__init__() 12 | 13 | self.rep_dim = 128 14 | self.pool = nn.MaxPool2d(2, 2) 15 | 16 | self.conv1 = nn.Conv2d(3, 32, 5, bias=False, padding=2) 17 | self.bn2d1 = nn.BatchNorm2d(32, eps=1e-04, affine=False) 18 | self.conv2 = nn.Conv2d(32, 64, 5, bias=False, padding=2) 19 | self.bn2d2 = nn.BatchNorm2d(64, eps=1e-04, affine=False) 20 | self.conv3 = nn.Conv2d(64, 128, 5, bias=False, padding=2) 21 | self.bn2d3 = nn.BatchNorm2d(128, eps=1e-04, affine=False) 22 | self.fc1 = nn.Linear(128 * 4 * 4, self.rep_dim, bias=False) 23 | 24 | def forward(self, x): 25 | x = self.conv1(x) 26 | x = self.pool(F.leaky_relu(self.bn2d1(x))) 27 | x = self.conv2(x) 28 | x = self.pool(F.leaky_relu(self.bn2d2(x))) 29 | x = self.conv3(x) 30 | x = self.pool(F.leaky_relu(self.bn2d3(x))) 31 | x = x.view(x.size(0), -1) 32 | x = self.fc1(x) 33 | return x 34 | 35 | 36 | class CIFAR10_LeNet_Autoencoder(BaseNet): 37 | 38 | def __init__(self): 39 | super().__init__() 40 | 41 | self.rep_dim = 128 42 | self.pool = nn.MaxPool2d(2, 2) 43 | 44 | # Encoder (must match the Deep SVDD network above) 45 | self.conv1 = nn.Conv2d(3, 32, 5, bias=False, padding=2) 46 | nn.init.xavier_uniform_(self.conv1.weight, gain=nn.init.calculate_gain('leaky_relu')) 47 | self.bn2d1 = nn.BatchNorm2d(32, eps=1e-04, affine=False) 48 | self.conv2 = nn.Conv2d(32, 64, 5, bias=False, padding=2) 49 | nn.init.xavier_uniform_(self.conv2.weight, gain=nn.init.calculate_gain('leaky_relu')) 50 | self.bn2d2 = nn.BatchNorm2d(64, eps=1e-04, affine=False) 51 | self.conv3 = nn.Conv2d(64, 128, 5, bias=False, padding=2) 52 | nn.init.xavier_uniform_(self.conv3.weight, gain=nn.init.calculate_gain('leaky_relu')) 53 | self.bn2d3 = nn.BatchNorm2d(128, eps=1e-04, affine=False) 54 | self.fc1 = nn.Linear(128 * 4 * 4, self.rep_dim, bias=False) 55 | self.bn1d = nn.BatchNorm1d(self.rep_dim, eps=1e-04, affine=False) 56 | 57 | # Decoder 58 | self.deconv1 = nn.ConvTranspose2d(int(self.rep_dim / (4 * 4)), 128, 5, bias=False, padding=2) 59 | nn.init.xavier_uniform_(self.deconv1.weight, gain=nn.init.calculate_gain('leaky_relu')) 60 | self.bn2d4 = nn.BatchNorm2d(128, eps=1e-04, affine=False) 61 | self.deconv2 = nn.ConvTranspose2d(128, 64, 5, bias=False, padding=2) 62 | nn.init.xavier_uniform_(self.deconv2.weight, gain=nn.init.calculate_gain('leaky_relu')) 63 | self.bn2d5 = nn.BatchNorm2d(64, eps=1e-04, affine=False) 64 | self.deconv3 = nn.ConvTranspose2d(64, 32, 5, bias=False, padding=2) 65 | nn.init.xavier_uniform_(self.deconv3.weight, gain=nn.init.calculate_gain('leaky_relu')) 66 | self.bn2d6 = nn.BatchNorm2d(32, eps=1e-04, affine=False) 67 | self.deconv4 = nn.ConvTranspose2d(32, 3, 5, bias=False, padding=2) 68 | nn.init.xavier_uniform_(self.deconv4.weight, gain=nn.init.calculate_gain('leaky_relu')) 69 | 70 | def forward(self, x): 71 | x = self.conv1(x) 72 | x = self.pool(F.leaky_relu(self.bn2d1(x))) 73 | x = self.conv2(x) 74 | x = self.pool(F.leaky_relu(self.bn2d2(x))) 75 | x = self.conv3(x) 76 | x = self.pool(F.leaky_relu(self.bn2d3(x))) 77 | x = x.view(x.size(0), -1) 78 | x = self.bn1d(self.fc1(x)) 79 | x = x.view(x.size(0), int(self.rep_dim / (4 * 4)), 4, 4) 80 | x = F.leaky_relu(x) 81 | x = self.deconv1(x) 82 | x = F.interpolate(F.leaky_relu(self.bn2d4(x)), scale_factor=2) 83 | x = self.deconv2(x) 84 | x = F.interpolate(F.leaky_relu(self.bn2d5(x)), scale_factor=2) 85 | x = self.deconv3(x) 86 | x = F.interpolate(F.leaky_relu(self.bn2d6(x)), scale_factor=2) 87 | x = self.deconv4(x) 88 | x = torch.sigmoid(x) 89 | return x 90 | 
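A quick way to see what this encoder produces is a dummy forward pass. The sketch below is not part of the repository; it assumes torchvision-style CIFAR-10 batches of shape (N, 3, 32, 32) and that it is run from the src/ directory (the import path is otherwise an assumption). Each conv layer preserves the 32x32 resolution (kernel 5, padding 2) and each max-pool halves it, so three stages leave 4x4 feature maps that the final linear layer maps to the 128-dimensional representation.

```Python
# Sanity-check sketch (not part of the repository): confirm the encoder's output size.
import torch

from networks.cifar10_LeNet import CIFAR10_LeNet  # assumes running from src/

net = CIFAR10_LeNet()
x = torch.randn(8, 3, 32, 32)   # dummy batch of 8 CIFAR-10-sized images
z = net(x)                      # spatial size 32 -> 16 -> 8 -> 4 after the three pool stages
print(z.shape)                  # expected: torch.Size([8, 128]), i.e. (batch, rep_dim)
```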
-------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/networks/cifar10_LeNet_elu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from base.base_net import BaseNet 6 | 7 | 8 | class CIFAR10_LeNet_ELU(BaseNet): 9 | 10 | def __init__(self): 11 | super().__init__() 12 | 13 | self.rep_dim = 128 14 | self.pool = nn.MaxPool2d(2, 2) 15 | 16 | self.conv1 = nn.Conv2d(3, 32, 5, bias=False, padding=2) 17 | self.bn2d1 = nn.BatchNorm2d(32, eps=1e-04, affine=False) 18 | self.conv2 = nn.Conv2d(32, 64, 5, bias=False, padding=2) 19 | self.bn2d2 = nn.BatchNorm2d(64, eps=1e-04, affine=False) 20 | self.conv3 = nn.Conv2d(64, 128, 5, bias=False, padding=2) 21 | self.bn2d3 = nn.BatchNorm2d(128, eps=1e-04, affine=False) 22 | self.fc1 = nn.Linear(128 * 4 * 4, self.rep_dim, bias=False) 23 | 24 | def forward(self, x): 25 | x = self.conv1(x) 26 | x = self.pool(F.elu(self.bn2d1(x))) 27 | x = self.conv2(x) 28 | x = self.pool(F.elu(self.bn2d2(x))) 29 | x = self.conv3(x) 30 | x = self.pool(F.elu(self.bn2d3(x))) 31 | x = x.view(x.size(0), -1) 32 | x = self.fc1(x) 33 | return x 34 | 35 | 36 | class CIFAR10_LeNet_ELU_Autoencoder(BaseNet): 37 | 38 | def __init__(self): 39 | super().__init__() 40 | 41 | self.rep_dim = 128 42 | self.pool = nn.MaxPool2d(2, 2) 43 | 44 | # Encoder (must match the Deep SVDD network above) 45 | self.conv1 = nn.Conv2d(3, 32, 5, bias=False, padding=2) 46 | nn.init.xavier_uniform_(self.conv1.weight) 47 | self.bn2d1 = nn.BatchNorm2d(32, eps=1e-04, affine=False) 48 | self.conv2 = nn.Conv2d(32, 64, 5, bias=False, padding=2) 49 | nn.init.xavier_uniform_(self.conv2.weight) 50 | self.bn2d2 = nn.BatchNorm2d(64, eps=1e-04, affine=False) 51 | self.conv3 = nn.Conv2d(64, 128, 5, bias=False, padding=2) 52 | nn.init.xavier_uniform_(self.conv3.weight) 53 | self.bn2d3 = nn.BatchNorm2d(128, eps=1e-04, affine=False) 54 | self.fc1 = nn.Linear(128 * 4 * 4, self.rep_dim, bias=False) 55 | self.bn1d = nn.BatchNorm1d(self.rep_dim, eps=1e-04, affine=False) 56 | 57 | # Decoder 58 | self.deconv1 = nn.ConvTranspose2d(int(self.rep_dim / (4 * 4)), 128, 5, bias=False, padding=2) 59 | nn.init.xavier_uniform_(self.deconv1.weight) 60 | self.bn2d4 = nn.BatchNorm2d(128, eps=1e-04, affine=False) 61 | self.deconv2 = nn.ConvTranspose2d(128, 64, 5, bias=False, padding=2) 62 | nn.init.xavier_uniform_(self.deconv2.weight) 63 | self.bn2d5 = nn.BatchNorm2d(64, eps=1e-04, affine=False) 64 | self.deconv3 = nn.ConvTranspose2d(64, 32, 5, bias=False, padding=2) 65 | nn.init.xavier_uniform_(self.deconv3.weight) 66 | self.bn2d6 = nn.BatchNorm2d(32, eps=1e-04, affine=False) 67 | self.deconv4 = nn.ConvTranspose2d(32, 3, 5, bias=False, padding=2) 68 | nn.init.xavier_uniform_(self.deconv4.weight) 69 | 70 | def forward(self, x): 71 | x = self.conv1(x) 72 | x = self.pool(F.elu(self.bn2d1(x))) 73 | x = self.conv2(x) 74 | x = self.pool(F.elu(self.bn2d2(x))) 75 | x = self.conv3(x) 76 | x = self.pool(F.elu(self.bn2d3(x))) 77 | x = x.view(x.size(0), -1) 78 | x = self.bn1d(self.fc1(x)) 79 | x = x.view(x.size(0), int(self.rep_dim / (4 * 4)), 4, 4) 80 | x = F.elu(x) 81 | x = self.deconv1(x) 82 | x = F.interpolate(F.elu(self.bn2d4(x)), scale_factor=2) 83 | x = self.deconv2(x) 84 | x = F.interpolate(F.elu(self.bn2d5(x)), scale_factor=2) 85 | x = self.deconv3(x) 86 | x = F.interpolate(F.elu(self.bn2d6(x)), scale_factor=2) 87 | x = self.deconv4(x) 88 | x = torch.sigmoid(x) 89 | return x 
90 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/networks/mnist_LeNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from base.base_net import BaseNet 6 | 7 | 8 | class MNIST_LeNet(BaseNet): 9 | 10 | def __init__(self): 11 | super().__init__() 12 | 13 | self.rep_dim = 32 14 | self.pool = nn.MaxPool2d(2, 2) 15 | 16 | self.conv1 = nn.Conv2d(1, 8, 5, bias=False, padding=2) 17 | self.bn1 = nn.BatchNorm2d(8, eps=1e-04, affine=False) 18 | self.conv2 = nn.Conv2d(8, 4, 5, bias=False, padding=2) 19 | self.bn2 = nn.BatchNorm2d(4, eps=1e-04, affine=False) 20 | self.fc1 = nn.Linear(4 * 7 * 7, self.rep_dim, bias=False) 21 | 22 | def forward(self, x): 23 | x = self.conv1(x) 24 | x = self.pool(F.leaky_relu(self.bn1(x))) 25 | x = self.conv2(x) 26 | x = self.pool(F.leaky_relu(self.bn2(x))) 27 | x = x.view(x.size(0), -1) 28 | x = self.fc1(x) 29 | return x 30 | 31 | 32 | class MNIST_LeNet_Autoencoder(BaseNet): 33 | 34 | def __init__(self): 35 | super().__init__() 36 | 37 | self.rep_dim = 32 38 | self.pool = nn.MaxPool2d(2, 2) 39 | 40 | # Encoder (must match the Deep SVDD network above) 41 | self.conv1 = nn.Conv2d(1, 8, 5, bias=False, padding=2) 42 | self.bn1 = nn.BatchNorm2d(8, eps=1e-04, affine=False) 43 | self.conv2 = nn.Conv2d(8, 4, 5, bias=False, padding=2) 44 | self.bn2 = nn.BatchNorm2d(4, eps=1e-04, affine=False) 45 | self.fc1 = nn.Linear(4 * 7 * 7, self.rep_dim, bias=False) 46 | 47 | # Decoder 48 | self.deconv1 = nn.ConvTranspose2d(2, 4, 5, bias=False, padding=2) 49 | self.bn3 = nn.BatchNorm2d(4, eps=1e-04, affine=False) 50 | self.deconv2 = nn.ConvTranspose2d(4, 8, 5, bias=False, padding=3) 51 | self.bn4 = nn.BatchNorm2d(8, eps=1e-04, affine=False) 52 | self.deconv3 = nn.ConvTranspose2d(8, 1, 5, bias=False, padding=2) 53 | 54 | def forward(self, x): 55 | x = self.conv1(x) 56 | x = self.pool(F.leaky_relu(self.bn1(x))) 57 | x = self.conv2(x) 58 | x = self.pool(F.leaky_relu(self.bn2(x))) 59 | x = x.view(x.size(0), -1) 60 | x = self.fc1(x) 61 | x = x.view(x.size(0), int(self.rep_dim / 16), 4, 4) 62 | x = F.interpolate(F.leaky_relu(x), scale_factor=2) 63 | x = self.deconv1(x) 64 | x = F.interpolate(F.leaky_relu(self.bn3(x)), scale_factor=2) 65 | x = self.deconv2(x) 66 | x = F.interpolate(F.leaky_relu(self.bn4(x)), scale_factor=2) 67 | x = self.deconv3(x) 68 | x = torch.sigmoid(x) 69 | 70 | return x 71 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/networks/networks.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from base.base_net import BaseNet 8 | 9 | 10 | class network(BaseNet): 11 | 12 | def __init__(self, n_vars, n_layers, shrinkage_factor): 13 | super().__init__() 14 | 15 | layer_sizes = [math.ceil(n_vars * (1-shrinkage_factor)**(i)) for i in range(n_layers+1)] 16 | self.rep_dim = math.ceil(layer_sizes[-1] * (1-shrinkage_factor)) 17 | 18 | self.layers = [] 19 | 20 | for i in range(n_layers): 21 | self.layers.append(nn.Linear(layer_sizes[i], layer_sizes[i+1], bias=False)) 22 | if i is not n_layers-1: 23 | self.layers.append(nn.BatchNorm1d(layer_sizes[i+1], eps=1e-04, affine=False)) 24 | self.layers.append(nn.LeakyReLU()) 25 | 26 | #bottleneck layer 27 | self.layers.append(nn.Linear(layer_sizes[-1], 
self.rep_dim, bias=False)) 28 | 29 | self.encoder = nn.Sequential(*self.layers) 30 | 31 | def forward(self, x): 32 | x = self.encoder(x) 33 | return x 34 | 35 | 36 | class auto_encoder(BaseNet): 37 | 38 | def __init__(self, n_vars, n_layers, shrinkage_factor): 39 | super().__init__() 40 | 41 | layer_sizes = [math.ceil(n_vars * (1-shrinkage_factor)**(i)) for i in range(n_layers+1)] 42 | self.rep_dim = math.ceil(layer_sizes[-1] * (1-shrinkage_factor)) 43 | 44 | #encoder 45 | self.layers = [] 46 | 47 | for i in range(n_layers): 48 | self.layers.append(nn.Linear(layer_sizes[i], layer_sizes[i+1], bias=False)) 49 | 50 | if i is not n_layers-1: 51 | nn.init.xavier_uniform_(self.layers[-1].weight, gain=nn.init.calculate_gain('leaky_relu')) 52 | self.layers.append(nn.BatchNorm1d(layer_sizes[i+1], eps=1e-04, affine=False)) 53 | self.layers.append(nn.LeakyReLU()) 54 | 55 | #bottleneck layer 56 | self.layers.append(nn.Linear(layer_sizes[-1], self.rep_dim, bias=False)) 57 | 58 | self.encoder = nn.Sequential(*self.layers) 59 | 60 | #decoder 61 | 62 | reverse_layer_sizes = [self.rep_dim] + list(reversed(layer_sizes)) 63 | self.layers = [] 64 | 65 | for i in range(n_layers+1): 66 | self.layers.append(nn.Linear(reverse_layer_sizes[i], reverse_layer_sizes[i+1], bias=False)) 67 | if i < n_layers: 68 | nn.init.xavier_uniform_(self.layers[-1].weight, gain=nn.init.calculate_gain('leaky_relu')) 69 | self.layers.append(nn.BatchNorm1d(reverse_layer_sizes[i+1], eps=1e-04, affine=False)) 70 | self.layers.append(nn.LeakyReLU()) 71 | 72 | 73 | 74 | #self.layers.append(nn.Linear(reverse_layer_sizes[-1], n_vars, bias=False)) 75 | 76 | self.decoder = nn.Sequential(*self.layers) 77 | 78 | def forward(self, x): 79 | x = self.encoder(x) 80 | x = self.decoder(x) 81 | return x 82 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/optim/__init__.py: -------------------------------------------------------------------------------- 1 | from .deepSVDD_trainer import DeepSVDDTrainer 2 | from .ae_trainer import AETrainer 3 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/optim/ae_trainer.py: -------------------------------------------------------------------------------- 1 | from base.base_trainer import BaseTrainer 2 | from base.base_dataset import BaseADDataset 3 | from base.base_net import BaseNet 4 | from sklearn.metrics import roc_auc_score 5 | 6 | import logging 7 | import time 8 | import torch 9 | import torch.optim as optim 10 | import numpy as np 11 | 12 | 13 | class AETrainer(BaseTrainer): 14 | 15 | def __init__(self, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 150, lr_milestones: tuple = (), 16 | batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', n_jobs_dataloader: int = 0): 17 | super().__init__(optimizer_name, lr, n_epochs, lr_milestones, batch_size, weight_decay, device, 18 | n_jobs_dataloader) 19 | 20 | def train(self, dataset: BaseADDataset, ae_net: BaseNet): 21 | logger = logging.getLogger() 22 | 23 | # Set device for network 24 | ae_net = ae_net.to(self.device) 25 | 26 | # Get train data loader 27 | train_loader, _ = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) 28 | 29 | # Set optimizer (Adam optimizer for now) 30 | optimizer = optim.Adam(ae_net.parameters(), lr=self.lr, weight_decay=self.weight_decay, 31 | amsgrad=self.optimizer_name == 'amsgrad') 32 | 33 | # Set learning rate scheduler 34 | scheduler = 
optim.lr_scheduler.MultiStepLR(optimizer, milestones=self.lr_milestones, gamma=0.1) 35 | 36 | # Training 37 | logger.info('Starting pretraining...') 38 | start_time = time.time() 39 | ae_net.train() 40 | for epoch in range(self.n_epochs): 41 | 42 | scheduler.step() 43 | if epoch in self.lr_milestones: 44 | logger.info(' LR scheduler: new learning rate is %g' % float(scheduler.get_lr()[0])) 45 | 46 | loss_epoch = 0.0 47 | n_batches = 0 48 | epoch_start_time = time.time() 49 | for data in train_loader: 50 | inputs, _, _ = data 51 | inputs = inputs.to(self.device) 52 | 53 | # Zero the network parameter gradients 54 | optimizer.zero_grad() 55 | 56 | # Update network parameters via backpropagation: forward + backward + optimize 57 | outputs = ae_net(inputs) 58 | scores = torch.sum((outputs - inputs) ** 2, dim=tuple(range(1, outputs.dim()))) 59 | loss = torch.mean(scores) 60 | loss.backward() 61 | optimizer.step() 62 | 63 | loss_epoch += loss.item() 64 | n_batches += 1 65 | 66 | # log epoch statistics 67 | epoch_train_time = time.time() - epoch_start_time 68 | logger.info(' Epoch {}/{}\t Time: {:.3f}\t Loss: {:.8f}' 69 | .format(epoch + 1, self.n_epochs, epoch_train_time, loss_epoch / n_batches)) 70 | 71 | pretrain_time = time.time() - start_time 72 | logger.info('Pretraining time: %.3f' % pretrain_time) 73 | logger.info('Finished pretraining.') 74 | 75 | return ae_net 76 | 77 | def test(self, dataset: BaseADDataset, ae_net: BaseNet): 78 | logger = logging.getLogger() 79 | 80 | # Set device for network 81 | ae_net = ae_net.to(self.device) 82 | 83 | # Get test data loader 84 | _, test_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) 85 | 86 | # Testing 87 | logger.info('Testing autoencoder...') 88 | loss_epoch = 0.0 89 | n_batches = 0 90 | start_time = time.time() 91 | idx_label_score = [] 92 | ae_net.eval() 93 | with torch.no_grad(): 94 | for data in test_loader: 95 | inputs, labels, idx = data 96 | inputs = inputs.to(self.device) 97 | outputs = ae_net(inputs) 98 | scores = torch.sum((outputs - inputs) ** 2, dim=tuple(range(1, outputs.dim()))) 99 | loss = torch.mean(scores) 100 | 101 | # Save triple of (idx, label, score) in a list 102 | idx_label_score += list(zip(idx.cpu().data.numpy().tolist(), 103 | labels.cpu().data.numpy().tolist(), 104 | scores.cpu().data.numpy().tolist())) 105 | 106 | loss_epoch += loss.item() 107 | n_batches += 1 108 | 109 | logger.info('Test set Loss: {:.8f}'.format(loss_epoch / n_batches)) 110 | 111 | _, labels, scores = zip(*idx_label_score) 112 | labels = np.array(labels) 113 | scores = np.array(scores) 114 | 115 | auc = roc_auc_score(labels, scores) 116 | logger.info('Test set AUC: {:.2f}%'.format(100. 
* auc)) 117 | 118 | test_time = time.time() - start_time 119 | logger.info('Autoencoder testing time: %.3f' % test_time) 120 | logger.info('Finished testing autoencoder.') 121 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/optim/deepSVDD_trainer.py: -------------------------------------------------------------------------------- 1 | from base.base_trainer import BaseTrainer 2 | from base.base_dataset import BaseADDataset 3 | from base.base_net import BaseNet 4 | from torch.utils.data.dataloader import DataLoader 5 | from sklearn.metrics import roc_auc_score 6 | 7 | import logging 8 | import time 9 | import torch 10 | import torch.optim as optim 11 | import numpy as np 12 | 13 | 14 | class DeepSVDDTrainer(BaseTrainer): 15 | 16 | def __init__(self, objective, R, c, nu: float, optimizer_name: str = 'adam', lr: float = 0.001, n_epochs: int = 150, 17 | lr_milestones: tuple = (), batch_size: int = 128, weight_decay: float = 1e-6, device: str = 'cuda', 18 | n_jobs_dataloader: int = 0): 19 | super().__init__(optimizer_name, lr, n_epochs, lr_milestones, batch_size, weight_decay, device, 20 | n_jobs_dataloader) 21 | 22 | assert objective in ('one-class', 'soft-boundary'), "Objective must be either 'one-class' or 'soft-boundary'." 23 | self.objective = objective 24 | 25 | # Deep SVDD parameters 26 | self.R = torch.tensor(R, device=self.device) # radius R initialized with 0 by default. 27 | self.c = torch.tensor(c, device=self.device) if c is not None else None 28 | self.nu = nu 29 | 30 | # Optimization parameters 31 | self.warm_up_n_epochs = 10 # number of training epochs for soft-boundary Deep SVDD before radius R gets updated 32 | 33 | # Results 34 | self.train_time = None 35 | self.test_auc = None 36 | self.test_time = None 37 | self.test_scores = None 38 | 39 | def train(self, dataset: BaseADDataset, net: BaseNet): 40 | logger = logging.getLogger() 41 | 42 | # Set device for network 43 | net = net.to(self.device) 44 | 45 | # Get train data loader 46 | train_loader, _ = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) 47 | 48 | # Set optimizer (Adam optimizer for now) 49 | optimizer = optim.Adam(net.parameters(), lr=self.lr, weight_decay=self.weight_decay, 50 | amsgrad=self.optimizer_name == 'amsgrad') 51 | 52 | # Set learning rate scheduler 53 | scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=self.lr_milestones, gamma=0.1) 54 | 55 | # Initialize hypersphere center c (if c not loaded) 56 | if self.c is None: 57 | logger.info('Initializing center c...') 58 | self.c = self.init_center_c(train_loader, net) 59 | logger.info('Center c initialized at %s', self.c) 60 | 61 | # Training 62 | logger.info('Starting training...') 63 | start_time = time.time() 64 | net.train() 65 | for epoch in range(self.n_epochs): 66 | 67 | scheduler.step() 68 | if epoch in self.lr_milestones: 69 | logger.info(' LR scheduler: new learning rate is %g' % float(scheduler.get_lr()[0])) 70 | 71 | loss_epoch = 0.0 72 | n_batches = 0 73 | epoch_start_time = time.time() 74 | for data in train_loader: 75 | inputs, _, _ = data 76 | inputs = inputs.to(self.device) 77 | 78 | # Zero the network parameter gradients 79 | optimizer.zero_grad() 80 | 81 | # Update network parameters via backpropagation: forward + backward + optimize 82 | outputs = net(inputs) 83 | dist = torch.sum((outputs - self.c) ** 2, dim=1) 84 | if self.objective == 'soft-boundary': 85 | scores = dist - self.R ** 2 86 | loss = self.R ** 2 + (1 / self.nu) 
* torch.mean(torch.max(torch.zeros_like(scores), scores)) 87 | else: 88 | loss = torch.mean(dist) 89 | loss.backward() 90 | optimizer.step() 91 | 92 | # Update hypersphere radius R on mini-batch distances 93 | if (self.objective == 'soft-boundary') and (epoch >= self.warm_up_n_epochs): 94 | self.R.data = torch.tensor(get_radius(dist, self.nu), device=self.device) 95 | 96 | loss_epoch += loss.item() 97 | n_batches += 1 98 | 99 | # log epoch statistics 100 | epoch_train_time = time.time() - epoch_start_time 101 | logger.info(' Epoch {}/{}\t Time: {:.3f}\t Loss: {:.8f}' 102 | .format(epoch + 1, self.n_epochs, epoch_train_time, loss_epoch / n_batches)) 103 | 104 | self.train_time = time.time() - start_time 105 | logger.info('Training time: %.3f' % self.train_time) 106 | 107 | logger.info('Finished training.') 108 | 109 | return net 110 | 111 | def test(self, dataset: BaseADDataset, net: BaseNet): 112 | logger = logging.getLogger() 113 | 114 | # Set device for network 115 | net = net.to(self.device) 116 | 117 | # Get test data loader 118 | _, test_loader = dataset.loaders(batch_size=self.batch_size, num_workers=self.n_jobs_dataloader) 119 | 120 | # Testing 121 | logger.info('Starting testing...') 122 | start_time = time.time() 123 | idx_label_score = [] 124 | net.eval() 125 | with torch.no_grad(): 126 | for data in test_loader: 127 | inputs, labels, idx = data 128 | inputs = inputs.to(self.device) 129 | outputs = net(inputs) 130 | dist = torch.sum((outputs - self.c) ** 2, dim=1) 131 | if self.objective == 'soft-boundary': 132 | scores = dist - self.R ** 2 133 | else: 134 | scores = dist 135 | 136 | # Save triples of (idx, label, score) in a list 137 | idx_label_score += list(zip(idx.cpu().data.numpy().tolist(), 138 | labels.cpu().data.numpy().tolist(), 139 | scores.cpu().data.numpy().tolist())) 140 | 141 | self.test_time = time.time() - start_time 142 | logger.info('Testing time: %.3f' % self.test_time) 143 | 144 | self.test_scores = idx_label_score 145 | 146 | # Compute AUC 147 | _, labels, scores = zip(*idx_label_score) 148 | labels = np.array(labels) 149 | scores = np.array(scores) 150 | 151 | self.test_auc = roc_auc_score(labels, scores) 152 | logger.info('Test set AUC: {:.2f}%'.format(100. * self.test_auc)) 153 | 154 | logger.info('Finished testing.') 155 | 156 | def init_center_c(self, train_loader: DataLoader, net: BaseNet, eps=0.1): 157 | """Initialize hypersphere center c as the mean from an initial forward pass on the data.""" 158 | n_samples = 0 159 | c = torch.zeros(net.rep_dim, device=self.device) 160 | 161 | net.eval() 162 | with torch.no_grad(): 163 | for data in train_loader: 164 | # get the inputs of the batch 165 | inputs, _, _ = data 166 | inputs = inputs.to(self.device) 167 | outputs = net(inputs) 168 | n_samples += outputs.shape[0] 169 | c += torch.sum(outputs, dim=0) 170 | 171 | c /= n_samples 172 | 173 | # If c_i is too close to 0, set to +-eps. Reason: a zero unit can be trivially matched with zero weights. 
174 | c[(abs(c) < eps) & (c < 0)] = -eps 175 | c[(abs(c) < eps) & (c > 0)] = eps 176 | 177 | return c 178 | 179 | 180 | def get_radius(dist: torch.Tensor, nu: float): 181 | """Optimally solve for radius R via the (1-nu)-quantile of distances.""" 182 | return np.quantile(np.sqrt(dist.clone().data.cpu().numpy()), 1 - nu) 183 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import Config 2 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/utils/collect_results.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | 4 | 5 | base_path = '/Users/lukasruff/Repos/Deep-SVDD-PyTorch/log/mnist/test/mnist/soft_deepSVDD' 6 | n_exps = 3 7 | n_seeds = 3 8 | 9 | exps = range(n_exps) 10 | seeds = range(1, n_seeds) 11 | 12 | for exp in exps: 13 | 14 | exp_folder = str(exp) + 'vsall' 15 | aucs = np.zeros(n_seeds, dtype=np.float32) 16 | 17 | for seed in seeds: 18 | 19 | seed_folder = 'seed_' + str(seed) 20 | file_name = 'results.json' 21 | file_path = base_path + '/' + exp_folder + '/' + seed_folder + '/' + file_name 22 | 23 | with open(file_path, 'r') as fp: 24 | results = json.load(fp) 25 | 26 | aucs[seed - 1] = results['test_auc'] 27 | 28 | mean = np.mean(aucs[aucs > 0]) 29 | std = np.std(aucs[aucs > 0]) 30 | 31 | # Write results 32 | log_file = '{}/result.txt'.format(base_path) 33 | log = open(log_file, 'a') 34 | log.write('Experiment: {}\n'.format(exp_folder)) 35 | log.write('Test Set AUC [mean]: {} %\n'.format(round(float(mean * 100), 4))) 36 | log.write('Test Set AUC [std]: {} %\n'.format(round(float(std * 100), 4))) 37 | log.write('\n') 38 | 39 | log.write('\n') 40 | log.close() 41 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/utils/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class Config(object): 5 | """Base class for experimental setting/configuration.""" 6 | 7 | def __init__(self, settings): 8 | self.settings = settings 9 | 10 | def load_config(self, import_json): 11 | """Load settings dict from import_json (path/filename.json) JSON-file.""" 12 | 13 | with open(import_json, 'r') as fp: 14 | settings = json.load(fp) 15 | 16 | for key, value in settings.items(): 17 | self.settings[key] = value 18 | 19 | def save_config(self, export_json): 20 | """Save settings dict to export_json (path/filename.json) JSON-file.""" 21 | 22 | with open(export_json, 'w') as fp: 23 | json.dump(self.settings, fp) 24 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/src/utils/visualization/plot_images_grid.py: -------------------------------------------------------------------------------- 1 | import torch 2 | # import matplotlib 3 | # matplotlib.use('Agg') # or 'PS', 'PDF', 'SVG' 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | from torchvision.utils import make_grid 8 | 9 | 10 | def plot_images_grid(x: torch.tensor, export_img, title: str = '', nrow=8, padding=2, normalize=False, pad_value=0): 11 | """Plot 4D Tensor of images of shape (B x C x H x W) as a grid.""" 12 | 13 | grid = make_grid(x, nrow=nrow, padding=padding, normalize=normalize, pad_value=pad_value) 14 | npgrid = 
grid.cpu().numpy() 15 | 16 | plt.imshow(np.transpose(npgrid, (1, 2, 0)), interpolation='nearest') 17 | 18 | ax = plt.gca() 19 | ax.xaxis.set_visible(False) 20 | ax.yaxis.set_visible(False) 21 | 22 | if not (title == ''): 23 | plt.title(title) 24 | 25 | plt.savefig(export_img, bbox_inches='tight', pad_inches=0.1) 26 | plt.clf() 27 | -------------------------------------------------------------------------------- /additional_methods/Deep-SVDD/test_instruction.txt: -------------------------------------------------------------------------------- 1 | python main.py "wine.pickle" 3 0.2 ../log/mnist_test ../../../formatted_data test.csv --objective one-class --lr 0.00001 --n_epochs 1500 --lr_milestone 500 --batch_size 200 --weight_decay 0.5e-6 --pretrain True --ae_lr 0.00001 --ae_n_epochs 1500 --ae_lr_milestone 500 --ae_batch_size 200 --ae_weight_decay 0.5e-3 --normal_class 0; 2 | -------------------------------------------------------------------------------- /additional_methods/ODIN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Outlier Detection using Indegree Number (ODIN) Algorithm 3 | """ 4 | # Author: Roel Bouman 5 | # License: BSD 2 clause 6 | 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | from sklearn.neighbors import kneighbors_graph 11 | from sklearn.utils.validation import check_is_fitted 12 | from sklearn.utils.validation import check_array 13 | 14 | from pyod.models.base import BaseDetector 15 | from pyod.utils.utility import invert_order 16 | 17 | import numpy as np 18 | 19 | #Note, PREDICT is not implemented properly yet. It looks only to 1 matrix of input data at a time. 20 | class ODIN(BaseDetector): 21 | """ 22 | """ 23 | def __init__(self, n_neighbors=20, 24 | metric='minkowski', p=2, metric_params=None, 25 | contamination=0.1, n_jobs=None): 26 | super(ODIN, self).__init__(contamination=contamination) 27 | self.n_neighbors = n_neighbors 28 | self.metric = metric 29 | self.p = p 30 | self.metric_params = metric_params 31 | self.n_jobs = n_jobs 32 | 33 | 34 | def fit(self, X, y=None): 35 | """Fit detector. y is ignored in unsupervised methods. 36 | Parameters 37 | ---------- 38 | X : numpy array of shape (n_samples, n_features) 39 | The input samples. 40 | y : Ignored 41 | Not used, present for API consistency by convention. 42 | Returns 43 | ------- 44 | self : object 45 | Fitted estimator. 46 | """ 47 | # validate inputs X and y (optional) 48 | X = check_array(X) 49 | 50 | self.knn_graph_ = kneighbors_graph(X, n_neighbors=self.n_neighbors, 51 | metric=self.metric, 52 | p=self.p, 53 | metric_params=self.metric_params, 54 | n_jobs=self.n_jobs, 55 | include_self=False) 56 | 57 | 58 | 59 | 60 | 61 | # Invert decision_scores_. Outliers comes with higher outlier scores 62 | self.decision_scores_ = invert_order(np.asarray(np.sum(self.knn_graph_, axis=0)).flatten()) 63 | self._process_decision_scores() 64 | return self 65 | 66 | def decision_function(self, X): 67 | """TEMP 68 | """ 69 | X = check_array(X) 70 | 71 | self.knn_graph_ = kneighbors_graph(X, n_neighbors=self.n_neighbors, 72 | metric=self.metric, 73 | p=self.p, 74 | metric_params=self.metric_params, 75 | n_jobs=self.n_jobs, 76 | include_self=False) 77 | 78 | 79 | 80 | 81 | # Invert decision_scores_. 
Outliers comes with higher outlier scores 82 | self.decision_scores_ = invert_order(np.asarray(np.sum(self.knn_graph_, axis=0)).flatten()) 83 | self._process_decision_scores() 84 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/backups/codestyle.ini.bak: -------------------------------------------------------------------------------- 1 | [codestyle] 2 | indentation = True 3 | edge_line = True 4 | edge_line_columns = 79 5 | 6 | [main] 7 | version = 0.2.0 8 | 9 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/backups/encoding.ini.bak: -------------------------------------------------------------------------------- 1 | [encoding] 2 | text_encoding = utf-8 3 | 4 | [main] 5 | version = 0.2.0 6 | 7 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/backups/vcs.ini.bak: -------------------------------------------------------------------------------- 1 | [vcs] 2 | use_version_control = False 3 | version_control_system = 4 | 5 | [main] 6 | version = 0.2.0 7 | 8 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/backups/workspace.ini.bak: -------------------------------------------------------------------------------- 1 | [workspace] 2 | restore_data_on_startup = True 3 | save_data_on_exit = True 4 | save_history = True 5 | save_non_project_files = False 6 | 7 | [main] 8 | version = 0.2.0 9 | recent_files = ['src\\BaseSVDD.py'] 10 | 11 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/codestyle.ini: -------------------------------------------------------------------------------- 1 | [codestyle] 2 | indentation = True 3 | edge_line = True 4 | edge_line_columns = 79 5 | 6 | [main] 7 | version = 0.2.0 8 | 9 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/defaults/defaults-codestyle-0.2.0.ini: -------------------------------------------------------------------------------- 1 | [codestyle] 2 | indentation = True 3 | edge_line = True 4 | edge_line_columns = 79 5 | 6 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/defaults/defaults-encoding-0.2.0.ini: -------------------------------------------------------------------------------- 1 | [encoding] 2 | text_encoding = utf-8 3 | 4 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/defaults/defaults-vcs-0.2.0.ini: -------------------------------------------------------------------------------- 1 | [vcs] 2 | use_version_control = False 3 | version_control_system = 4 | 5 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/defaults/defaults-workspace-0.2.0.ini: -------------------------------------------------------------------------------- 1 | [workspace] 2 | restore_data_on_startup = True 3 | 
save_data_on_exit = True 4 | save_history = True 5 | save_non_project_files = False 6 | 7 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/encoding.ini: -------------------------------------------------------------------------------- 1 | [encoding] 2 | text_encoding = utf-8 3 | 4 | [main] 5 | version = 0.2.0 6 | 7 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/vcs.ini: -------------------------------------------------------------------------------- 1 | [vcs] 2 | use_version_control = False 3 | version_control_system = 4 | 5 | [main] 6 | version = 0.2.0 7 | 8 | -------------------------------------------------------------------------------- /additional_methods/SVDD/.spyproject/config/workspace.ini: -------------------------------------------------------------------------------- 1 | [workspace] 2 | restore_data_on_startup = True 3 | save_data_on_exit = True 4 | save_history = True 5 | save_non_project_files = False 6 | 7 | [main] 8 | version = 0.2.0 9 | recent_files = [] 10 | 11 | -------------------------------------------------------------------------------- /additional_methods/SVDD/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Kepeng Qiu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /additional_methods/SVDD/README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | 5 | Support Vector Data Description (SVDD)
6 | 7 | Python code for abnormal detection or fault detection using Support Vector Data Description (SVDD)
8 | Version 1.1, 11-NOV-2021
9 | Email: iqiukp@outlook.com
10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 |
22 | 23 | ## Main features 24 | 25 | - SVDD BaseEstimator based on sklearn.base for one-class or binary classification 26 | - Multiple kinds of kernel functions (linear, gaussian, polynomial, sigmoid) 27 | - Visualization of decision boundaries for 2D data 28 | 29 | ## Requirements 30 | 31 | - cvxopt 32 | - matplotlib 33 | - numpy 34 | - scikit_learn 35 | - scikit-opt (optional, only used for parameter optimization) 36 | 37 | ## Notices 38 | 39 | - The label must be 1 for a positive sample or -1 for a negative sample. 40 | - For detailed applications, please see the examples. 41 | - This code is for reference only. 42 | 43 | ## Examples 44 | 45 | ### 01. svdd_example_unlabeled_data.py 46 | 47 | An example for SVDD model fitting using unlabeled data. 48 | 49 |
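The snippet below mirrors the shipped examples/svdd_example_unlabeled_data.py script: the model is fitted without labels, and the decision boundary and the distance curve are plotted.

```Python
import sys
sys.path.append("..")
import numpy as np
from src.BaseSVDD import BaseSVDD

# create 100 points with 2 dimensions
n = 100
dim = 2
X = np.r_[np.random.randn(n, dim)]

# svdd object using rbf kernel
svdd = BaseSVDD(C=0.9, gamma=0.3, kernel='rbf', display='on')

# fit the SVDD model
svdd.fit(X)

# predict the label
y_predict = svdd.predict(X)

# plot the boundary
svdd.plot_boundary(X)

# plot the distance
radius = svdd.radius
distance = svdd.get_distance(X)
svdd.plot_distance(radius, distance)
```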

50 | 51 | 52 |

53 | 54 | ### 02. svdd_example_hybrid_data.py 55 | 56 | An example for SVDD model fitting with negataive samples. 57 | 58 | ```Python 59 | import sys 60 | sys.path.append("..") 61 | from sklearn.datasets import load_wine 62 | from src.BaseSVDD import BaseSVDD, BananaDataset 63 | 64 | # Banana-shaped dataset generation and partitioning 65 | X, y = BananaDataset.generate(number=100, display='on') 66 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 67 | 68 | # 69 | svdd = BaseSVDD(C=0.9, gamma=0.3, kernel='rbf', display='on') 70 | 71 | # 72 | svdd.fit(X_train, y_train) 73 | 74 | # 75 | svdd.plot_boundary(X_train, y_train) 76 | 77 | # 78 | y_test_predict = svdd.predict(X_test, y_test) 79 | 80 | # 81 | radius = svdd.radius 82 | distance = svdd.get_distance(X_test) 83 | svdd.plot_distance(radius, distance) 84 | ``` 85 | 86 |

87 | 88 | 89 |

90 | 91 | ### 03. svdd_example_kernel.py 92 | 93 | An example for SVDD model fitting using different kernels. 94 | 95 | ```Python 96 | import sys 97 | sys.path.append("..") 98 | from src.BaseSVDD import BaseSVDD, BananaDataset 99 | 100 | # Banana-shaped dataset generation and partitioning 101 | X, y = BananaDataset.generate(number=100, display='on') 102 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 103 | 104 | # kernel list 105 | kernelList = {"1": BaseSVDD(C=0.9, kernel='rbf', gamma=0.3, display='on'), 106 | "2": BaseSVDD(C=0.9, kernel='poly',degree=2, display='on'), 107 | "3": BaseSVDD(C=0.9, kernel='linear', display='on') 108 | } 109 | 110 | # 111 | for i in range(len(kernelList)): 112 | svdd = kernelList.get(str(i+1)) 113 | svdd.fit(X_train, y_train) 114 | svdd.plot_boundary(X_train, y_train) 115 | ``` 116 | 117 |

118 | 119 | 120 | 121 |

122 | 123 | 124 | ### 04. svdd_example_KPCA.py 125 | 126 | An example for SVDD model fitting using nonlinear principal component. 127 | 128 | The KPCA algorithm is used to reduce the dimension of the original data. 129 | 130 | ```Python 131 | import sys 132 | sys.path.append("..") 133 | import numpy as np 134 | from src.BaseSVDD import BaseSVDD 135 | from sklearn.decomposition import KernelPCA 136 | 137 | 138 | # create 100 points with 5 dimensions 139 | X = np.r_[np.random.randn(50, 5) + 1, np.random.randn(50, 5)] 140 | y = np.append(np.ones((50, 1), dtype=np.int64), 141 | -np.ones((50, 1), dtype=np.int64), 142 | axis=0) 143 | 144 | # number of the dimensionality 145 | kpca = KernelPCA(n_components=2, kernel="rbf", gamma=0.1, fit_inverse_transform=True) 146 | X_kpca = kpca.fit_transform(X) 147 | 148 | # fit the SVDD model 149 | svdd = BaseSVDD(C=0.9, gamma=10, kernel='rbf', display='on') 150 | 151 | # fit and predict 152 | svdd.fit(X_kpca, y) 153 | y_test_predict = svdd.predict(X_kpca, y) 154 | 155 | # plot the distance curve 156 | radius = svdd.radius 157 | distance = svdd.get_distance(X_kpca) 158 | svdd.plot_distance(radius, distance) 159 | 160 | # plot the boundary 161 | svdd.plot_boundary(X_kpca, y) 162 | ``` 163 | 164 |

165 | 166 | 167 |

168 | 169 | ### 05. svdd_example_PSO.py 170 | 171 | An example for parameter optimization using PSO. 172 | 173 | "scikit-opt" is required in this example. 174 | 175 | https://github.com/guofei9987/scikit-opt 176 | 177 | 178 | ```Python 179 | import sys 180 | sys.path.append("..") 181 | from src.BaseSVDD import BaseSVDD, BananaDataset 182 | from sko.PSO import PSO 183 | import matplotlib.pyplot as plt 184 | 185 | 186 | # Banana-shaped dataset generation and partitioning 187 | X, y = BananaDataset.generate(number=100, display='off') 188 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 189 | 190 | # objective function 191 | def objective_func(x): 192 | x1, x2 = x 193 | svdd = BaseSVDD(C=x1, gamma=x2, kernel='rbf', display='off') 194 | y = 1-svdd.fit(X_train, y_train).accuracy 195 | return y 196 | 197 | # Do PSO 198 | pso = PSO(func=objective_func, n_dim=2, pop=10, max_iter=20, 199 | lb=[0.01, 0.01], ub=[1, 3], w=0.8, c1=0.5, c2=0.5) 200 | pso.run() 201 | 202 | print('best_x is', pso.gbest_x) 203 | print('best_y is', pso.gbest_y) 204 | 205 | # plot the result 206 | fig = plt.figure(figsize=(6, 4)) 207 | ax = fig.add_subplot(1, 1, 1) 208 | ax.plot(pso.gbest_y_hist) 209 | ax.yaxis.grid() 210 | plt.show() 211 | ``` 212 | 213 |

214 | 215 |

216 | 217 | ### 06. svdd_example_confusion_matrix.py 218 | 219 | An example for drawing the confusion matrix and ROC curve. 220 | 221 |

222 | 223 | 224 |

225 | 226 | ### 07. svdd_example_cross_validation.py 227 | 228 | An example for cross validation. 229 | 230 | ```Python 231 | import sys 232 | sys.path.append("..") 233 | from src.BaseSVDD import BaseSVDD, BananaDataset 234 | from sklearn.model_selection import cross_val_score 235 | 236 | 237 | # Banana-shaped dataset generation and partitioning 238 | X, y = BananaDataset.generate(number=100, display='on') 239 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 240 | 241 | # 242 | svdd = BaseSVDD(C=0.9, gamma=0.3, kernel='rbf', display='on') 243 | 244 | 245 | # cross validation (k-fold) 246 | k = 5 247 | scores = cross_val_score(svdd, X_train, y_train, cv=k, scoring='accuracy') 248 | 249 | # 250 | print("Cross validation scores:") 251 | for scores_ in scores: 252 | print(scores_) 253 | 254 | print("Mean cross validation score: {:4f}".format(scores.mean())) 255 | ``` 256 | Results 257 | ``` 258 | Cross validation scores: 259 | 0.5714285714285714 260 | 0.75 261 | 0.9642857142857143 262 | 1.0 263 | 1.0 264 | Mean cross validation score: 0.857143 265 | ``` 266 | 267 | ### 08. svdd_example_grid_search.py 268 | 269 | An example for parameter selection using grid search. 270 | 271 | ```Python 272 | import sys 273 | sys.path.append("..") 274 | from sklearn.datasets import load_wine 275 | from src.BaseSVDD import BaseSVDD, BananaDataset 276 | from sklearn.model_selection import KFold, LeaveOneOut, ShuffleSplit 277 | from sklearn.model_selection import learning_curve, GridSearchCV 278 | 279 | # Banana-shaped dataset generation and partitioning 280 | X, y = BananaDataset.generate(number=100, display='off') 281 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 282 | 283 | param_grid = [ 284 | {"kernel": ["rbf"], "gamma": [0.1, 0.2, 0.5], "C": [0.1, 0.5, 1]}, 285 | {"kernel": ["linear"], "C": [0.1, 0.5, 1]}, 286 | {"kernel": ["poly"], "C": [0.1, 0.5, 1], "degree": [2, 3, 4, 5]}, 287 | ] 288 | 289 | svdd = GridSearchCV(BaseSVDD(display='off'), param_grid, cv=5, scoring="accuracy") 290 | svdd.fit(X_train, y_train) 291 | print("best parameters:") 292 | print(svdd.best_params_) 293 | print("\n") 294 | 295 | # 296 | best_model = svdd.best_estimator_ 297 | means = svdd.cv_results_["mean_test_score"] 298 | stds = svdd.cv_results_["std_test_score"] 299 | 300 | for mean, std, params in zip(means, stds, svdd.cv_results_["params"]): 301 | print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)) 302 | print() 303 | 304 | ``` 305 | Results 306 | ```Python 307 | best parameters: 308 | {'C': 0.5, 'gamma': 0.1, 'kernel': 'rbf'} 309 | 310 | 311 | 0.921 (+/-0.159) for {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'} 312 | 0.893 (+/-0.192) for {'C': 0.1, 'gamma': 0.2, 'kernel': 'rbf'} 313 | 0.857 (+/-0.296) for {'C': 0.1, 'gamma': 0.5, 'kernel': 'rbf'} 314 | 0.950 (+/-0.086) for {'C': 0.5, 'gamma': 0.1, 'kernel': 'rbf'} 315 | 0.921 (+/-0.131) for {'C': 0.5, 'gamma': 0.2, 'kernel': 'rbf'} 316 | 0.864 (+/-0.273) for {'C': 0.5, 'gamma': 0.5, 'kernel': 'rbf'} 317 | 0.950 (+/-0.086) for {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'} 318 | 0.921 (+/-0.131) for {'C': 1, 'gamma': 0.2, 'kernel': 'rbf'} 319 | 0.864 (+/-0.273) for {'C': 1, 'gamma': 0.5, 'kernel': 'rbf'} 320 | 0.807 (+/-0.246) for {'C': 0.1, 'kernel': 'linear'} 321 | 0.821 (+/-0.278) for {'C': 0.5, 'kernel': 'linear'} 322 | 0.793 (+/-0.273) for {'C': 1, 'kernel': 'linear'} 323 | 0.879 (+/-0.184) for {'C': 0.1, 'degree': 2, 'kernel': 'poly'} 324 | 0.836 (+/-0.305) for {'C': 0.1, 'degree': 3, 'kernel': 'poly'} 325 | 0.771 (+/-0.416) 
for {'C': 0.1, 'degree': 4, 'kernel': 'poly'} 326 | 0.757 (+/-0.448) for {'C': 0.1, 'degree': 5, 'kernel': 'poly'} 327 | 0.871 (+/-0.224) for {'C': 0.5, 'degree': 2, 'kernel': 'poly'} 328 | 0.814 (+/-0.311) for {'C': 0.5, 'degree': 3, 'kernel': 'poly'} 329 | 0.800 (+/-0.390) for {'C': 0.5, 'degree': 4, 'kernel': 'poly'} 330 | 0.764 (+/-0.432) for {'C': 0.5, 'degree': 5, 'kernel': 'poly'} 331 | 0.871 (+/-0.224) for {'C': 1, 'degree': 2, 'kernel': 'poly'} 332 | 0.850 (+/-0.294) for {'C': 1, 'degree': 3, 'kernel': 'poly'} 333 | 0.800 (+/-0.390) for {'C': 1, 'degree': 4, 'kernel': 'poly'} 334 | 0.771 (+/-0.416) for {'C': 1, 'degree': 5, 'kernel': 'poly'} 335 | ``` 336 | -------------------------------------------------------------------------------- /additional_methods/SVDD/SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | Use this section to tell people about which versions of your project are 6 | currently being supported with security updates. 7 | 8 | | Version | Supported | 9 | | ------- | ------------------ | 10 | | 5.1.x | :white_check_mark: | 11 | | 5.0.x | :x: | 12 | | 4.0.x | :white_check_mark: | 13 | | < 4.0 | :x: | 14 | 15 | ## Reporting a Vulnerability 16 | 17 | Use this section to tell people how to report a vulnerability. 18 | 19 | Tell them where to go, how often they can expect to get an update on a 20 | reported vulnerability, what to expect if the vulnerability is accepted or 21 | declined, etc. 22 | -------------------------------------------------------------------------------- /additional_methods/SVDD/examples/svdd_example_KPCA.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | An example for SVDD model fitting using nonlinear principal component. 5 | 6 | The KPCA algorithm is used to reduce the dimension of the original data. 7 | 8 | """ 9 | 10 | import sys 11 | sys.path.append("..") 12 | import numpy as np 13 | from src.BaseSVDD import BaseSVDD 14 | from sklearn.decomposition import KernelPCA 15 | 16 | 17 | # create 100 points with 5 dimensions 18 | X = np.r_[np.random.randn(50, 5) + 1, np.random.randn(50, 5)] 19 | y = np.append(np.ones((50, 1), dtype=np.int64), 20 | -np.ones((50, 1), dtype=np.int64), 21 | axis=0) 22 | 23 | # number of the dimensionality 24 | kpca = KernelPCA(n_components=2, kernel="rbf", gamma=0.1, fit_inverse_transform=True) 25 | X_kpca = kpca.fit_transform(X) 26 | 27 | # fit the SVDD model 28 | svdd = BaseSVDD(C=0.9, gamma=10, kernel='rbf', display='on') 29 | 30 | # fit and predict 31 | svdd.fit(X_kpca, y) 32 | y_test_predict = svdd.predict(X_kpca, y) 33 | 34 | # plot the distance curve 35 | radius = svdd.radius 36 | distance = svdd.get_distance(X_kpca) 37 | svdd.plot_distance(radius, distance) 38 | 39 | # plot the boundary 40 | svdd.plot_boundary(X_kpca, y) 41 | 42 | -------------------------------------------------------------------------------- /additional_methods/SVDD/examples/svdd_example_PSO.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | An example for parameter optimization using PSO. 5 | 6 | "scikit-opt" is required in this examples. 
7 | 8 | https://github.com/guofei9987/scikit-opt 9 | 10 | """ 11 | 12 | import sys 13 | sys.path.append("..") 14 | from src.BaseSVDD import BaseSVDD, BananaDataset 15 | from sko.PSO import PSO 16 | import matplotlib.pyplot as plt 17 | 18 | 19 | # Banana-shaped dataset generation and partitioning 20 | X, y = BananaDataset.generate(number=100, display='off') 21 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 22 | 23 | # objective function 24 | def objective_func(x): 25 | x1, x2 = x 26 | svdd = BaseSVDD(C=x1, gamma=x2, kernel='rbf', display='off') 27 | y = 1-svdd.fit(X_train, y_train).accuracy 28 | return y 29 | 30 | # Do PSO 31 | pso = PSO(func=objective_func, n_dim=2, pop=10, max_iter=20, 32 | lb=[0.01, 0.01], ub=[1, 3], w=0.8, c1=0.5, c2=0.5) 33 | pso.run() 34 | 35 | print('best_x is', pso.gbest_x) 36 | print('best_y is', pso.gbest_y) 37 | 38 | # plot the result 39 | fig = plt.figure(figsize=(6, 4)) 40 | ax = fig.add_subplot(1, 1, 1) 41 | ax.plot(pso.gbest_y_hist) 42 | ax.yaxis.grid() 43 | plt.show() 44 | -------------------------------------------------------------------------------- /additional_methods/SVDD/examples/svdd_example_confusion_matrix.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | An example for drawing the confusion matrix and ROC curve 4 | 5 | """ 6 | import sys 7 | sys.path.append("..") 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | from src.BaseSVDD import BaseSVDD 11 | from sklearn.metrics import confusion_matrix 12 | from sklearn.metrics import ConfusionMatrixDisplay 13 | from sklearn.metrics import roc_curve, auc 14 | from sklearn.model_selection import train_test_split 15 | 16 | # generate data 17 | n = 100 18 | dim = 5 19 | X = np.r_[np.random.randn(n, dim) + 1, np.random.randn(n, dim)] 20 | y = np.append(np.ones((n, 1), dtype=np.int64), 21 | -np.ones((n, 1), dtype=np.int64), 22 | axis=0) 23 | 24 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0) 25 | 26 | # SVDD model 27 | svdd = BaseSVDD(C=0.9, gamma=0.1, kernel='rbf', display='on') 28 | svdd.fit(X_train, y_train) 29 | y_test_predict = svdd.predict(X_test, y_test) 30 | 31 | # plot the distance curve 32 | radius = svdd.radius 33 | distance = svdd.get_distance(X_test) 34 | svdd.plot_distance(radius, distance) 35 | 36 | # confusion matrix and ROC curve 37 | cm = confusion_matrix(y_test, y_test_predict) 38 | cm_display = ConfusionMatrixDisplay(cm).plot() 39 | y_score = svdd.decision_function(X_test) 40 | 41 | fpr, tpr, _ = roc_curve(y_test, y_score) 42 | roc_auc = auc(fpr, tpr) 43 | 44 | plt.figure() 45 | plt.plot(fpr, tpr, color="darkorange", lw=3, label="ROC curve (area = %0.2f)" % roc_auc) 46 | plt.plot([0, 1], [0, 1], color="navy", lw=3, linestyle="--") 47 | plt.xlim([0.0, 1.0]) 48 | plt.ylim([0.0, 1.05]) 49 | plt.xlabel("False Positive Rate") 50 | plt.ylabel("True Positive Rate") 51 | plt.title("Receiver operating characteristic") 52 | plt.legend(loc="lower right") 53 | plt.grid() 54 | plt.show() 55 | -------------------------------------------------------------------------------- /additional_methods/SVDD/examples/svdd_example_cross_validation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | An example for cross validation 4 | 5 | """ 6 | import sys 7 | sys.path.append("..") 8 | from src.BaseSVDD import BaseSVDD, BananaDataset 9 | from sklearn.model_selection import cross_val_score 
10 | 11 | 12 | # Banana-shaped dataset generation and partitioning 13 | X, y = BananaDataset.generate(number=100, display='on') 14 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 15 | 16 | # 17 | svdd = BaseSVDD(C=0.9, gamma=0.3, kernel='rbf', display='on') 18 | 19 | 20 | # cross validation (k-fold) 21 | k = 5 22 | scores = cross_val_score(svdd, X_train, y_train, cv=k, scoring='accuracy') 23 | 24 | # 25 | print("Cross validation scores:") 26 | for scores_ in scores: 27 | print(scores_) 28 | 29 | print("Mean cross validation score: {:4f}".format(scores.mean())) 30 | 31 | -------------------------------------------------------------------------------- /additional_methods/SVDD/examples/svdd_example_grid_search.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | An example for parameter selection using grid search 5 | 6 | """ 7 | import sys 8 | sys.path.append("..") 9 | from sklearn.datasets import load_wine 10 | from src.BaseSVDD import BaseSVDD, BananaDataset 11 | from sklearn.model_selection import KFold, LeaveOneOut, ShuffleSplit 12 | from sklearn.model_selection import learning_curve, GridSearchCV 13 | 14 | # Banana-shaped dataset generation and partitioning 15 | X, y = BananaDataset.generate(number=100, display='off') 16 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 17 | 18 | param_grid = [ 19 | {"kernel": ["rbf"], "gamma": [0.1, 0.2, 0.5], "C": [0.1, 0.5, 1]}, 20 | {"kernel": ["linear"], "C": [0.1, 0.5, 1]}, 21 | {"kernel": ["poly"], "C": [0.1, 0.5, 1], "degree": [2, 3, 4, 5]}, 22 | ] 23 | 24 | svdd = GridSearchCV(BaseSVDD(display='off'), param_grid, cv=5, scoring="accuracy") 25 | svdd.fit(X_train, y_train) 26 | print("best parameters:") 27 | print(svdd.best_params_) 28 | print("\n") 29 | 30 | # 31 | best_model = svdd.best_estimator_ 32 | means = svdd.cv_results_["mean_test_score"] 33 | stds = svdd.cv_results_["std_test_score"] 34 | 35 | for mean, std, params in zip(means, stds, svdd.cv_results_["params"]): 36 | print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)) 37 | print() 38 | -------------------------------------------------------------------------------- /additional_methods/SVDD/examples/svdd_example_hybrid_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | An example for SVDD model fitting with negataive samples 5 | 6 | """ 7 | import sys 8 | sys.path.append("..") 9 | from sklearn.datasets import load_wine 10 | from src.BaseSVDD import BaseSVDD, BananaDataset 11 | 12 | # Banana-shaped dataset generation and partitioning 13 | X, y = BananaDataset.generate(number=100, display='on') 14 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 15 | 16 | # 17 | svdd = BaseSVDD(C=0.9, gamma=0.3, kernel='rbf', display='on') 18 | 19 | # 20 | svdd.fit(X_train, y_train) 21 | 22 | # 23 | svdd.plot_boundary(X_train, y_train) 24 | 25 | # 26 | y_test_predict = svdd.predict(X_test, y_test) 27 | 28 | # 29 | radius = svdd.radius 30 | distance = svdd.get_distance(X_test) 31 | svdd.plot_distance(radius, distance) -------------------------------------------------------------------------------- /additional_methods/SVDD/examples/svdd_example_kernel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | An example for SVDD model fitting using different kernels 5 | 6 | """ 7 | import sys 8 | 
sys.path.append("..") 9 | from src.BaseSVDD import BaseSVDD, BananaDataset 10 | 11 | # Banana-shaped dataset generation and partitioning 12 | X, y = BananaDataset.generate(number=100, display='on') 13 | X_train, X_test, y_train, y_test = BananaDataset.split(X, y, ratio=0.3) 14 | 15 | # kernel list 16 | kernelList = {"1": BaseSVDD(C=0.9, kernel='rbf', gamma=0.3, display='on'), 17 | "2": BaseSVDD(C=0.9, kernel='poly',degree=2, display='on'), 18 | "3": BaseSVDD(C=0.9, kernel='linear', display='on') 19 | } 20 | 21 | # 22 | for i in range(len(kernelList)): 23 | svdd = kernelList.get(str(i+1)) 24 | svdd.fit(X_train, y_train) 25 | svdd.plot_boundary(X_train, y_train) 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /additional_methods/SVDD/examples/svdd_example_unlabeled_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | An example for SVDD model fitting with negataive samples 5 | 6 | """ 7 | import sys 8 | sys.path.append("..") 9 | import numpy as np 10 | from src.BaseSVDD import BaseSVDD 11 | 12 | # create 100 points with 2 dimensions 13 | n = 100 14 | dim = 2 15 | X = np.r_[np.random.randn(n, dim)] 16 | 17 | # svdd object using rbf kernel 18 | svdd = BaseSVDD(C=0.9, gamma=0.3, kernel='rbf', display='on') 19 | 20 | # fit the SVDD model 21 | svdd.fit(X) 22 | 23 | # predict the label 24 | y_predict = svdd.predict(X) 25 | 26 | # plot the boundary 27 | svdd.plot_boundary(X) 28 | 29 | # plot the distance 30 | radius = svdd.radius 31 | distance = svdd.get_distance(X) 32 | svdd.plot_distance(radius, distance) -------------------------------------------------------------------------------- /additional_methods/SVDD/requirements.txt: -------------------------------------------------------------------------------- 1 | cvxopt==1.2.7 2 | matplotlib==3.4.2 3 | numpy==1.22.0 4 | scikit_learn==1.0.1 5 | -------------------------------------------------------------------------------- /additional_methods/abod.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Angle-based Outlier Detector (ABOD) 3 | """ 4 | # Author: Yue Zhao 5 | # License: BSD 2 clause 6 | 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import warnings 11 | from itertools import combinations 12 | 13 | import numpy as np 14 | from numba import njit 15 | from sklearn.neighbors import KDTree 16 | from sklearn.neighbors import NearestNeighbors 17 | from sklearn.utils import check_array 18 | from sklearn.utils.validation import check_is_fitted 19 | 20 | from pyod.models.base import BaseDetector 21 | from pyod.utils.utility import check_parameter 22 | 23 | 24 | @njit 25 | def _wcos(curr_pt, a, b): # pragma: no cover 26 | """Internal function to calculate weighted cosine using optimized 27 | numba code. 28 | 29 | Parameters 30 | ---------- 31 | curr_pt : numpy array of shape (n_samples, n_features) 32 | Current sample to be calculated. 33 | 34 | a : numpy array of shape (n_samples, n_features) 35 | Training sample a. 36 | 37 | b : numpy array of shape (n_samples, n_features) 38 | Training sample b. 39 | 40 | Returns 41 | ------- 42 | wcos : float in range [-1, 1] 43 | Cosine similarity between a-curr_pt and b-curr_pt. 
44 | 45 | """ 46 | 47 | a_curr = a - curr_pt 48 | b_curr = b - curr_pt 49 | 50 | # wcos = (/((|a_curr|*|b_curr|)^2) 51 | wcos = np.dot(a_curr, b_curr) / ( 52 | np.linalg.norm(a_curr, 2) ** 2) / ( 53 | np.linalg.norm(b_curr, 2) ** 2) 54 | return wcos 55 | 56 | 57 | def _calculate_wocs(curr_pt, X, X_ind): 58 | """Calculated the variance of weighted cosine of a point. 59 | wcos = (/((|a_curr|*|b_curr|)^2) 60 | 61 | Parameters 62 | ---------- 63 | curr_pt : numpy array, shape (1, n_features) 64 | The sample to be calculated. 65 | 66 | X : numpy array of shape (n_samples, n_features) 67 | The training dataset. 68 | 69 | X_ind : list 70 | The valid index of the training data. 71 | 72 | Returns 73 | ------- 74 | cos_angle_var : float 75 | The variance of cosine angle 76 | 77 | """ 78 | wcos_list = [] 79 | curr_pair_inds = list(combinations(X_ind, 2)) 80 | for j, (a_ind, b_ind) in enumerate(curr_pair_inds): 81 | a = X[a_ind, :] 82 | b = X[b_ind, :] 83 | 84 | # skip if no angle can be formed 85 | if np.array_equal(a, curr_pt) or np.array_equal(b, curr_pt): 86 | wcos_list.append(0) 87 | else: 88 | # add the weighted cosine to the list 89 | wcos_list.append(_wcos(curr_pt, a, b)) 90 | return np.var(wcos_list) 91 | 92 | 93 | # noinspection PyPep8Naming 94 | class ABOD(BaseDetector): 95 | """ABOD class for Angle-base Outlier Detection. 96 | For an observation, the variance of its weighted cosine scores to all 97 | neighbors could be viewed as the outlying score. 98 | See :cite:`kriegel2008angle` for details. 99 | 100 | Two version of ABOD are supported: 101 | 102 | - Fast ABOD: use k nearest neighbors to approximate. 103 | - Original ABOD: consider all training points with high time complexity at 104 | O(n^3). 105 | 106 | Parameters 107 | ---------- 108 | contamination : float in (0., 0.5), optional (default=0.1) 109 | The amount of contamination of the data set, i.e. 110 | the proportion of outliers in the data set. Used when fitting to 111 | define the threshold on the decision function. 112 | 113 | n_neighbors : int, optional (default=10) 114 | Number of neighbors to use by default for k neighbors queries. 115 | 116 | method: str, optional (default='fast') 117 | Valid values for metric are: 118 | 119 | - 'fast': fast ABOD. Only consider n_neighbors of training points 120 | - 'default': original ABOD with all training points, which could be 121 | slow 122 | 123 | Attributes 124 | ---------- 125 | decision_scores_ : numpy array of shape (n_samples,) 126 | The outlier scores of the training data. 127 | The higher, the more abnormal. Outliers tend to have higher 128 | scores. This value is available once the detector is 129 | fitted. 130 | 131 | threshold_ : float 132 | The threshold is based on ``contamination``. It is the 133 | ``n_samples * contamination`` most abnormal samples in 134 | ``decision_scores_``. The threshold is calculated for generating 135 | binary outlier labels. 136 | 137 | labels_ : int, either 0 or 1 138 | The binary labels of the training data. 0 stands for inliers 139 | and 1 for outliers/anomalies. It is generated by applying 140 | ``threshold_`` on ``decision_scores_``. 141 | """ 142 | 143 | def __init__(self, contamination=0.1, n_neighbors=5, method='fast'): 144 | super(ABOD, self).__init__(contamination=contamination) 145 | self.method = method 146 | self.n_neighbors = n_neighbors 147 | 148 | def fit(self, X, y=None): 149 | """Fit detector. y is ignored in unsupervised methods. 
150 | 151 | Parameters 152 | ---------- 153 | X : numpy array of shape (n_samples, n_features) 154 | The input samples. 155 | 156 | y : Ignored 157 | Not used, present for API consistency by convention. 158 | 159 | Returns 160 | ------- 161 | self : object 162 | Fitted estimator. 163 | """ 164 | # validate inputs X and y (optional) 165 | X = check_array(X) 166 | self._set_n_classes(y) 167 | 168 | self.X_train_ = X 169 | self.n_train_ = X.shape[0] 170 | self.decision_scores_ = np.zeros([self.n_train_, 1]) 171 | 172 | if self.method == 'fast': 173 | self._fit_fast() 174 | elif self.method == 'default': 175 | self._fit_default() 176 | else: 177 | raise ValueError(self.method, "is not a valid method") 178 | 179 | # flip the scores 180 | self.decision_scores_ = self.decision_scores_.ravel() * -1 181 | self._process_decision_scores() 182 | return self 183 | 184 | def _fit_default(self): 185 | """Default ABOD method. Use all training points with high complexity 186 | O(n^3). For internal use only. 187 | """ 188 | for i in range(self.n_train_): 189 | curr_pt = self.X_train_[i, :] 190 | 191 | # get the index pairs of the neighbors, remove itself from index 192 | X_ind = list(range(0, self.n_train_)) 193 | X_ind.remove(i) 194 | 195 | self.decision_scores_[i, 0] = _calculate_wocs(curr_pt, 196 | self.X_train_, 197 | X_ind) 198 | return self 199 | 200 | def _fit_fast(self): 201 | """Fast ABOD method. Only use n_neighbors for angle calculation. 202 | Internal use only 203 | """ 204 | 205 | # make sure the n_neighbors is in the range 206 | if self.n_neighbors >= self.n_train_: 207 | self.n_neighbors = self.n_train_ - 1 208 | warnings.warn("n_neighbors is set to the number of " 209 | "training points minus 1: {0}".format(self.n_train_)) 210 | 211 | check_parameter(self.n_neighbors, 1, self.n_train_, 212 | include_left=True, include_right=True) 213 | 214 | self.tree_ = KDTree(self.X_train_) 215 | 216 | neigh = NearestNeighbors(n_neighbors=self.n_neighbors) 217 | neigh.fit(self.X_train_) 218 | ind_arr = neigh.kneighbors(n_neighbors=self.n_neighbors, 219 | return_distance=False) 220 | 221 | for i in range(self.n_train_): 222 | curr_pt = self.X_train_[i, :] 223 | X_ind = ind_arr[i, :] 224 | self.decision_scores_[i, 0] = _calculate_wocs(curr_pt, 225 | self.X_train_, 226 | X_ind) 227 | return self 228 | 229 | # noinspection PyPep8Naming 230 | def decision_function(self, X): 231 | """Predict raw anomaly score of X using the fitted detector. 232 | 233 | The anomaly score of an input sample is computed based on different 234 | detector algorithms. For consistency, outliers are assigned with 235 | larger anomaly scores. 236 | 237 | Parameters 238 | ---------- 239 | X : numpy array of shape (n_samples, n_features) 240 | The training input samples. Sparse matrices are accepted only 241 | if they are supported by the base estimator. 242 | 243 | Returns 244 | ------- 245 | anomaly_scores : numpy array of shape (n_samples,) 246 | The anomaly score of the input samples. 247 | """ 248 | 249 | check_is_fitted(self, ['X_train_', 'n_train_', 'decision_scores_', 250 | 'threshold_', 'labels_']) 251 | X = check_array(X) 252 | 253 | if self.method == 'fast': # fast ABOD 254 | # outliers have higher outlier scores 255 | return self._decision_function_fast(X) * -1 256 | else: # default ABOD 257 | return self._decision_function_default(X) * -1 258 | 259 | def _decision_function_default(self, X): 260 | """Internal method for predicting outlier scores using default ABOD. 
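        For every query sample this variant enumerates all pairs of training
        points (``itertools.combinations`` over the full training index set), so it
        performs on the order of ``n_train_ ** 2`` weighted-cosine evaluations per
        sample, which is why the 'default' method is so much slower than 'fast'.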
261 | 262 | Parameters 263 | ---------- 264 | X : numpy array of shape (n_samples, n_features) 265 | The training input samples. 266 | 267 | Returns 268 | ------- 269 | pred_score : array, shape (n_samples,) 270 | The anomaly score of the input samples. 271 | 272 | """ 273 | # initialize the output score 274 | pred_score = np.zeros([X.shape[0], 1]) 275 | 276 | for i in range(X.shape[0]): 277 | curr_pt = X[i, :] 278 | # get the index pairs of the neighbors 279 | X_ind = list(range(0, self.n_train_)) 280 | pred_score[i, :] = _calculate_wocs(curr_pt, self.X_train_, X_ind) 281 | 282 | return pred_score.ravel() 283 | 284 | def _decision_function_fast(self, X): 285 | """Internal method for predicting outlier scores using Fast ABOD. 286 | 287 | Parameters 288 | ---------- 289 | X : numpy array of shape (n_samples, n_features) 290 | The training input samples. 291 | 292 | Returns 293 | ------- 294 | pred_score : array, shape (n_samples,) 295 | The anomaly score of the input samples. 296 | 297 | """ 298 | 299 | check_is_fitted(self, ['tree_']) 300 | # initialize the output score 301 | pred_score = np.zeros([X.shape[0], 1]) 302 | 303 | # get the indexes of the X's k nearest training points 304 | _, ind_arr = self.tree_.query(X, k=self.n_neighbors) 305 | 306 | for i in range(X.shape[0]): 307 | curr_pt = X[i, :] 308 | X_ind = ind_arr[i, :] 309 | pred_score[i, :] = _calculate_wocs(curr_pt, self.X_train_, X_ind) 310 | 311 | return pred_score.ravel() -------------------------------------------------------------------------------- /additional_methods/ensemble.py: -------------------------------------------------------------------------------- 1 | from sklearn.utils import check_array 2 | 3 | from pyod.models.base import BaseDetector 4 | from pyod.models.combination import average 5 | from pyod.models.lof import LOF 6 | 7 | import numpy as np 8 | 9 | class Ensemble(BaseDetector): 10 | 11 | def __init__(self, estimators=[LOF()], combination_function=average, contamination=0.1, **kwargs): 12 | super(Ensemble, self).__init__(contamination=contamination) 13 | self.estimators = estimators 14 | self.n_estimators_ = len(estimators) 15 | self.combination_function = combination_function 16 | self.kwargs = kwargs 17 | 18 | def fit(self, X, y=None): 19 | X = check_array(X) 20 | n_samples = X.shape[0] 21 | 22 | all_scores = np.zeros((n_samples,self.n_estimators_)) 23 | 24 | for i, estimator in enumerate(self.estimators): 25 | estimator.fit(X) 26 | all_scores[:,i] = estimator.decision_scores_ 27 | 28 | self.decision_scores_ = self.combination_function(all_scores, **self.kwargs) 29 | 30 | return self 31 | 32 | def decision_function(self, X): 33 | n_samples = X.shape[0] 34 | 35 | all_scores = np.zeros((n_samples,self.n_estimators_)) 36 | 37 | for i, estimator in enumerate(self.estimators): 38 | all_scores[:,i] = estimator.decision_function(X) 39 | 40 | return self.combination_function(all_scores, **self.kwargs) -------------------------------------------------------------------------------- /additional_methods/gen2out/gen2out.py: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Author: Jeremy (Meng-Chieh) Lee # 3 | # Email : mengchil@cs.cmu.edu # 4 | #################################### 5 | 6 | 7 | import numpy as np 8 | from scipy.spatial.distance import cityblock 9 | 10 | from sklearn.cluster import DBSCAN 11 | from sklearn.linear_model import LinearRegression 12 | 13 | import time 14 | from joblib import Parallel, delayed 15 | from tqdm 
import tqdm 16 | 17 | from .iforest import IsolationForest 18 | 19 | from pyod.utils.utility import invert_order 20 | 21 | 22 | class gen2Out: 23 | def __init__(self, lower_bound=9, upper_bound=12, max_depth=7, 24 | rotate=True, contamination='auto', random_state=None): 25 | self.lower_bound = lower_bound 26 | self.upper_bound = upper_bound 27 | self.max_depth = max_depth 28 | self.rotate = rotate 29 | self.contamination = contamination if contamination == 'auto' else float(contamination) 30 | self.random_state = random_state 31 | 32 | def func(self, Xs, i): 33 | ### Fit forest with full-grown trees 34 | clf = IsolationForest(random_state=self.random_state, 35 | max_samples=len(Xs), 36 | contamination=self.contamination, 37 | rotate=self.rotate).fit(Xs, max_depth=100000000) 38 | depths = np.mean(clf._compute_actual_depth(Xs), axis=0) 39 | bins = np.arange(int(depths.min()), int(depths.max() + 2)) 40 | y, x = np.histogram(depths, bins=bins) 41 | return i, x[np.argmax(y)] 42 | 43 | def fit(self, X, y=None): 44 | if self.random_state: 45 | np.random.seed(self.random_state) 46 | self.n_sample = X.shape[0] 47 | 48 | params_arr = Parallel(n_jobs=self.upper_bound-self.lower_bound)( 49 | [delayed(self.func)(X[np.random.choice(self.n_sample, 2 ** i, replace=True)], i) 50 | for i in np.arange(self.lower_bound, self.upper_bound)]) 51 | x_arr, y_arr = np.array(params_arr).T 52 | 53 | self.reg = LinearRegression(fit_intercept=False).fit(x_arr.reshape(-1, 1), y_arr) 54 | self.clf = IsolationForest(random_state=self.random_state, 55 | max_samples=len(X), 56 | contamination=self.contamination, 57 | rotate=self.rotate).fit(X, max_depth=self.max_depth) 58 | 59 | return self 60 | 61 | def average_path_length(self, n): 62 | n = np.array(n) 63 | apl = self.reg.predict(np.log2([n]).T) 64 | apl[apl < 0] = 0 65 | return apl 66 | 67 | def decision_function(self, X): 68 | depths, leaves = self.clf._compute_actual_depth_leaf(X) 69 | 70 | new_depths = np.zeros(X.shape[0]) 71 | for d, l in zip(depths, leaves): 72 | new_depths += d + self.average_path_length(l) 73 | 74 | scores = 2 ** (-new_depths 75 | / (len(self.clf.estimators_) 76 | * self.average_path_length([self.n_sample]))) 77 | 78 | return invert_order(scores) 79 | 80 | def point_anomaly_scores(self, X): 81 | self = self.fit(X) 82 | return self.decision_function(X) 83 | 84 | def group_anomaly_scores(self, X, trials=10): 85 | ### Fit a sequence of gen2Out0 86 | self.min_rate = int(np.log2(len(X)) - 8) + 1 87 | self.scores = np.zeros((self.min_rate, trials, len(X))) 88 | 89 | print('Fitting gen2Out0...') 90 | for i in tqdm(range(self.min_rate)): 91 | for j in range(trials): 92 | X_sampled = X[np.random.choice(len(X), int(len(X) * (1 / (2 ** i))))] 93 | clf = self.fit(X_sampled) 94 | self.scores[i][j] = clf.decision_function(X) 95 | 96 | ### Create X-ray plot 97 | smax = np.max(np.mean(self.scores, axis=1), axis=0) 98 | self.threshold = np.mean(smax) + 3 * np.std(smax) 99 | 100 | sr_list = [] 101 | xrays = np.max(np.mean(self.scores, axis=1), axis=0) 102 | for idx, xray in enumerate(xrays): 103 | if xray >= self.threshold: 104 | sr_list.append(idx) 105 | sr_list = np.array(sr_list) 106 | 107 | ### Outlier grouping 108 | groups = DBSCAN().fit_predict(X[sr_list]) 109 | 110 | self.labels = -np.ones(len(X)).astype(int) 111 | for idx, g in zip(sr_list, groups): 112 | if g != -1: 113 | self.labels[idx] = g + 1 114 | 115 | ### Compute iso-curves 116 | xline = 1 / (2 ** np.arange(0, self.min_rate)) 117 | self.s_arr = [[] for l in np.unique(self.labels) if l != -1] 118 | 
xrays_max = np.argmax(np.mean(self.scores, axis=1), axis=0) 119 | for idx in sr_list: 120 | if self.labels[idx] != -1: 121 | dis = cityblock([np.log2(xrays_max[idx]) / 10 + 1, xrays[idx]], [1, 1]) 122 | self.s_arr[self.labels[idx]-1].append((2 - dis) / 2) 123 | 124 | ga_scores = np.array([np.median(s) for s in self.s_arr]) 125 | ga_indices = [np.where(self.labels == l)[0] for l in np.unique(self.labels) if l != -1] 126 | 127 | return ga_scores, ga_indices 128 | -------------------------------------------------------------------------------- /additional_methods/gen2out/main.py: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Author: Jeremy (Meng-Chieh) Lee # 3 | # Email : mengchil@cs.cmu.edu # 4 | #################################### 5 | 6 | 7 | import numpy as np 8 | import time 9 | import argparse 10 | 11 | from .gen2out import gen2Out 12 | from .utils import sythetic_group_anomaly, plot_results 13 | 14 | 15 | if __name__ == '__main__': 16 | 17 | parser = argparse.ArgumentParser(description='Parameters for gen2Out') 18 | parser.add_argument('--lower_bound', default=9, type=int, help='Lower bound of sampling (2^i)') 19 | parser.add_argument('--upper_bound', default=12, type=int, help='Upper bound of sampling (2^i)') 20 | parser.add_argument('--max_depth', default=7, type=int, help='Maximum depth of each tree') 21 | parser.add_argument('--rotate', default=True, type=bool, help='Whether to use the rotated IF or not') 22 | parser.add_argument('--contamination', default='auto', type=str, help='Contamination rate of the dataset') 23 | parser.add_argument('--random_state', default=0, type=int, help='Control the randomness') 24 | args = parser.parse_args() 25 | 26 | model = gen2Out(lower_bound=args.lower_bound, 27 | upper_bound=args.upper_bound, 28 | max_depth=args.max_depth, 29 | rotate=args.rotate, 30 | contamination=args.contamination, 31 | random_state=args.random_state) 32 | 33 | X = sythetic_group_anomaly() 34 | 35 | print('Start point anomaly detection:') 36 | t1 = time.time() 37 | pscores = model.point_anomaly_scores(X) 38 | t2 = time.time() 39 | print('Finish in %.1f seconds!\n' % (t2 - t1)) 40 | 41 | print('Start group anomaly detection:') 42 | t1 = time.time() 43 | gscores = model.group_anomaly_scores(X) 44 | t2 = time.time() 45 | print('Finish in %.1f seconds!\n' % (t2 - t1)) 46 | 47 | print('Generating plots...') 48 | plot_results(X, model) 49 | print('Finish!') 50 | 51 | 52 | -------------------------------------------------------------------------------- /additional_methods/gen2out/utils.py: -------------------------------------------------------------------------------- 1 | #################################### 2 | # Author: Jeremy (Meng-Chieh) Lee # 3 | # Email : mengchil@cs.cmu.edu # 4 | #################################### 5 | 6 | 7 | import numpy as np 8 | from scipy.spatial.distance import cityblock, euclidean 9 | 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | def uni_disk(n, low=0, high=1): 14 | r = np.random.uniform(low=low, high=high, size=n) # radius 15 | theta = np.random.uniform(low=0, high=2*np.pi, size=n) # angle 16 | x = np.sqrt(r) * np.cos(theta) 17 | y = np.sqrt(r) * np.sin(theta) 18 | return x, y 19 | 20 | def sythetic_group_anomaly(seed=0): 21 | np.random.seed(seed) 22 | 23 | x1, y1 = uni_disk(100000) 24 | x1 *= 5 25 | y1 *= 5 26 | 27 | x2, y2 = uni_disk(1000) 28 | x2 = x2 * 1.5 + 10 29 | y2 = y2 * 1.5 + 5 30 | 31 | x3, y3 = uni_disk(2000) 32 | x3 = x3 * 6 + 3 33 | y3 = y3 - 10 34 
| 35 | x4 = [11, -2, 13, 14] 36 | y4 = [0, 9, -10, 10] 37 | 38 | x = np.concatenate([x1, x2, x3, x4]) 39 | y = np.concatenate([y1, y2, y3, y4]) 40 | X_norm = np.array([x, y]).T 41 | 42 | return X_norm 43 | 44 | def plot_xray(X, model, idx_arr, line=False): 45 | plt.scatter(1, 1, s=100, c='k', marker='*') 46 | xline = 1 / (2 ** np.arange(0, model.min_rate)) 47 | 48 | for idx in idx_arr: 49 | s = model.scores.T[idx].T 50 | std, mean = np.std(s, axis=1), np.mean(s, axis=1) 51 | if line: 52 | plt.plot(xline, mean, c='k', alpha=0.7) 53 | plt.fill_between(xline, mean-std, mean+std, color='grey', alpha=0.2) 54 | 55 | max_idx = np.argmax(mean) 56 | plt.scatter(xline[max_idx], mean[max_idx], s=20, c='k') 57 | 58 | plt.plot([2 ** (-(model.min_rate - 0.7)), 1.2], [model.threshold, model.threshold], '--', label='Mean + 3 * Std', alpha=0.8, c='r') 59 | plt.ylim(-0.05, 1.05) 60 | plt.xlim(2 ** (-(model.min_rate - 0.7)), 1.2) 61 | plt.xscale('log', base=2) 62 | plt.xlabel('Qualification Rate', fontsize=20) 63 | plt.ylabel('Anomaly Score', fontsize=20) 64 | plt.legend(fontsize=12) 65 | 66 | def plot_results(X, model): 67 | ### Randomly sample when plotting 68 | idx_arr = np.concatenate([np.arange(300), 69 | np.arange(100000, 100300), 70 | np.arange(101000, 101300), 71 | np.arange(103000, 103004)]) 72 | 73 | ### Plot heatmap 74 | plt.figure(figsize=(4.8, 4)) 75 | plt.hexbin(X[:, 0], X[:, 1], cmap='cool', gridsize=30, bins='log', mincnt=1) 76 | plt.colorbar() 77 | plt.tight_layout() 78 | plt.savefig('results/step0_heatmap.png') 79 | 80 | ### Step 1: X-ray plot 81 | plt.figure(figsize=(4, 4)) 82 | plot_xray(X, model, idx_arr, line=True) 83 | plt.tight_layout() 84 | plt.savefig('results/step1_xray_plot.png') 85 | 86 | ### Step 2: Apex extraction 87 | plt.figure(figsize=(4, 4)) 88 | plot_xray(X, model, idx_arr, line=False) 89 | plt.tight_layout() 90 | plt.savefig('results/step2_apex_extraction.png') 91 | 92 | ### Step 3: Outlier grouping 93 | c_arr = ['', 'b', 'r', 'y', 'm', 'g', 'c'] 94 | plt.figure(figsize=(4, 4)) 95 | plt.scatter(X[:, 0], X[:, 1], c='lightgrey', alpha=0.5) 96 | 97 | for l in np.unique(model.labels): 98 | if l != -1: 99 | idx = np.where(model.labels == l)[0] 100 | plt.scatter(X[idx, 0], X[idx, 1], c=c_arr[l], label='GA ' + str(l)) 101 | 102 | plt.legend(fontsize=12) 103 | plt.tight_layout() 104 | plt.savefig('results/step3_outlier_grouping.png') 105 | 106 | ### Step 4: Anomaly iso-curves 107 | man_x, man_y, man_dis = [], [], [] 108 | for i in np.arange(0, model.min_rate, 0.01): 109 | for j in np.arange(0, 1.01, 0.01): 110 | ix = 1 / (2 ** i) 111 | man_x.append(ix) 112 | man_y.append(j) 113 | man_dis.append(cityblock([np.log2(ix) / 10, j], [1, 1])) 114 | man_x, man_y, man_dis = np.array(man_x), np.array(man_y), np.array(man_dis) 115 | 116 | plt.figure(figsize=(4.8, 4)) 117 | plt.scatter(man_x, man_y, c=man_dis, cmap='gist_rainbow', alpha=0.1) 118 | plt.colorbar() 119 | plt.scatter(1, 1, s=100, c='k', marker='*') 120 | 121 | xline = 1 / (2 ** np.arange(0, model.min_rate)) 122 | for idx in idx_arr: 123 | if model.labels[idx] != -1: 124 | c = c_arr[model.labels[idx]] 125 | s = model.scores.T[idx].T 126 | std, mean = np.std(s, axis=1), np.mean(s, axis=1) 127 | plt.plot(xline, mean, c=c, alpha=0.05) 128 | max_idx = np.argmax(mean) 129 | plt.scatter(xline[max_idx], mean[max_idx], s=20, c=c) 130 | for l in np.unique(model.labels): 131 | if l != -1: 132 | plt.plot([], [], '-o', c=c_arr[l], label='GA ' + str(l)) 133 | 134 | plt.xscale('log', base=2) 135 | plt.ylim(-0.05, 1.05) 136 | plt.xlim(2 ** 
(-(model.min_rate - 0.7)), 1.2) 137 | plt.xlabel('Qualification Rate', fontsize=20) 138 | plt.ylabel('Anomaly Score', fontsize=20) 139 | plt.legend(fontsize=12, loc=4) 140 | plt.tight_layout() 141 | plt.savefig('results/step4_anomaly_isocurves.png') 142 | 143 | ### Step 5: Scoring 144 | plt.figure(figsize=(4.4, 4)) 145 | 146 | for idx, s in enumerate(model.s_arr): 147 | ymin, ymax = np.min(s), np.max(s) 148 | Q1, Q3 = np.percentile(s, 25), np.percentile(s, 75) 149 | m = np.median(s) 150 | plt.scatter([idx, idx], [ymin, ymax], facecolors='none', edgecolors='lightgrey') 151 | plt.plot([idx, idx], [Q1, Q3], c='grey', linewidth=0.9) 152 | plt.plot([idx-0.12, idx+0.12], [Q1, Q1], c='grey', linewidth=0.9) 153 | plt.plot([idx-0.12, idx+0.12], [Q3, Q3], c='grey', linewidth=0.9) 154 | plt.plot([idx-0.24, idx+0.24], [m, m], c='r', linewidth=3) 155 | 156 | plt.xticks(np.arange(len(model.s_arr)), ['GA '+str(i+1) for i in range(len(model.s_arr))], fontsize=12) 157 | plt.xlabel('Generalized Anomaly ID', fontsize=20) 158 | plt.ylabel('Distribution of\nAnomaly Score', fontsize=20) 159 | plt.tight_layout() 160 | plt.savefig('results/step5_scoring.png') -------------------------------------------------------------------------------- /additional_methods/lmdd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Linear Model Deviation-base outlier detection (LMDD). 3 | """ 4 | # Author: Yahya Almardeny 5 | # License: BSD 2 clause 6 | 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from numba import njit, prange 12 | from scipy import stats 13 | from sklearn.utils import check_array, check_random_state 14 | 15 | from pyod.utils import check_parameter 16 | from pyod.models.base import BaseDetector 17 | 18 | 19 | @njit 20 | def _aad(X): 21 | """Internal Function to Calculate Average Absolute Deviation 22 | (a.k.a Mean Absolute Deviation) 23 | """ 24 | return np.mean(np.absolute(X - np.mean(X))) 25 | 26 | @njit(parallel=True) 27 | def _dis(X, dis_measure_=_aad): 28 | """ 29 | Internal function to calculate for 30 | dissimilarity in a sequence of sets. 31 | """ 32 | n = X.shape[0] 33 | res_ = np.zeros((n,)) 34 | _var = np.zeros((n,)) 35 | var_max, j = -np.inf, 0 36 | # this can be vectorized but just for comforting memory 37 | for i in prange(1, n): 38 | _var[i] = dis_measure_(X[:i + 1]) - dis_measure_(X[:i]) 39 | 40 | j = np.argmax(_var) 41 | var_max = _var[j] 42 | 43 | if var_max > res_[j]: 44 | res_[j] = var_max 45 | 46 | for k in prange(j + 1, n): 47 | dk_diff = dis_measure_(np.vstack((X[:j], np.expand_dims(X[k], axis=0)))) - dis_measure_(np.vstack((X[:j + 1], np.expand_dims(X[k], axis=0)))) 48 | 49 | if dk_diff >= 0: 50 | res_[k] = dk_diff + var_max 51 | 52 | return res_ 53 | 54 | def _check_params(n_iter, dis_measure, random_state): 55 | """Internal function to check for and validate class parameters. 56 | Also, to return random state instance and the appropriate dissimilarity 57 | measure if valid. 
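        For instance, ``_check_params(50, 'aad', 0)`` returns a seeded
        ``RandomState`` together with the ``_aad`` function, while ``'var'`` and
        ``'iqr'`` map to ``numpy.var`` and ``scipy.stats.iqr`` respectively.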
58 | """ 59 | if isinstance(n_iter, int): 60 | check_parameter(n_iter, low=1, param_name='n_iter') 61 | else: 62 | raise TypeError("n_iter should be int, got %s" % n_iter) 63 | 64 | if isinstance(dis_measure, str): 65 | if dis_measure not in ('aad', 'var', 'iqr'): 66 | raise ValueError("Unknown dissimilarity measure type, " 67 | "dis_measure should be in " 68 | "(\'aad\', \'var\', \'iqr\'), " 69 | "got %s" % dis_measure) 70 | # TO-DO: 'mad': Median Absolute Deviation to be added 71 | # once Scipy stats version 1.3.0 is released 72 | else: 73 | raise TypeError("dis_measure should be str, got %s" % dis_measure) 74 | 75 | return check_random_state(random_state), _aad if dis_measure == 'aad' \ 76 | else (np.var if dis_measure == 'var' 77 | else (stats.iqr if dis_measure == 'iqr' else None)) 78 | 79 | 80 | class LMDD(BaseDetector): 81 | """Linear Method for Deviation-based Outlier Detection. 82 | 83 | LMDD employs the concept of the smoothing factor which 84 | indicates how much the dissimilarity can be reduced by 85 | removing a subset of elements from the data-set. 86 | Read more in the :cite:`arning1996linear`. 87 | 88 | Note: this implementation has minor modification to make it output scores 89 | instead of labels. 90 | 91 | Parameters 92 | ---------- 93 | contamination : float in (0., 0.5), optional (default=0.1) 94 | The amount of contamination of the data set, i.e. 95 | the proportion of outliers in the data set. Used when fitting to 96 | define the threshold on the decision function. 97 | 98 | n_iter : int, optional (default=50) 99 | Number of iterations where in each iteration, 100 | the process is repeated after randomizing the order of the input. 101 | Note that n_iter is a very important factor that affects the accuracy. 102 | The higher the better the accuracy and the longer the execution. 103 | 104 | dis_measure: str, optional (default='aad') 105 | Dissimilarity measure to be used in calculating the smoothing factor 106 | for points, options available: 107 | 108 | - 'aad': Average Absolute Deviation 109 | - 'var': Variance 110 | - 'iqr': Interquartile Range 111 | 112 | random_state : int, RandomState instance or None, optional (default=None) 113 | If int, random_state is the seed used by the random number generator; 114 | If RandomState instance, random_state is the random number generator; 115 | If None, the random number generator is the RandomState instance used 116 | by `np.random`. 117 | 118 | Attributes 119 | ---------- 120 | decision_scores_ : numpy array of shape (n_samples,) 121 | The outlier scores of the training data. 122 | The higher, the more abnormal. Outliers tend to have higher 123 | scores. This value is available once the detector is fitted. 124 | 125 | threshold_ : float 126 | The threshold is based on ``contamination``. It is the 127 | ``n_samples * contamination`` most abnormal samples in 128 | ``decision_scores_``. The threshold is calculated for generating 129 | binary outlier labels. 130 | 131 | labels_ : int, either 0 or 1 132 | The binary labels of the training data. 0 stands for inliers 133 | and 1 for outliers/anomalies. It is generated by applying 134 | ``threshold_`` on ``decision_scores_``. 
135 | """ 136 | 137 | def __init__(self, contamination=0.1, n_iter=50, dis_measure='aad', 138 | random_state=None): 139 | super(LMDD, self).__init__(contamination=contamination) 140 | self.n_iter, self.n_iter_ = n_iter, n_iter 141 | self.dis_measure, self.dis_measure_ = dis_measure, dis_measure 142 | 143 | # add this assignment to prevent clone error; not being used. 144 | self.random_state = random_state 145 | self.random_state_, self.dis_measure_ = _check_params(n_iter, 146 | dis_measure, 147 | random_state) 148 | 149 | def fit(self, X, y=None): 150 | """Fit detector. y is ignored in unsupervised methods. 151 | 152 | Parameters 153 | ---------- 154 | X : numpy array of shape (n_samples, n_features) 155 | The input samples. 156 | 157 | y : Ignored 158 | Not used, present for API consistency by convention. 159 | 160 | Returns 161 | ------- 162 | self : object 163 | Fitted estimator. 164 | """ 165 | X = check_array(X) 166 | self._set_n_classes(y) 167 | self.decision_scores_ = self.decision_function(X) 168 | self._process_decision_scores() 169 | return self 170 | 171 | def decision_function(self, X): 172 | """Predict raw anomaly score of X using the fitted detector. 173 | 174 | The anomaly score of an input sample is computed based on different 175 | detector algorithms. For consistency, outliers are assigned with 176 | larger anomaly scores. 177 | 178 | Parameters 179 | ---------- 180 | X : numpy array of shape (n_samples, n_features) 181 | The training input samples. Sparse matrices are accepted only 182 | if they are supported by the base estimator. 183 | 184 | Returns 185 | ------- 186 | anomaly_scores : numpy array of shape (n_samples,) 187 | The anomaly score of the input samples. 188 | """ 189 | return self.__sf(X) 190 | 191 | # def __dis(self, X): 192 | # """ 193 | # Internal function to calculate for 194 | # dissimilarity in a sequence of sets. 195 | # """ 196 | # res_ = np.zeros(shape=(X.shape[0],)) 197 | # var_max, j = -np.inf, 0 198 | # # this can be vectorized but just for comforting memory 199 | # test = [] 200 | # for i in range(1, X.shape[0]): 201 | # _var = self.dis_measure_(X[:i + 1]) - self.dis_measure_(X[:i]) 202 | # test.append(_var) 203 | # if _var > var_max: 204 | # var_max = _var 205 | # j = i 206 | 207 | 208 | 209 | # if var_max > res_[j]: 210 | # res_[j] = var_max 211 | 212 | # for k in range(j + 1, X.shape[0]): 213 | # dk_diff = self.dis_measure_(np.vstack((X[:j], X[k])))\ 214 | # - self.dis_measure_(np.vstack((X[:j + 1], X[k]))) 215 | # if dk_diff >= 0: 216 | # res_[k] = dk_diff + var_max 217 | 218 | # return res_ 219 | 220 | 221 | 222 | 223 | def __sf(self, X): 224 | """Internal function to calculate for Smoothing Factors of data points 225 | Repeated n_iter_ of times in randomized mode. 226 | """ 227 | dis_ = np.zeros(shape=(X.shape[0],)) 228 | card_ = np.zeros(shape=(X.shape[0],)) 229 | # perform one process with the original input order 230 | itr_res = _dis(X) 231 | np.put(card_, X.shape[0] - sum([i > 0. for i in itr_res]), 232 | np.where(itr_res > 0.)) 233 | 234 | # create a copy of random state to preserve original state for 235 | # future fits (if any) 236 | random_state = np.random.RandomState( 237 | seed=self.random_state_.get_state()[1][0]) 238 | indices = np.arange(X.shape[0]) 239 | for _ in range(self.n_iter_): 240 | ind_ = indices 241 | random_state.shuffle(ind_) 242 | _x = X[indices] 243 | # get dissimilarity of this iteration and restore original order 244 | itr_res = _dis(_x)[np.argsort(ind_)] 245 | current_card = X.shape[0] - sum([i > 0. 
for i in itr_res]) 246 | # compare with previous iteration to get the maximal dissimilarity 247 | for i, j in enumerate(itr_res): 248 | if j > dis_[i]: 249 | dis_[i] = j 250 | card_[i] = current_card 251 | # Increase random state seed by one to reorder input next iteration 252 | random_state.seed(random_state.get_state()[1][0] + 1) 253 | 254 | return np.multiply(dis_, card_) 255 | -------------------------------------------------------------------------------- /additional_methods/sod.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Subspace Outlier Detection (SOD) 3 | """ 4 | # Author: Yahya Almardeny 5 | # License: BSD 2 clause 6 | 7 | import numba as nb 8 | import numpy as np 9 | from sklearn.neighbors import NearestNeighbors 10 | from sklearn.utils import check_array 11 | 12 | from pyod.models.base import BaseDetector 13 | from pyod.utils.utility import check_parameter 14 | 15 | 16 | @nb.jit(nopython=True, parallel=True) 17 | def _snn_imp(ind, ref_set_): 18 | n = ind.shape[0] 19 | _count = np.zeros((n, ref_set_), dtype=np.uint32) 20 | 21 | for i in nb.prange(n): 22 | temp = np.zeros(n, dtype=np.uint32) 23 | test_element_set = ind[i] 24 | 25 | for j in range(n): 26 | count = 0 27 | for idx in ind[j]: 28 | if idx in test_element_set: 29 | count += 1 30 | temp[j] = count 31 | 32 | temp[i] = np.iinfo(np.uint32).max 33 | _count[i] = np.argsort(temp)[::-1][1:ref_set_ + 1] 34 | 35 | return _count 36 | 37 | class SOD(BaseDetector): 38 | """Subspace outlier detection (SOD) schema aims to detect outlier in 39 | varying subspaces of a high dimensional feature space. For each data 40 | object, SOD explores the axis-parallel subspace spanned by the data 41 | object's neighbors and determines how much the object deviates from the 42 | neighbors in this subspace. 43 | 44 | See :cite:`kriegel2009outlier` for details. 45 | 46 | Parameters 47 | ---------- 48 | n_neighbors : int, optional (default=20) 49 | Number of neighbors to use by default for k neighbors queries. 50 | 51 | ref_set: int, optional (default=10) 52 | specifies the number of shared nearest neighbors to create the 53 | reference set. Note that ref_set must be smaller than n_neighbors. 54 | 55 | alpha: float in (0., 1.), optional (default=0.8) 56 | specifies the lower limit for selecting subspace. 57 | 0.8 is set as default as suggested in the original paper. 58 | 59 | contamination : float in (0., 0.5), optional (default=0.1) 60 | The amount of contamination of the data set, i.e. 61 | the proportion of outliers in the data set. Used when fitting to 62 | define the threshold on the decision function. 63 | 64 | Attributes 65 | ---------- 66 | decision_scores_ : numpy array of shape (n_samples,) 67 | The outlier scores of the training data. 68 | The higher, the more abnormal. Outliers tend to have higher 69 | scores. This value is available once the detector is 70 | fitted. 71 | 72 | threshold_ : float 73 | The threshold is based on ``contamination``. It is the 74 | ``n_samples * contamination`` most abnormal samples in 75 | ``decision_scores_``. The threshold is calculated for generating 76 | binary outlier labels. 77 | 78 | labels_ : int, either 0 or 1 79 | The binary labels of the training data. 0 stands for inliers 80 | and 1 for outliers/anomalies. It is generated by applying 81 | ``threshold_`` on ``decision_scores_``. 
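        A minimal usage sketch (the ``from sod import SOD`` import path is an
        assumption; note that ``ref_set`` must stay below ``n_neighbors``):

        >>> import numpy as np
        >>> from sod import SOD
        >>> X = np.random.randn(300, 10)
        >>> clf = SOD(n_neighbors=20, ref_set=10, alpha=0.8).fit(X)
        >>> scores = clf.decision_scores_   # higher = further from the reference subspace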
82 | """ 83 | 84 | def __init__(self, contamination=0.1, n_neighbors=20, ref_set=10, 85 | alpha=0.8): 86 | super(SOD, self).__init__(contamination=contamination) 87 | if isinstance(n_neighbors, int): 88 | check_parameter(n_neighbors, low=1, param_name='n_neighbors') 89 | else: 90 | raise ValueError( 91 | "n_neighbors should be int. Got %s" % type(n_neighbors)) 92 | 93 | if isinstance(ref_set, int): 94 | check_parameter(ref_set, low=1, high=n_neighbors, 95 | param_name='ref_set') 96 | else: 97 | raise ValueError("ref_set should be int. Got %s" % type(ref_set)) 98 | 99 | if isinstance(alpha, float): 100 | check_parameter(alpha, low=0.0, high=1.0, param_name='alpha') 101 | else: 102 | raise ValueError("alpha should be float. Got %s" % type(alpha)) 103 | 104 | self.n_neighbors = n_neighbors 105 | self.ref_set = ref_set 106 | self.alpha = alpha 107 | 108 | def fit(self, X, y=None): 109 | """Fit detector. y is ignored in unsupervised methods. 110 | 111 | Parameters 112 | ---------- 113 | X : numpy array of shape (n_samples, n_features) 114 | The input samples. 115 | 116 | y : Ignored 117 | Not used, present for API consistency by convention. 118 | 119 | Returns 120 | ------- 121 | self : object 122 | Fitted estimator. 123 | """ 124 | 125 | # validate inputs X and y (optional) 126 | X = check_array(X) 127 | self._set_n_classes(y) 128 | self.decision_scores_ = self.decision_function(X) 129 | self._process_decision_scores() 130 | 131 | return self 132 | 133 | def decision_function(self, X): 134 | """Predict raw anomaly score of X using the fitted detector. 135 | The anomaly score of an input sample is computed based on different 136 | detector algorithms. For consistency, outliers are assigned with 137 | larger anomaly scores. 138 | 139 | Parameters 140 | ---------- 141 | X : numpy array of shape (n_samples, n_features) 142 | The training input samples. Sparse matrices are accepted only 143 | if they are supported by the base estimator. 144 | 145 | Returns 146 | ------- 147 | anomaly_scores : numpy array of shape (n_samples,) 148 | The anomaly score of the input samples. 149 | """ 150 | return self._sod(X) 151 | 152 | def _snn(self, X): 153 | """This function is called internally to calculate the shared nearest 154 | neighbors (SNN). SNN is reported to be more robust than k nearest 155 | neighbors. 156 | 157 | Returns 158 | ------- 159 | snn_indices : numpy array of shape (n_shared_nearest_neighbors,) 160 | The indices of top k shared nearest neighbors for each observation. 161 | """ 162 | knn = NearestNeighbors(n_neighbors=self.n_neighbors) 163 | knn.fit(X) 164 | # Get the knn index 165 | ind = knn.kneighbors(return_distance=False) 166 | return _snn_imp(ind, self.ref_set) 167 | 168 | def _sod(self, X): 169 | """This function is called internally to perform subspace outlier 170 | detection algorithm. 171 | 172 | Returns 173 | ------- 174 | anomaly_scores : numpy array of shape (n_samples,) 175 | The anomaly score of the input samples. 
176 | """ 177 | ref_inds = self._snn(X) 178 | anomaly_scores = np.zeros(shape=(X.shape[0],)) 179 | for i in range(X.shape[0]): 180 | obs = X[i] 181 | ref = X[ref_inds[i,],] 182 | means = np.mean(ref, axis=0) # mean of each column 183 | # average squared distance of the reference to the mean 184 | var_total = np.sum(np.sum(np.square(ref - means))) / self.ref_set 185 | var_expect = self.alpha * var_total / X.shape[1] 186 | var_actual = np.var(ref, axis=0) # variance of each attribute 187 | var_inds = [1 if (j < var_expect) else 0 for j in var_actual] 188 | rel_dim = np.sum(var_inds) 189 | if rel_dim != 0: 190 | anomaly_scores[i] = np.sqrt( 191 | np.dot(var_inds, np.square(obs - means)) / rel_dim) 192 | 193 | return anomaly_scores 194 | -------------------------------------------------------------------------------- /additional_methods/wrappers/AE.py: -------------------------------------------------------------------------------- 1 | from pyod.models.auto_encoder import AutoEncoder 2 | import math 3 | 4 | class AE_wrapper(AutoEncoder): 5 | def __init__(self, n_layers=1, shrinkage_factor=0.3, **args): 6 | 7 | self.n_layers = n_layers 8 | self.shrinkage_factor = shrinkage_factor 9 | 10 | try: 11 | del args["encoder_neurons"] 12 | except KeyError: 13 | pass 14 | 15 | try: 16 | del args["decoder_neurons"] 17 | except KeyError: 18 | pass 19 | 20 | self.args = args 21 | 22 | def fit(self, X, y=None): 23 | 24 | n_features = X.shape[1] 25 | 26 | self.encoder_neurons = [math.ceil(n_features * (1-self.shrinkage_factor)**(i+1)) for i in range(self.n_layers)] 27 | 28 | self.decoder_neurons = list(reversed(self.encoder_neurons)) 29 | 30 | self.hidden_neurons = self.encoder_neurons + self.decoder_neurons 31 | 32 | super().__init__(hidden_neurons=self.hidden_neurons, **self.args) 33 | 34 | super().fit(X, y) -------------------------------------------------------------------------------- /additional_methods/wrappers/ALAD.py: -------------------------------------------------------------------------------- 1 | from pyod.models.alad import ALAD 2 | import math 3 | 4 | class ALAD_wrapper(ALAD): 5 | def __init__(self, n_layers=1, shrinkage_factor=0.3, **args): 6 | 7 | self.n_layers = n_layers 8 | self.shrinkage_factor = shrinkage_factor 9 | 10 | try: 11 | del args["encoder_neurons"] 12 | except KeyError: 13 | pass 14 | 15 | try: 16 | del args["decoder_neurons"] 17 | except KeyError: 18 | pass 19 | 20 | self.args = args 21 | 22 | def fit(self, X, y=None): 23 | 24 | n_features = X.shape[1] 25 | 26 | self.encoder_neurons = [math.ceil(n_features * (1-self.shrinkage_factor)**(i+1)) for i in range(self.n_layers)] 27 | 28 | self.decoder_neurons = list(reversed(self.encoder_neurons)) 29 | 30 | 31 | super().__init__(dec_layers=self.decoder_neurons, enc_layers=self.encoder_neurons, disc_xx_layers=self.encoder_neurons, disc_zz_layers=self.encoder_neurons, disc_xz_layers=self.encoder_neurons, **self.args) 32 | 33 | super().fit(X, y) -------------------------------------------------------------------------------- /additional_methods/wrappers/AnoGAN.py: -------------------------------------------------------------------------------- 1 | from pyod.models.anogan import AnoGAN 2 | import math 3 | 4 | class AnoGAN_wrapper(AnoGAN): 5 | def __init__(self, D_n_layers=1, G_n_layers=1, G_shrinkage_factor=0.3, D_shrinkage_factor=0.3, **args): 6 | 7 | self.D_n_layers = D_n_layers 8 | self.G_n_layers = G_n_layers 9 | self.D_shrinkage_factor = D_shrinkage_factor 10 | self.G_shrinkage_factor = G_shrinkage_factor 11 | 12 | try: 13 | del 
args["G_layers"] 14 | except KeyError: 15 | pass 16 | 17 | try: 18 | del args["D_layers"] 19 | except KeyError: 20 | pass 21 | 22 | self.args = args 23 | 24 | def fit(self, X, y=None): 25 | 26 | 27 | n_features = X.shape[1] 28 | 29 | self.G_encoder_neurons = [math.ceil(n_features * (1-self.G_shrinkage_factor)**(i+1)) for i in range(self.G_n_layers)] 30 | 31 | self.G_decoder_neurons = list(reversed(self.G_encoder_neurons)) 32 | 33 | self.G_layers = self.G_encoder_neurons + self.G_decoder_neurons 34 | 35 | self.D_layers = [math.ceil(n_features * (1-self.D_shrinkage_factor)**(i+1)) for i in range(self.D_n_layers)] 36 | 37 | super().__init__(G_layers=self.G_layers, D_layers=self.D_layers, **self.args) 38 | 39 | super().fit(X, y) -------------------------------------------------------------------------------- /additional_methods/wrappers/ExtendedIForest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 31 23:59:31 2022 4 | 5 | @author: Roel 6 | """ 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import eif as iso 11 | 12 | from pyod.models.base import BaseDetector 13 | 14 | 15 | from sklearn.utils import check_array 16 | 17 | 18 | class ExtendedIForest(BaseDetector): 19 | """ 20 | 21 | """ 22 | 23 | def __init__(self, n_estimators=100, 24 | max_samples=256, 25 | contamination=0.1, 26 | extension_level=1, 27 | verbose=0): 28 | super(ExtendedIForest, self).__init__(contamination=contamination) 29 | 30 | self.n_estimators = n_estimators 31 | self.max_samples = max_samples 32 | self.extension_level = extension_level 33 | 34 | 35 | def fit(self, X, y=None): 36 | """Fit detector. y is ignored in unsupervised methods. 37 | Parameters 38 | ---------- 39 | X : numpy array of shape (n_samples, n_features) 40 | The input samples. 41 | y : Ignored 42 | Not used, present for API consistency by convention. 43 | Returns 44 | ------- 45 | self : object 46 | Fitted estimator. 47 | """ 48 | # validate inputs X and y (optional) 49 | X = check_array(X) 50 | self._set_n_classes(y) 51 | 52 | max_samples = min(X.shape[0], self.max_samples) 53 | self.detector_ = iso.iForest(X, ntrees=self.n_estimators, sample_size=max_samples, ExtensionLevel=self.extension_level) 54 | 55 | 56 | self.decision_scores_ = self.decision_function(X) 57 | 58 | return self 59 | 60 | def decision_function(self, X): 61 | """Predict raw anomaly score of X using the fitted detector. 62 | The anomaly score of an input sample is computed based on different 63 | detector algorithms. For consistency, outliers are assigned with 64 | larger anomaly scores. 65 | Parameters 66 | ---------- 67 | X : numpy array of shape (n_samples, n_features) 68 | The training input samples. Sparse matrices are accepted only 69 | if they are supported by the base estimator. 70 | Returns 71 | ------- 72 | anomaly_scores : numpy array of shape (n_samples,) 73 | The anomaly score of the input samples. 74 | """ 75 | 76 | 77 | return self.detector_.compute_paths(X_in=X) 78 | 79 | 80 | @property 81 | def max_samples_(self): 82 | """The actual number of samples. 83 | Decorator for scikit-learn Isolation Forest attributes. 
84 | """ 85 | return self.detector_.max_samples_ -------------------------------------------------------------------------------- /additional_methods/wrappers/HBOS.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Dec 12 17:53:21 2023 5 | 6 | @author: rbouman 7 | """ 8 | 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | from ..HBOS.hbos import HBOS 13 | from sklearn.utils import check_array 14 | 15 | import pandas as pd 16 | 17 | 18 | from pyod.models.base import BaseDetector 19 | 20 | 21 | class DynamicHBOS(BaseDetector): 22 | """ 23 | 24 | """ 25 | 26 | def __init__(self, contamination=0.1): 27 | super(DynamicHBOS, self).__init__(contamination=contamination) 28 | 29 | 30 | 31 | def fit(self, X, y=None): 32 | """Fit detector. y is ignored in unsupervised methods. 33 | Parameters 34 | ---------- 35 | X : numpy array of shape (n_samples, n_features) 36 | The input samples. 37 | y : Ignored 38 | Not used, present for API consistency by convention. 39 | Returns 40 | ------- 41 | self : object 42 | Fitted estimator. 43 | """ 44 | # validate inputs X and y (optional) 45 | X = check_array(X) 46 | self._set_n_classes(y) 47 | 48 | self.detector_ = HBOS() 49 | 50 | self.detector_.fit(pd.DataFrame(X)) 51 | self.decision_scores_ = self.decision_function(X) 52 | 53 | return self 54 | 55 | def decision_function(self, X): 56 | """Predict raw anomaly score of X using the fitted detector. 57 | The anomaly score of an input sample is computed based on different 58 | detector algorithms. For consistency, outliers are assigned with 59 | larger anomaly scores. 60 | Parameters 61 | ---------- 62 | X : numpy array of shape (n_samples, n_features) 63 | The training input samples. Sparse matrices are accepted only 64 | if they are supported by the base estimator. 65 | Returns 66 | ------- 67 | anomaly_scores : numpy array of shape (n_samples,) 68 | The anomaly score of the input samples. 69 | """ 70 | 71 | 72 | return self.detector_.predict(pd.DataFrame(X)) 73 | 74 | 75 | @property 76 | def max_samples_(self): 77 | """The actual number of samples. 78 | Decorator for scikit-learn Isolation Forest attributes. 
79 | """ 80 | return self.detector_.max_samples_ -------------------------------------------------------------------------------- /additional_methods/wrappers/VAE.py: -------------------------------------------------------------------------------- 1 | from pyod.models.vae import VAE 2 | import math 3 | 4 | class VAE_wrapper(VAE): 5 | def __init__(self, n_layers=1, shrinkage_factor=0.3, **args): 6 | 7 | self.n_layers = n_layers 8 | self.shrinkage_factor = shrinkage_factor 9 | 10 | try: 11 | del args["encoder_neurons"] 12 | except KeyError: 13 | pass 14 | 15 | try: 16 | del args["decoder_neurons"] 17 | except KeyError: 18 | pass 19 | 20 | self.args = args 21 | 22 | def fit(self, X, y=None): 23 | 24 | n_features = X.shape[1] 25 | 26 | self.encoder_neurons = [math.ceil(n_features * (1-self.shrinkage_factor)**(i+1)) for i in range(self.n_layers)] 27 | 28 | self.decoder_neurons = list(reversed(self.encoder_neurons)) 29 | 30 | super().__init__(encoder_neurons=self.encoder_neurons, decoder_neurons=self.decoder_neurons, **self.args) 31 | 32 | super().fit(X, y) -------------------------------------------------------------------------------- /additional_methods/wrappers/rrcf.py: -------------------------------------------------------------------------------- 1 | from pyod.models.base import BaseDetector 2 | import rrcf 3 | import numpy as np 4 | import pandas as pd 5 | 6 | class rrcf_wrapper(): 7 | def __init__(self, n_trees, tree_size): 8 | 9 | self.n_trees = n_trees 10 | self.tree_size = tree_size 11 | 12 | # based on example batch code from: https://github.com/kLabUM/rrcf 13 | def fit(self, X, y=None): 14 | 15 | n = X.shape[0] 16 | 17 | tree_size = min(self.tree_size, n) 18 | 19 | forest = [] 20 | 21 | if self.n_trees * tree_size < n: 22 | self.n_trees = np.ceil(n / tree_size) #increase n_trees if not all samples are covered. 
23 | while len(forest) < self.n_trees: 24 | # Select random subsets of points uniformly from point set 25 | ixs = np.random.choice(n, size=(n//tree_size, tree_size), 26 | replace=False) 27 | # Add sampled trees to forest 28 | trees = [rrcf.RCTree(X[ix], index_labels=ix) for ix in ixs] 29 | forest.extend(trees) 30 | 31 | 32 | # Compute average CoDisp 33 | avg_codisp = pd.Series(0.0, index=np.arange(n)) 34 | index = np.zeros(n) 35 | for tree in forest: 36 | codisp = pd.Series({leaf : tree.codisp(leaf) for leaf in tree.leaves}) 37 | avg_codisp[codisp.index] += codisp 38 | np.add.at(index, codisp.index.values, 1) 39 | avg_codisp /= index 40 | 41 | self.decision_scores_ = avg_codisp -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: OD_benchmark 2 | dependencies: 3 | - python=3.8 4 | - numpy 5 | - pandas 6 | - scikit-learn=1.0.2 7 | - matplotlib 8 | - seaborn 9 | - scipy 10 | - cvxopt 11 | - pytorch 12 | - pip 13 | - pip: 14 | - cython 15 | - tensorflow 16 | - pyod 17 | - eif 18 | - combo 19 | - tqdm 20 | - rrcf 21 | -------------------------------------------------------------------------------- /evaluation_metrics.py: -------------------------------------------------------------------------------- 1 | from pyod.utils.utility import get_label_n #precision_n_scores with n=None is equal to the R-precision measure 2 | from sklearn.utils import column_or_1d 3 | from sklearn.metrics import precision_score, average_precision_score 4 | import numpy as np 5 | 6 | #copied from pyod, but changed default behaviour of precision_score warnings when y_pred is all zeroes 7 | def precision_n_scores(y, y_pred, n=None): 8 | """Utility function to calculate precision @ rank n. 9 | 10 | Parameters 11 | ---------- 12 | y : list or numpy array of shape (n_samples,) 13 | The ground truth. Binary (0: inliers, 1: outliers). 14 | 15 | y_pred : list or numpy array of shape (n_samples,) 16 | The raw outlier scores as returned by a fitted model. 17 | 18 | n : int, optional (default=None) 19 | The number of outliers. if not defined, infer using ground truth. 20 | 21 | Returns 22 | ------- 23 | precision_at_rank_n : float 24 | Precision at rank n score. 
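        For example, with ``y = [0, 0, 0, 1, 1]``, raw scores
        ``y_pred = [0.1, 0.2, 0.3, 0.9, 0.8]`` and ``n=None``, n is inferred as 2,
        the two highest-scoring samples are exactly the two true outliers, and the
        returned precision at rank 2 is 1.0.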
25 | 26 | """ 27 | 28 | # turn raw prediction decision scores into binary labels 29 | y_pred = get_label_n(y, y_pred, n) 30 | 31 | # enforce formats of y and labels_ 32 | y = column_or_1d(y) 33 | y_pred = column_or_1d(y_pred) 34 | 35 | return precision_score(y, y_pred, zero_division=0) 36 | 37 | def adjusted_precision_n_scores(y_true, y_pred, n=None): 38 | 39 | p_at_n = precision_n_scores(y_true, y_pred, n=n) 40 | 41 | # calculate the percentage of outliers 42 | if n is not None: 43 | outliers_fraction = n /len(y_true) 44 | else: 45 | outliers_fraction = np.count_nonzero(y_true) / len(y_true) 46 | 47 | adjusted_p_at_n = (p_at_n - outliers_fraction)/(1 - outliers_fraction) 48 | 49 | return(adjusted_p_at_n) 50 | 51 | def adjusted_average_precision(y_true, y_pred): 52 | 53 | ap = average_precision_score(y_true, y_pred) 54 | 55 | # calculate the percentage of outliers 56 | outliers_fraction = np.count_nonzero(y_true) / len(y_true) 57 | 58 | adjusted_average_precision = (ap - outliers_fraction)/(1 - outliers_fraction) 59 | 60 | return(adjusted_average_precision) 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /figures/.gitignore: -------------------------------------------------------------------------------- 1 | *.eps 2 | *.png 3 | *.pdf 4 | -------------------------------------------------------------------------------- /formatted_data/aloi.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/aloi.npz -------------------------------------------------------------------------------- /formatted_data/annthyroid.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/annthyroid.npz -------------------------------------------------------------------------------- /formatted_data/arrhythmia.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/arrhythmia.npz -------------------------------------------------------------------------------- /formatted_data/breastw.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/breastw.npz -------------------------------------------------------------------------------- /formatted_data/campaign.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/campaign.npz -------------------------------------------------------------------------------- /formatted_data/cardio.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/cardio.npz -------------------------------------------------------------------------------- /formatted_data/cover.npz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/cover.npz -------------------------------------------------------------------------------- /formatted_data/donors.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/donors.npz -------------------------------------------------------------------------------- /formatted_data/fault.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/fault.npz -------------------------------------------------------------------------------- /formatted_data/glass.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/glass.npz -------------------------------------------------------------------------------- /formatted_data/hepatitis.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/hepatitis.npz -------------------------------------------------------------------------------- /formatted_data/hrss_anomalous_optimized.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/hrss_anomalous_optimized.npz -------------------------------------------------------------------------------- /formatted_data/hrss_anomalous_standard.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/hrss_anomalous_standard.npz -------------------------------------------------------------------------------- /formatted_data/http.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/http.npz -------------------------------------------------------------------------------- /formatted_data/internetads.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/internetads.npz -------------------------------------------------------------------------------- /formatted_data/ionosphere.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/ionosphere.npz -------------------------------------------------------------------------------- /formatted_data/landsat.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/landsat.npz -------------------------------------------------------------------------------- /formatted_data/letter.npz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/letter.npz -------------------------------------------------------------------------------- /formatted_data/magic.gamma.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/magic.gamma.npz -------------------------------------------------------------------------------- /formatted_data/mammography.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/mammography.npz -------------------------------------------------------------------------------- /formatted_data/mi-f.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/mi-f.npz -------------------------------------------------------------------------------- /formatted_data/mi-v.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/mi-v.npz -------------------------------------------------------------------------------- /formatted_data/mnist.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/mnist.npz -------------------------------------------------------------------------------- /formatted_data/musk.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/musk.npz -------------------------------------------------------------------------------- /formatted_data/nasa.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/nasa.npz -------------------------------------------------------------------------------- /formatted_data/optdigits.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/optdigits.npz -------------------------------------------------------------------------------- /formatted_data/pageblocks.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/pageblocks.npz -------------------------------------------------------------------------------- /formatted_data/parkinson.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/parkinson.npz -------------------------------------------------------------------------------- /formatted_data/pen-global.npz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/pen-global.npz -------------------------------------------------------------------------------- /formatted_data/pen-local.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/pen-local.npz -------------------------------------------------------------------------------- /formatted_data/pendigits.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/pendigits.npz -------------------------------------------------------------------------------- /formatted_data/pima.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/pima.npz -------------------------------------------------------------------------------- /formatted_data/satellite.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/satellite.npz -------------------------------------------------------------------------------- /formatted_data/satimage-2.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/satimage-2.npz -------------------------------------------------------------------------------- /formatted_data/seismic-bumps.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/seismic-bumps.npz -------------------------------------------------------------------------------- /formatted_data/shuttle.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/shuttle.npz -------------------------------------------------------------------------------- /formatted_data/skin.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/skin.npz -------------------------------------------------------------------------------- /formatted_data/smtp.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/smtp.npz -------------------------------------------------------------------------------- /formatted_data/spambase.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/spambase.npz -------------------------------------------------------------------------------- /formatted_data/speech.npz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/speech.npz -------------------------------------------------------------------------------- /formatted_data/stamps.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/stamps.npz -------------------------------------------------------------------------------- /formatted_data/thyroid.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/thyroid.npz -------------------------------------------------------------------------------- /formatted_data/vertebral.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/vertebral.npz -------------------------------------------------------------------------------- /formatted_data/vowels.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/vowels.npz -------------------------------------------------------------------------------- /formatted_data/waveform.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/waveform.npz -------------------------------------------------------------------------------- /formatted_data/wbc.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/wbc.npz -------------------------------------------------------------------------------- /formatted_data/wbc2.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/wbc2.npz -------------------------------------------------------------------------------- /formatted_data/wilt.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/wilt.npz -------------------------------------------------------------------------------- /formatted_data/wine.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/wine.npz -------------------------------------------------------------------------------- /formatted_data/wpbc.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/wpbc.npz -------------------------------------------------------------------------------- /formatted_data/yeast.npz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/yeast.npz -------------------------------------------------------------------------------- /formatted_data/yeast6.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/formatted_data/yeast6.npz -------------------------------------------------------------------------------- /generate_and_plot_types_of_anomalies.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import numpy as np 3 | import seaborn as sns 4 | import matplotlib.pyplot as plt 5 | 6 | sns.set_style("white") 7 | 8 | 9 | #%% peripheral point 10 | 11 | # Peripheral point plot 12 | distribution_1 = np.random.randn(1000, 2) + np.array([1, 1]) 13 | anomalies_1 = np.array([[-3, 5], [1, -3.5], [-4, -3]], dtype=np.float32) 14 | 15 | fig, axes = plt.subplots(1, 2, figsize=(18, 6)) 16 | 17 | sns.scatterplot(x=distribution_1[:, 0], y=distribution_1[:, 1], color="blue", alpha=0.2, ax=axes[0]) 18 | sns.scatterplot(x=anomalies_1[:, 0], y=anomalies_1[:, 1], marker="X", color="red", s=100, ax=axes[0]) 19 | axes[0].set_xlim(-6, 10) 20 | axes[0].set_ylim(-6, 10) 21 | axes[0].set_xlabel("X$_1$") 22 | axes[0].set_ylabel("X$_2$") 23 | axes[0].set_title("Peripheral Anomalies", fontsize=20) 24 | 25 | # Enclosed point plot 26 | distribution_1 = np.random.randn(1000, 2) + np.array([-4, -4]) 27 | distribution_2 = np.random.randn(1000, 2) + np.array([-4, 4]) 28 | distribution_3 = np.random.randn(1000, 2) + np.array([4, -4]) 29 | distribution_4 = np.random.randn(1000, 2) + np.array([4, 4]) 30 | 31 | anomalies_1 = np.array([[0, 0], [-0.5, 0.3], [0.4, -0.7]], dtype=np.float32) 32 | 33 | sns.scatterplot(x=distribution_1[:, 0], y=distribution_1[:, 1], color="blue", alpha=0.2, ax=axes[1]) 34 | sns.scatterplot(x=distribution_2[:, 0], y=distribution_2[:, 1], color="blue", alpha=0.2, ax=axes[1]) 35 | sns.scatterplot(x=distribution_3[:, 0], y=distribution_3[:, 1], color="blue", alpha=0.2, ax=axes[1]) 36 | sns.scatterplot(x=distribution_4[:, 0], y=distribution_4[:, 1], color="blue", alpha=0.2, ax=axes[1]) 37 | sns.scatterplot(x=anomalies_1[:, 0], y=anomalies_1[:, 1], marker="X", color="red", s=100, ax=axes[1]) 38 | axes[1].set_xlim(-8, 8) 39 | axes[1].set_ylim(-8, 8) 40 | axes[1].set_xlabel("X$_1$") 41 | axes[1].set_ylabel("X$_2$") 42 | axes[1].set_title("Enclosed Anomalies", fontsize=20) 43 | 44 | # Adjust layout to prevent clipping of titles 45 | plt.tight_layout() 46 | 47 | # Show the plot 48 | 49 | 50 | fig.savefig("figures/enclosed-peripheral_point_example.pdf", format="pdf") 51 | 52 | plt.show() 53 | #%% local_outlier_plot 54 | 55 | # Local outlier plot 56 | local_outlier_plot = plt.figure() 57 | 58 | distribution_1 = np.random.randn(1000, 2) + np.array([1, 1]) 59 | distribution_2 = np.random.randn(1000, 2) / 5 + np.array([7, 7]) 60 | 61 | anomalies = np.array([[6.2, 6.5], [7.2, 8], [7.9, 6.3]]) 62 | 63 | fig, axes = plt.subplots(1, 2, figsize=(18, 6)) 64 | 65 | sns.scatterplot(x=distribution_1[:, 0], y=distribution_1[:, 1], color="blue", alpha=0.2, ax=axes[0]) 66 | sns.scatterplot(x=distribution_2[:, 0], y=distribution_2[:, 1], color="blue", alpha=0.2, ax=axes[0]) 67 | sns.scatterplot(x=anomalies[:, 0], y=anomalies[:, 1], marker="X", color="red", s=100, ax=axes[0]) 68 | 
axes[0].set_xlim(-6, 10) 69 | axes[0].set_ylim(-6, 10) 70 | axes[0].set_xlabel("X$_1$") 71 | axes[0].set_ylabel("X$_2$") 72 | axes[0].set_title("Local Density Anomalies", fontsize=20) 73 | 74 | # Global outlier plot 75 | anomalies = np.array([[8, 0], [7.5, 1]]) 76 | 77 | sns.scatterplot(x=distribution_1[:, 0], y=distribution_1[:, 1], color="blue", alpha=0.2, ax=axes[1]) 78 | sns.scatterplot(x=distribution_2[:, 0], y=distribution_2[:, 1], color="blue", alpha=0.2, ax=axes[1]) 79 | sns.scatterplot(x=anomalies[:, 0], y=anomalies[:, 1], marker="X", color="red", s=100, ax=axes[1]) 80 | axes[1].set_xlim(-6, 10) 81 | axes[1].set_ylim(-6, 10) 82 | axes[1].set_xlabel("X$_1$") 83 | axes[1].set_ylabel("X$_2$") 84 | axes[1].set_title("Global Density Anomalies", fontsize=20) 85 | 86 | # Adjust layout to prevent clipping of titles 87 | plt.tight_layout() 88 | 89 | # Show the plot 90 | 91 | 92 | fig.savefig("figures/global-local_outlier_example.pdf", format="pdf") 93 | 94 | plt.show() 95 | #%% clustered outliers 96 | 97 | # Clustered outliers plot 98 | local_outlier_plot = plt.figure() 99 | 100 | distribution_1 = np.random.randn(1000, 2) + np.array([1, 1]) 101 | anomalies_2 = np.random.randn(10, 2) / 5 + np.array([7, 7]) 102 | 103 | fig, axes = plt.subplots(1, 2, figsize=(18, 6)) 104 | 105 | sns.scatterplot(x=distribution_1[:, 0], y=distribution_1[:, 1], color="blue", alpha=0.2, ax=axes[0]) 106 | sns.scatterplot(x=anomalies_2[:, 0], y=anomalies_2[:, 1], marker="X", color="red", s=100, ax=axes[0]) 107 | axes[0].set_xlim(-6, 10) 108 | axes[0].set_ylim(-6, 10) 109 | axes[0].set_xlabel("X$_1$") 110 | axes[0].set_ylabel("X$_2$") 111 | axes[0].set_title("Clustered Anomalies", fontsize=20) 112 | 113 | # Isolated outliers plot 114 | anomalies = np.array([[7, 7], [-4, -4], [-4, 6]]) 115 | 116 | sns.scatterplot(x=distribution_1[:, 0], y=distribution_1[:, 1], color="blue", alpha=0.2, ax=axes[1]) 117 | sns.scatterplot(x=anomalies[:, 0], y=anomalies[:, 1], marker="X", color="red", s=100, ax=axes[1]) 118 | axes[1].set_xlim(-6, 10) 119 | axes[1].set_ylim(-6, 10) 120 | axes[1].set_xlabel("X$_1$") 121 | axes[1].set_ylabel("X$_2$") 122 | axes[1].set_title("Isolated Anomalies", fontsize=20) 123 | 124 | # Adjust layout to prevent clipping of titles 125 | plt.tight_layout() 126 | 127 | # Show the plot 128 | 129 | 130 | fig.savefig("figures/isolated-clustered_outlier_example.pdf", format="pdf") 131 | plt.show() 132 | #%% univariate outliers 133 | 134 | # Univariate outliers plot 135 | distribution_1 = np.random.multivariate_normal([0, 0], [[3, 0], [0, 1]], 1000) 136 | anomalies = np.array([[0, 6], [8, 0]]) 137 | 138 | fig, axes = plt.subplots(1, 2, figsize=(18, 6)) 139 | 140 | sns.scatterplot(x=distribution_1[:, 0], y=distribution_1[:, 1], color="blue", alpha=0.2, ax=axes[0]) 141 | sns.scatterplot(x=anomalies[:, 0], y=anomalies[:, 1], marker="X", color="red", s=100, ax=axes[0]) 142 | axes[0].set_xlim(-9, 9) 143 | axes[0].set_ylim(-4, 7) 144 | axes[0].set_xlabel("X$_1$") 145 | axes[0].set_ylabel("X$_2$") 146 | axes[0].set_title("Univariate Anomalies", fontsize=20) 147 | 148 | # Multivariate outliers plot 149 | distribution_1 = np.random.multivariate_normal([0, 0], [[0.5, 0], [4, 4]], 1000) 150 | anomalies = np.array([[-3, 3], [4, -2]]) 151 | 152 | sns.scatterplot(x=distribution_1[:, 0], y=distribution_1[:, 1], color="blue", alpha=0.2, ax=axes[1]) 153 | sns.scatterplot(x=anomalies[:, 0], y=anomalies[:, 1], marker="X", color="red", s=100, ax=axes[1]) 154 | axes[1].set_xlim(-7, 7) 155 | axes[1].set_ylim(-7, 7) 156 | 
axes[1].set_xlabel("X$_1$") 157 | axes[1].set_ylabel("X$_2$") 158 | axes[1].set_title("Multivariate Anomalies", fontsize=20) 159 | 160 | # Adjust layout to prevent clipping of titles 161 | plt.tight_layout() 162 | 163 | # Show the plot 164 | 165 | 166 | fig.savefig("figures/multivariate-univariate_outlier_example.pdf", format="pdf") 167 | plt.show() 168 | -------------------------------------------------------------------------------- /invert_labels_calculate_metrics.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import pickle 4 | import numpy as np 5 | 6 | from sklearn.metrics import roc_auc_score, average_precision_score 7 | from pyod.utils.utility import precision_n_scores 8 | from evaluation_metrics import adjusted_precision_n_scores, adjusted_average_precision 9 | 10 | 11 | #define score functions: 12 | score_functions = {"ROC/AUC": roc_auc_score, 13 | "R_precision": precision_n_scores, 14 | "adjusted_R_precision": adjusted_precision_n_scores, 15 | "average_precision": average_precision_score, 16 | "adjusted_average_precision": adjusted_average_precision} 17 | 18 | # In case anomaly detection problems are wrongly defined, labels can be switched in order to recalculate metrics 19 | 20 | # inverted datasets: 21 | # Skin originally has 1 being the skin pixel class, and 0 being the noise class. The skin class is, however, more homogeneous, so the labels should be flipped. 22 | # Vertebral consists of 3 classes: the normal class, and disk hernia/spondylolisthesis. The latter two classes are combined and originally defined as 0 in ODDS, but they are conceptually the anomalies. 23 | # yeast is poorly documented. We've replaced it with yeast6 from EOAD 24 | inverted_datasets = ["yeast", "skin", "vertebral"] 25 | 26 | pickle_dir = "formatted_data" 27 | score_dir = "results/score_dir" 28 | csv_result_dir = "results/csvresult_dir" 29 | result_dir = "results/result_dir" 30 | figure_dir = "figures" 31 | table_dir = "tables" 32 | 33 | #uncomment if new metrics need to be calculated: 34 | #all_datasets = set(os.listdir(result_dir)) 35 | 36 | #set inverted_datasets to all_datasets if new metrics need to be calculated 37 | for dataset in inverted_datasets: 38 | print(dataset) 39 | 40 | full_path_filename = os.path.join(pickle_dir, dataset+".npz") 41 | 42 | data = np.load(open(full_path_filename, 'rb')) 43 | X, y = data["X"], np.squeeze(data["y"]) 44 | 45 | #invert y: 46 | if dataset in inverted_datasets: 47 | y_inverted = np.zeros(y.shape) 48 | y_inverted[y==0] = 1 49 | y = y_inverted 50 | 51 | for method_name in os.listdir(os.path.join(score_dir, dataset)): 52 | print(method_name) 53 | score_folder_path = os.path.join(score_dir, dataset, method_name) 54 | 55 | hyperparameter_csvs = os.listdir(score_folder_path) 56 | hyperparameter_settings = [filename.replace(".csv", "") for filename in hyperparameter_csvs] 57 | 58 | results_per_setting = {} 59 | for hyperparameter_csv, hyperparameter_setting in zip(hyperparameter_csvs, hyperparameter_settings): 60 | print(hyperparameter_csv) 61 | full_path_filename = os.path.join(score_folder_path, hyperparameter_csv) 62 | 63 | outlier_scores = pd.read_csv(full_path_filename, header=None) 64 | 65 | method_performance = {method_name:{score_name: score_function(y,outlier_scores) for (score_name, score_function) in score_functions.items()}} 66 | method_performance_df = pd.DataFrame(method_performance).transpose() 67 | 68 | metric_pickle_file = os.path.join(result_dir, dataset, 
method_name, hyperparameter_csv.replace(".csv", ".pickle")) 69 | with open(metric_pickle_file, 'wb') as handle: 70 | pickle.dump(method_performance_df, handle, protocol=pickle.HIGHEST_PROTOCOL) 71 | 72 | metric_csv_file = os.path.join(csv_result_dir, dataset, method_name, hyperparameter_csv) 73 | 74 | #also write csv files for easy manual inspection 75 | method_performance_df.to_csv(metric_csv_file) 76 | -------------------------------------------------------------------------------- /method_example.py: -------------------------------------------------------------------------------- 1 | from preprocess_detect_outliers import preprocess_detect_outliers 2 | #%% Define parameter settings and methods 3 | 4 | 5 | from pyod.models.knn import KNN 6 | 7 | #dict of methods and functions 8 | methods = { 9 | "kNN":KNN 10 | } 11 | 12 | #dict of methods and parameters 13 | method_parameters = { 14 | "kNN":{"n_neighbors":range(5,31), "method":["mean"]} 15 | } 16 | 17 | 18 | #%% run method over all datasets 19 | 20 | preprocess_detect_outliers(methods, method_parameters) -------------------------------------------------------------------------------- /minimal_environment.yml: -------------------------------------------------------------------------------- 1 | name: OD_benchmark_minimal 2 | dependencies: 3 | - python=3.8 4 | - numpy 5 | - pandas 6 | - scikit-learn=1.0.2 7 | - matplotlib 8 | - seaborn 9 | - scipy 10 | - pip 11 | - pip: 12 | - pyod 13 | - combo 14 | -------------------------------------------------------------------------------- /preprocess_detect_outliers.py: -------------------------------------------------------------------------------- 1 | #%% setup 2 | import pickle 3 | import os 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.metrics import roc_auc_score, average_precision_score 7 | from pyod.utils.utility import precision_n_scores 8 | from sklearn.pipeline import make_pipeline 9 | from sklearn.preprocessing import RobustScaler 10 | from sklearn.model_selection import ParameterGrid 11 | from evaluation_metrics import adjusted_precision_n_scores, adjusted_average_precision 12 | 13 | 14 | formatted_data_dir = "formatted_data" 15 | base_result_dir = "results" 16 | result_dir = "result_dir" 17 | csvresult_dir = "csvresult_dir" 18 | score_dir = "score_dir" 19 | 20 | #define score function: 21 | score_functions = {"ROC/AUC": roc_auc_score, 22 | "R_precision": precision_n_scores, 23 | "adjusted_R_precision": adjusted_precision_n_scores, 24 | "average_precision": average_precision_score, 25 | "adjusted_average_precision": adjusted_average_precision} 26 | 27 | 28 | verbose = True 29 | input_type = "npz" 30 | 31 | #%% 32 | 33 | def preprocess_detect_outliers(methods, method_parameters, verbose=True, input_type="npz"): 34 | 35 | #sort dataset_names based on size: https://stackoverflow.com/questions/20252669/get-files-from-directory-argument-sorting-by-size 36 | # make a generator for all file paths within dirpath 37 | all_files = ( os.path.join(basedir, filename) for basedir, dirs, files in os.walk(formatted_data_dir) for filename in files ) 38 | sorted_files = sorted(all_files, key = os.path.getsize) 39 | dataset_names = [filename.replace(formatted_data_dir+os.path.sep,"") for filename in sorted_files] 40 | dataset_names = [dataset_name for dataset_name in dataset_names if dataset_name.endswith(input_type)] 41 | 42 | all_methods_to_run = methods 43 | 44 | #%% loop over all data, but do not reproduce existing results 45 | 46 | target_dir = os.path.join(base_result_dir, 
result_dir) 47 | target_csvdir = os.path.join(base_result_dir, csvresult_dir) 48 | score_csvdir = os.path.join(base_result_dir, score_dir) 49 | 50 | if not os.path.exists(score_csvdir): 51 | os.makedirs(score_csvdir) 52 | 53 | for dataset_name in dataset_names: 54 | 55 | #print name for reporting purposes 56 | print("______"+dataset_name+"______") 57 | 58 | full_path_filename = os.path.join(formatted_data_dir, dataset_name) 59 | 60 | if input_type == "pickle": 61 | data = pickle.load(open(full_path_filename, 'rb')) 62 | elif input_type == "npz": 63 | data = np.load(open(full_path_filename, 'rb')) 64 | 65 | X, y = data["X"], np.squeeze(data["y"]) 66 | 67 | #loop over all methods: 68 | 69 | for method_name, OD_class in all_methods_to_run.items(): 70 | print("-" + method_name) 71 | hyperparameter_grid = method_parameters[method_name] 72 | hyperparameter_list = list(ParameterGrid(hyperparameter_grid)) 73 | 74 | #loop over hyperparameter settings 75 | for hyperparameter_setting in hyperparameter_list: 76 | 77 | hyperparameter_string = str(hyperparameter_setting) 78 | 79 | if verbose: 80 | print(hyperparameter_string) 81 | 82 | #check whether results have been calculated 83 | full_target_dir = os.path.join(target_dir, dataset_name.replace("."+input_type, ""), method_name) 84 | target_file_name = os.path.join(target_dir, dataset_name.replace("."+input_type, ""), method_name, hyperparameter_string+".pickle") 85 | if os.path.exists(target_file_name) and os.path.getsize(target_file_name) > 0: 86 | if verbose: 87 | print(" results already calculated, skipping recalculation") 88 | else: 89 | 90 | OD_method = OD_class(**hyperparameter_setting) 91 | 92 | pipeline = make_pipeline(RobustScaler(), OD_method) 93 | 94 | pipeline.fit(X) 95 | 96 | outlier_scores = pipeline[1].decision_scores_ 97 | 98 | method_performance = {method_name:{score_name: score_function(y,outlier_scores) for (score_name, score_function) in score_functions.items()}} 99 | method_performance_df = pd.DataFrame(method_performance).transpose() 100 | 101 | os.makedirs(full_target_dir, exist_ok=True) 102 | with open(target_file_name, 'wb') as handle: 103 | pickle.dump(method_performance_df, handle, protocol=pickle.HIGHEST_PROTOCOL) 104 | 105 | #also write csv files for easy manual inspection 106 | full_target_csvdir = os.path.join(target_csvdir, dataset_name.replace("."+input_type, ""), method_name) 107 | os.makedirs(full_target_csvdir, exist_ok=True) 108 | target_csvfile_name = os.path.join(full_target_csvdir, hyperparameter_string+".csv") 109 | method_performance_df.to_csv(target_csvfile_name) 110 | 111 | full_target_scoredir = os.path.join(score_csvdir, dataset_name.replace("."+input_type, ""), method_name) 112 | os.makedirs(full_target_scoredir, exist_ok=True) 113 | target_scorefile_name = os.path.join(full_target_scoredir, hyperparameter_string+".csv") 114 | np.savetxt(target_scorefile_name, outlier_scores) 115 | 116 | 117 | -------------------------------------------------------------------------------- /raw_data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/.gitkeep -------------------------------------------------------------------------------- /raw_data/ADBench_data_raw/.gitkeep: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ADBench_data_raw/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/Annthyroid/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/Annthyroid/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/Arrhythmia/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/Arrhythmia/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/Cardiotocography/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/Cardiotocography/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/HeartDisease/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/HeartDisease/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/Hepatitis/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/Hepatitis/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/InternetAds/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/InternetAds/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/PageBlocks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/PageBlocks/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/Parkinson/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/Parkinson/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/Pima/.gitkeep: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/Pima/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/SpamBase/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/SpamBase/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/Stamps/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/Stamps/.gitkeep -------------------------------------------------------------------------------- /raw_data/ELKI_data_raw/Wilt/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ELKI_data_raw/Wilt/.gitkeep -------------------------------------------------------------------------------- /raw_data/GAAL_data_raw/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/GAAL_data_raw/.gitkeep -------------------------------------------------------------------------------- /raw_data/Goldstein_data_raw/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/Goldstein_data_raw/.gitkeep -------------------------------------------------------------------------------- /raw_data/ODDS_data_raw/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ODDS_data_raw/.gitkeep -------------------------------------------------------------------------------- /raw_data/ODDS_data_raw/categorical_variables_per_dataset.json: -------------------------------------------------------------------------------- 1 | { 2 | "lympho": [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] 3 | } -------------------------------------------------------------------------------- /raw_data/ODDS_data_raw/matfile_data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ODDS_data_raw/matfile_data/.gitkeep -------------------------------------------------------------------------------- /raw_data/ODDS_data_raw/other_data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/ODDS_data_raw/other_data/.gitkeep -------------------------------------------------------------------------------- /raw_data/extended_AE_data_raw/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/extended_AE_data_raw/.gitkeep 
-------------------------------------------------------------------------------- /raw_data/extended_AE_data_raw/CNC-kaggle/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RoelBouman/outlierdetection/556eb87f94c7ad1d6af25e2971852fd63091c60b/raw_data/extended_AE_data_raw/CNC-kaggle/.gitkeep -------------------------------------------------------------------------------- /tables/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | -------------------------------------------------------------------------------- /testnewmethods.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Dec 14 17:23:39 2023 5 | 6 | @author: rbouman 7 | """ 8 | 9 | from sklearn.datasets import load_breast_cancer 10 | 11 | from pyod.models.lmdd import LMDD 12 | from additional_methods.lmdd import LMDD as LMDD2 13 | 14 | import numpy as np 15 | import matplotlib.pyplot as plt 16 | 17 | import os 18 | 19 | from sklearn.metrics import roc_auc_score 20 | 21 | 22 | formatted_data_dir = "formatted_data" 23 | dataset_name = "wbc.npz" 24 | 25 | 26 | full_path_filename = os.path.join(formatted_data_dir, dataset_name) 27 | 28 | data = np.load(open(full_path_filename, 'rb')) 29 | 30 | X, y = data["X"], np.squeeze(data["y"]) 31 | 32 | #add duplicates to X and y: 33 | 34 | X = np.concatenate([X]*10) 35 | y = np.concatenate([y]*10) 36 | 37 | 38 | plt.figure() 39 | model = LMDD2(n_iter=5, dis_measure="aad") 40 | 41 | model.fit(X) 42 | 43 | dec_scores = model.decision_scores_ 44 | 45 | plt.hist(dec_scores) 46 | 47 | plt.show() 48 | 49 | print(roc_auc_score(y, dec_scores)) 50 | 51 | plt.figure() 52 | 53 | model2 = LMDD(n_iter=5, dis_measure="aad") 54 | 55 | model2.fit(X) 56 | 57 | dec_scores2 = model2.decision_scores_ 58 | 59 | plt.hist(dec_scores2) 60 | 61 | plt.show() 62 | 63 | print(roc_auc_score(y, dec_scores2)) --------------------------------------------------------------------------------
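A minimal end-to-end usage sketch tying the pieces above together. This is not a file in the repository: it assumes pyod is installed (see minimal_environment.yml), reuses formatted_data/wbc.npz, the RobustScaler-plus-detector pipeline pattern from preprocess_detect_outliers.py, and the metrics from evaluation_metrics.py; the kNN hyperparameters are purely illustrative.

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import roc_auc_score
from pyod.models.knn import KNN
from evaluation_metrics import adjusted_average_precision

# formatted .npz files store the feature matrix under "X" and the 0/1 outlier labels under "y"
data = np.load(open("formatted_data/wbc.npz", "rb"))
X, y = data["X"], np.squeeze(data["y"])

# same preprocessing + detection pattern as preprocess_detect_outliers.py (illustrative hyperparameters)
pipeline = make_pipeline(RobustScaler(), KNN(n_neighbors=10, method="mean"))
pipeline.fit(X)
outlier_scores = pipeline[1].decision_scores_

print("ROC/AUC:", roc_auc_score(y, outlier_scores))
print("adjusted average precision:", adjusted_average_precision(y, outlier_scores))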