├── pythresh ├── thresholds │ ├── __init__.py │ ├── zscore.py │ ├── mad.py │ ├── iqr.py │ ├── fwfm.py │ ├── fgd.py │ ├── thresh_utility.py │ ├── cpd.py │ ├── mtt.py │ ├── boot.py │ ├── regr.py │ └── yj.py ├── models │ ├── meta_model_GNB.pkl │ ├── meta_model_GNBC.pkl │ ├── meta_model_GNBM.pkl │ └── meta_model_LIN.pkl ├── __init__.py ├── version.py ├── test │ ├── test_conf.py │ ├── test_fastkde.py │ ├── test_rank.py │ ├── test_eb.py │ ├── test_yj.py │ ├── test_fgd.py │ ├── test_iqr.py │ ├── test_aucp.py │ ├── test_boot.py │ ├── test_fwfm.py │ ├── test_mcst.py │ ├── test_moll.py │ ├── test_wind.py │ ├── test_chau.py │ ├── test_mad.py │ ├── test_regr.py │ ├── test_mtt.py │ ├── test_zscore.py │ ├── test_decomp.py │ └── test_karch.py └── utils │ └── rank_utility.py ├── imgs └── All.png ├── requirements.txt ├── docs ├── figs │ ├── All.png │ ├── Comb1.png │ ├── Comb2.png │ ├── Conf1.png │ ├── Conf2.png │ ├── Multi1.png │ ├── Multi2.png │ ├── Rank1.png │ ├── Rank2.png │ ├── Rank3.png │ ├── Rank4.png │ ├── Rank5.png │ ├── Rank6.png │ ├── Rank7.png │ ├── KNN_KARCH.png │ ├── Overpred.png │ ├── Benchmark1.png │ ├── Benchmark2.png │ ├── Randomness.png │ └── Overpred_best.png ├── command.txt ├── rebuild.bat ├── requirements.txt ├── pythresh.rst ├── Makefile ├── pythresh.utils.rst ├── tables │ ├── TimeComplexity.csv │ ├── RankingCorr.csv │ └── Benchmark2.csv ├── make.bat ├── install.rst └── api_cc.rst ├── notebooks └── data │ ├── musk.mat │ ├── pima.mat │ ├── wbc.mat │ ├── cardio.mat │ ├── glass.mat │ ├── letter.mat │ ├── lympho.mat │ ├── mnist.mat │ ├── vowels.mat │ ├── optdigits.mat │ ├── pendigits.mat │ ├── satellite.mat │ ├── shuttle.mat │ ├── vertebral.mat │ ├── arrhythmia.mat │ ├── ionosphere.mat │ ├── satimage-2.mat │ └── README.md ├── setup.cfg ├── requirements-test.txt ├── MANIFEST.in ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── python-package.yml ├── .readthedocs.yaml ├── .codeclimate.yml ├── LICENSE ├── .pre-commit-config.yaml ├── examples ├── eb_example.py ├── boot_example.py ├── clf_example.py ├── cpd_example.py ├── fgd_example.py ├── meta_example.py ├── zscore_example.py ├── chau_example.py ├── clust_example.py ├── dsn_example.py ├── gamgmm_example.py ├── hist_example.py ├── iqr_example.py ├── mad_example.py ├── moll_example.py ├── regr_example.py ├── yj_example.py ├── fwfm_example.py ├── karch_example.py ├── mcst_example.py ├── mixmod_example.py ├── mtt_example.py ├── wind_example.py ├── filter_example.py ├── qmcd_example.py ├── ocsvm_example.py ├── aucp_example.py ├── vae_example.py ├── decomp_example.py └── gesd_example.py ├── setup.py └── .gitignore /pythresh/thresholds/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /imgs/All.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/imgs/All.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.13 2 | pyod 3 | scikit-learn>=0.20.0 4 | scipy>=1.3.1 5 | -------------------------------------------------------------------------------- /docs/figs/All.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/All.png 
-------------------------------------------------------------------------------- /docs/figs/Comb1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Comb1.png -------------------------------------------------------------------------------- /docs/figs/Comb2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Comb2.png -------------------------------------------------------------------------------- /docs/figs/Conf1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Conf1.png -------------------------------------------------------------------------------- /docs/figs/Conf2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Conf2.png -------------------------------------------------------------------------------- /docs/figs/Multi1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Multi1.png -------------------------------------------------------------------------------- /docs/figs/Multi2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Multi2.png -------------------------------------------------------------------------------- /docs/figs/Rank1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Rank1.png -------------------------------------------------------------------------------- /docs/figs/Rank2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Rank2.png -------------------------------------------------------------------------------- /docs/figs/Rank3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Rank3.png -------------------------------------------------------------------------------- /docs/figs/Rank4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Rank4.png -------------------------------------------------------------------------------- /docs/figs/Rank5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Rank5.png -------------------------------------------------------------------------------- /docs/figs/Rank6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Rank6.png -------------------------------------------------------------------------------- /docs/figs/Rank7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Rank7.png -------------------------------------------------------------------------------- /docs/figs/KNN_KARCH.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/KNN_KARCH.png -------------------------------------------------------------------------------- /docs/figs/Overpred.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Overpred.png -------------------------------------------------------------------------------- /notebooks/data/musk.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/musk.mat -------------------------------------------------------------------------------- /notebooks/data/pima.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/pima.mat -------------------------------------------------------------------------------- /notebooks/data/wbc.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/wbc.mat -------------------------------------------------------------------------------- /docs/figs/Benchmark1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Benchmark1.png -------------------------------------------------------------------------------- /docs/figs/Benchmark2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Benchmark2.png -------------------------------------------------------------------------------- /docs/figs/Randomness.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Randomness.png -------------------------------------------------------------------------------- /notebooks/data/cardio.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/cardio.mat -------------------------------------------------------------------------------- /notebooks/data/glass.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/glass.mat -------------------------------------------------------------------------------- /notebooks/data/letter.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/letter.mat -------------------------------------------------------------------------------- /notebooks/data/lympho.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/lympho.mat -------------------------------------------------------------------------------- /notebooks/data/mnist.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/mnist.mat -------------------------------------------------------------------------------- /notebooks/data/vowels.mat: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/vowels.mat -------------------------------------------------------------------------------- /docs/figs/Overpred_best.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Overpred_best.png -------------------------------------------------------------------------------- /notebooks/data/optdigits.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/optdigits.mat -------------------------------------------------------------------------------- /notebooks/data/pendigits.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/pendigits.mat -------------------------------------------------------------------------------- /notebooks/data/satellite.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/satellite.mat -------------------------------------------------------------------------------- /notebooks/data/shuttle.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/shuttle.mat -------------------------------------------------------------------------------- /notebooks/data/vertebral.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/vertebral.mat -------------------------------------------------------------------------------- /notebooks/data/arrhythmia.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/arrhythmia.mat -------------------------------------------------------------------------------- /notebooks/data/ionosphere.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/ionosphere.mat -------------------------------------------------------------------------------- /notebooks/data/satimage-2.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/satimage-2.mat -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description_file = README.rst 3 | 4 | [egg_info] 5 | tag_build = 6 | tag_date = 0 7 | -------------------------------------------------------------------------------- /pythresh/models/meta_model_GNB.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/pythresh/models/meta_model_GNB.pkl -------------------------------------------------------------------------------- /pythresh/models/meta_model_GNBC.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/pythresh/models/meta_model_GNBC.pkl 
-------------------------------------------------------------------------------- /pythresh/models/meta_model_GNBM.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/pythresh/models/meta_model_GNBM.pkl -------------------------------------------------------------------------------- /pythresh/models/meta_model_LIN.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/pythresh/models/meta_model_LIN.pkl -------------------------------------------------------------------------------- /pythresh/__init__.py: -------------------------------------------------------------------------------- 1 | from . import thresholds 2 | 3 | # TODO: add version information here 4 | 5 | __all__ = ['thresholds'] 6 | -------------------------------------------------------------------------------- /docs/command.txt: -------------------------------------------------------------------------------- 1 | set SPHINX_APIDOC_OPTIONS=members,undoc-members,show-inheritance,inherited-members 2 | sphinx-apidoc -o rst /home/denmark/Documents/pythresh/ 3 | -------------------------------------------------------------------------------- /docs/rebuild.bat: -------------------------------------------------------------------------------- 1 | REM rebuild docs shortcut 2 | REM only works for Windows 3 | cd.. 4 | xcopy examples\*.png docs\figs /Y 5 | cd docs 6 | call make clean 7 | call make html 8 | -------------------------------------------------------------------------------- /pythresh/version.py: -------------------------------------------------------------------------------- 1 | """``pythresh`` is a python toolbox for outlier detection thresholding.""" 2 | # Based on NiLearn package 3 | # License: simplified BSD 4 | 5 | __version__ = '1.0.2' # pragma: no cover 6 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | docutils 3 | joblib 4 | numpy 5 | pandas 6 | pyclustering 7 | pyod 8 | ruptures 9 | scikit-learn 10 | scikit-lego 11 | scipy 12 | sphinx-datatables 13 | sphinx-rtd-theme 14 | sphinxcontrib-bibtex 15 | torch 16 | tqdm 17 | xgboost 18 | -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | #ruptures 2 | git+https://github.com/deepcharles/ruptures.git 3 | #pyclustering 4 | https://github.com/KulikDM/pyclustering/archive/Warning-Fix.zip 5 | joblib>=0.14.1 6 | numpy 7 | pandas 8 | pyod 9 | scikit-learn>=0.20.0 10 | scikit-lego 11 | scipy>=1.3.1 12 | setuptools>=65.5.1 13 | torch 14 | tqdm 15 | xgboost>=2.0.0,<2.1.0 16 | -------------------------------------------------------------------------------- /notebooks/data/README.md: -------------------------------------------------------------------------------- 1 | All datasets stored in this folder are downloaded from 2 | **Outlier Detection DataSets (ODDS)**: http://odds.cs.stonybrook.edu/#table1 3 | 4 | If you use any data here, please see ODDS' citation policy: 5 | 6 | Shebuti Rayana (2016). ODDS Library \[http://odds.cs.stonybrook.edu\]. Stony Brook, NY: Stony Brook University, Department of Computer Science. 
7 | -------------------------------------------------------------------------------- /docs/pythresh.rst: -------------------------------------------------------------------------------- 1 | ############### 2 | API Reference 3 | ############### 4 | 5 | .. toctree:: 6 | 7 | pythresh.thresholds 8 | pythresh.utils 9 | 10 | ***************** 11 | Module contents 12 | ***************** 13 | 14 | .. automodule:: pythresh 15 | :members: 16 | :exclude-members: __version__ 17 | :undoc-members: 18 | :show-inheritance: 19 | :inherited-members: 20 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | prune .github 2 | prune docs 3 | prune imgs 4 | prune examples 5 | prune notebooks 6 | prune pythresh/test 7 | exclude .readthedocs.yaml 8 | exclude .gitignore 9 | exclude .codeclimate.yml 10 | exclude .pre-commit-config.yaml 11 | exclude CHANGES.txt 12 | exclude requirements-test.txt 13 | include README.rst 14 | include requirements.txt 15 | include pythresh/utils/* 16 | include pythresh/models/meta_model_GNB.pkl 17 | include pythresh/models/meta_model_GNBC.pkl 18 | include pythresh/models/meta_model_GNBM.pkl 19 | include pythresh/models/meta_model_LIN.pkl 20 | include pythresh/models/rank_model_XGB.json 21 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = pythresh 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | --- 8 | 9 | **Is your feature request related to a problem? Please describe.** 10 | A clear and concise description of what the problem is. Ex. I'm always frustrated when \[...\] 11 | 12 | **Describe the solution you'd like** 13 | A clear and concise description of what you want to happen. 14 | 15 | **Describe alternatives you've considered** 16 | A clear and concise description of any alternative solutions or features you've considered. 17 | 18 | **Additional context** 19 | Add any other context or screenshots about the feature request here. 20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | --- 8 | 9 | **Describe the bug** 10 | A clear and concise description of what the bug is. 
11 | 12 | **To Reproduce** 13 | Steps to reproduce the behavior: 14 | 15 | **Expected behavior** 16 | A clear and concise description of what you expected to happen. 17 | 18 | **Screenshots** 19 | If applicable, add screenshots to help explain your problem. 20 | 21 | **Desktop (please complete the following information):** 22 | 23 | - OS: \[e.g. Windows\] 24 | - Version \[e.g. 0.2.6\] 25 | 26 | **Additional context** 27 | Add any other context about the problem here. 28 | -------------------------------------------------------------------------------- /docs/pythresh.utils.rst: -------------------------------------------------------------------------------- 1 | ################### 2 | Utility Functions 3 | ################### 4 | 5 | **************************** 6 | pythresh.utils.rank module 7 | **************************** 8 | 9 | .. automodule:: pythresh.utils.rank 10 | :members: 11 | :exclude-members: _cdf_metric, _clust_metric, _consensus_metric, _equi_rank, _equi_sort 12 | :undoc-members: 13 | :show-inheritance: 14 | :inherited-members: 15 | 16 | **************************** 17 | pythresh.utils.conf module 18 | **************************** 19 | 20 | .. automodule:: pythresh.utils.conf 21 | :members: 22 | :exclude-members: _valid_thresh, _invalid_thresh 23 | :undoc-members: 24 | :show-inheritance: 25 | :inherited-members: 26 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.10" 13 | # You can also specify other tool versions: 14 | # nodejs: "19" 15 | # rust: "1.64" 16 | # golang: "1.19" 17 | 18 | # Build documentation in the docs/ directory with Sphinx 19 | sphinx: 20 | configuration: docs/conf.py 21 | 22 | # If using Sphinx, optionally build your docs in additional formats such as PDF 23 | formats: 24 | - pdf 25 | 26 | # Optionally declare the Python requirements required to build your docs 27 | python: 28 | install: 29 | - requirements: docs/requirements.txt 30 | -------------------------------------------------------------------------------- /docs/tables/TimeComplexity.csv: -------------------------------------------------------------------------------- 1 | Method,Complexity,Big-O Notation 2 | AUCP,Quadratic,~1e-8*n^2 3 | BOOT,Quadratic,~1e-8*n^2 4 | CHAU,Linear,~1e-8*n 5 | CLF,Quadratic,~1e-8*n^2 6 | CLUST,Quadratic,~1e-8*n^2 7 | CPD,Quadratic,~1e-8*n^2 8 | DECOMP,Linear,~1e-4*n 9 | DSN,Linear,~1e-4*n 10 | EB,Linearithmic,~1e-6*n*log(n) 11 | FGD,Linearithmic,~1e-5*n*log(n) 12 | FILTER,Quadratic,~1e-11*n^2 13 | FWFM,Linearithmic,~1e-5*n*log(n) 14 | GAMGMM,Quadratic,~1e-6*n^2 15 | GESD,Quadratic,~1e-9*n^2 16 | HIST,Linear,~1e-8*n 17 | IQR,Linear,~1e-8*n 18 | KARCH,Linearithmic,~1e-5*n*log(n) 19 | MAD,Linear,~1e-8*n 20 | MCST,Quadratic,~1e-7*n^2 21 | META,Cubic,~1e-12*n^3 22 | MIXMOD,Linear,~1e-3*n 23 | MOLL,Linearithmic,~1e-7*n*log(n) 24 | MTT,Quadratic,~1e-10*n^2 25 | OCSVM,Linear,~1e-7*n 26 | QMCD,Quadratic,~1e-9*n^2 27 | REGR,Quadratic,~1e-8*n^2 28 | VAE,Linear,~1e-3*n 29 | WIND,Linear,~1e-4*n 30 | YJ,Linear,~1e-4*n 31 | ZSCORE,Linear,~1e-8*n 32 | --------------------------------------------------------------------------------
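The Big-O Notation column in TimeComplexity.csv above gives approximate empirical runtimes as a function of the number of scores n. A minimal sketch of how these formulas can be read, assuming the fitted constants are in seconds (this snippet is illustrative and not a file in the repository):

```python
import math

# Approximate empirical runtime formulas from docs/tables/TimeComplexity.csv
# (assumption: the fitted constants are in seconds for n input scores)
runtime = {
    'ZSCORE': lambda n: 1e-8 * n,            # linear
    'EB': lambda n: 1e-6 * n * math.log(n),  # linearithmic
    'AUCP': lambda n: 1e-8 * n ** 2,         # quadratic
    'META': lambda n: 1e-12 * n ** 3,        # cubic
}

for name, est in runtime.items():
    # estimated runtime for 100,000 outlier likelihood scores
    print(f'{name}: ~{est(100_000):.3f} s')
```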
/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=pythresh 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /.codeclimate.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | checks: 3 | argument-count: 4 | enabled: true 5 | config: 6 | threshold: 4 7 | complex-logic: 8 | enabled: true 9 | config: 10 | threshold: 4 11 | file-lines: 12 | enabled: true 13 | config: 14 | threshold: 250 15 | method-complexity: 16 | enabled: true 17 | config: 18 | threshold: 5 19 | method-count: 20 | enabled: true 21 | config: 22 | threshold: 20 23 | method-lines: 24 | enabled: true 25 | config: 26 | threshold: 25 27 | nested-control-flow: 28 | enabled: true 29 | config: 30 | threshold: 4 31 | return-statements: 32 | enabled: true 33 | config: 34 | threshold: 4 35 | similar-code: 36 | enabled: false 37 | identical-code: 38 | enabled: false 39 | plugins: 40 | bandit: 41 | enabled: true 42 | radon: 43 | enabled: true 44 | sonar-python: 45 | enabled: true 46 | config: 47 | minimum_severity: major 48 | tests_patterns: 49 | - pythresh/test/** 50 | exclude_patterns: 51 | - "examples/" 52 | - "**/test/" 53 | -------------------------------------------------------------------------------- /docs/tables/RankingCorr.csv: -------------------------------------------------------------------------------- 1 | Label,Mean,Median,Standard Deviation 2 | CAL,0.1872,0.2027,0.4303 3 | DAV,-0.1257,-0.0474,0.4523 4 | SIL,0.1262,0.0589,0.4744 5 | BH,0.0039,0.0391,0.4836 6 | BR,0.0260,0.0300,0.4750 7 | CAL_sc,0.0158,0.0111,0.5083 8 | DAV_sc,-0.0483,-0.0944,0.5230 9 | DR,-0.0157,-0.0111,0.5084 10 | Dunn,0.0270,0.0207,0.5353 11 | Hubert,0.0641,0.1632,0.4674 12 | Iind,0.0648,0.1527,0.4763 13 | MR,0.0789,0.1710,0.4973 14 | PB,0.0367,0.0715,0.5089 15 | RL,-0.0314,-0.0569,0.5081 16 | RT,-0.0034,0.0524,0.4884 17 | SIL_sc,0.0473,0.0801,0.4885 18 | SDBW,-0.0641,-0.0627,0.4736 19 | SS,-0.0034,0.0524,0.4884 20 | WG,-0.0244,-0.0210,0.5221 21 | XBS,-0.0535,0.0032,0.5280 22 | AMA,0.0543,0.0088,0.5046 23 | BHT,0.0239,0.0163,0.5050 24 | BREG,0.1022,0.1546,0.5006 25 | COR,0.0173,0.0364,0.5114 26 | ENG,0.1120,0.1054,0.5030 27 | JS,0.0566,0.1282,0.5045 28 | MAH,0.0749,0.0300,0.5075 29 | LK,0.0749,0.1048,0.5070 30 | WS,0.1133,0.1766,0.5001 31 | EM,0.0261,0.0752,0.4332 32 | MV,0.0094,-0.0164,0.4361 33 | Contam,-0.2003,-0.1498,0.5297 34 | GNB,-0.1931,-0.2902,0.4768 35 | HITS,-0.0449,-0.1235,0.5998 36 | Mode,-0.1505,-0.0840,0.5377 37 | Thresh,-0.1696,-0.1940,0.6031 38 | 
-------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### Submissions Basics: 2 | 3 | - [ ] Have you followed the guidelines in our Contributing document? 4 | - [ ] Have you checked to ensure there aren't other open [Pull Requests](../../../../pulls) for the same update/change? 5 | - [ ] Have you checked all [Issues](../../../../issues) to tie the PR to a specific one? 6 | 7 | 8 | 9 | ### Type of Change: 10 | 11 | - [ ] Bug fix 12 | - [ ] New feature 13 | - [ ] Documentation update 14 | 15 | ### All Submissions Cores: 16 | 17 | - [ ] Have you added an explanation of what your changes do and why you'd like us to include them? 18 | - [ ] Have you written new unit tests for your changes? 19 | - [ ] Do all new and existing unit tests pass locally? 20 | 21 | ### New Thresholder Submissions: 22 | 23 | - [ ] Have you created a \<name\>.py in ~/pythresh/thresholds/? 24 | - [ ] Have you created a \<name\>\_example.py in ~/examples/? 25 | - [ ] Have you created a test\_\<name\>.py in ~/pythresh/test/? 26 | - [ ] Have you linted your code locally prior to submission? 27 | - [ ] Have you added a reference for the new thresholder in your explanation? 28 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | ############## 2 | Installation 3 | ############## 4 | 5 | It is recommended to use **pip** or **conda** installation. Please make 6 | sure **the latest version** is installed, as PyThresh is updated 7 | frequently: 8 | 9 | .. code:: bash 10 | 11 | pip install pythresh # normal install 12 | pip install --upgrade pythresh # or update if needed 13 | 14 | .. code:: bash 15 | 16 | conda install -c conda-forge pythresh 17 | 18 | Alternatively, you can get the version with the latest updates by 19 | cloning the repo and running the setup.py file: 20 | 21 | .. code:: bash 22 | 23 | git clone https://github.com/KulikDM/pythresh.git 24 | cd pythresh 25 | pip install . 26 | 27 | Or with **pip**: 28 | 29 | .. code:: bash 30 | 31 | pip install https://github.com/KulikDM/pythresh/archive/main.zip 32 | 33 | **Required Dependencies**: 34 | 35 | - numpy>=1.13 36 | - pyod 37 | - scipy>=1.3.1 38 | - scikit-learn>=0.20.0 39 | 40 | **Optional Dependencies**: 41 | 42 | - pyclustering (used in the CLUST thresholder) 43 | - ruptures (used in the CPD thresholder) 44 | - scikit-lego (used in the META thresholder) 45 | - joblib>=0.14.1 (used in the META thresholder and RANK) 46 | - pandas (used in the META thresholder) 47 | - torch (used in the VAE thresholder) 48 | - tqdm (used in the VAE thresholder) 49 | - xgboost>=2.0.0 (used in RANK) 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2022, Daniel Kulik 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer.
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v6.0.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | - id: double-quote-string-fixer 9 | - id: requirements-txt-fixer 10 | - id: name-tests-test 11 | always_run: true 12 | args: [--pytest-test-first] 13 | - id: mixed-line-ending 14 | args: [--fix=lf] 15 | 16 | - repo: https://github.com/DanielNoord/pydocstringformatter 17 | rev: v0.7.5 18 | hooks: 19 | - id: pydocstringformatter 20 | args: [--style=pep257, --style=numpydoc] 21 | name: Format docstrings 22 | 23 | - repo: https://github.com/asottile/pyupgrade 24 | rev: v3.21.2 25 | hooks: 26 | - id: pyupgrade 27 | args: [--py38-plus] 28 | name: Upgrade code 29 | 30 | - repo: https://github.com/hhatto/autopep8 31 | rev: v2.3.2 32 | hooks: 33 | - id: autopep8 34 | args: [--in-place] 35 | name: Format code style 36 | 37 | - repo: https://github.com/PyCQA/isort 38 | rev: 7.0.0 39 | hooks: 40 | - id: isort 41 | args: [-m=3] 42 | name: Sort imports 43 | 44 | - repo: https://github.com/charliermarsh/ruff-pre-commit 45 | rev: v0.14.7 46 | hooks: 47 | - id: ruff 48 | args: [--exit-non-zero-on-fix, --fix, --line-length=180] 49 | exclude: "\.ipynb$" 50 | name: Lint code 51 | -------------------------------------------------------------------------------- /docs/api_cc.rst: -------------------------------------------------------------------------------- 1 | ################ 2 | API CheatSheet 3 | ################ 4 | 5 | The following APIs are applicable to all thresholder models for ease of 6 | use.
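For example, a minimal sketch of the fit/predict flow, condensed from the repository's own example scripts (the KNN detector and generated sample data are taken directly from those scripts):

.. code:: python

   from pyod.models.knn import KNN
   from pyod.utils.data import generate_data
   from pythresh.thresholds.zscore import ZSCORE

   X_train, X_test, y_train, y_test = generate_data(
       n_train=200, n_test=100, n_features=2,
       contamination=0.1, random_state=42)

   clf = KNN()
   clf.fit(X_train)
   scores = clf.decision_scores_  # raw outlier likelihood scores

   thres = ZSCORE()
   thres.fit(scores)
   labels = thres.labels_         # binary labels (0: inliers, 1: outliers)
   # or equivalently
   labels = thres.predict(scores)
   print(thres.thresh_)           # threshold on the normalized scores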
7 | 8 | - :func:`pythresh.thresholds.base.BaseThresholder.eval`: evaluate a 9 | single outlier or multiple outlier detection likelihood score set 10 | (Legacy method) 11 | 12 | - :func:`pythresh.thresholds.base.BaseThresholder.fit`: fit a 13 | thresholder for a single outlier or multiple outlier detection 14 | likelihood score set 15 | 16 | - :func:`pythresh.thresholds.base.BaseThresholder.predict`: predict the 17 | binary labels using the fitted thresholder on a single outlier or 18 | multiple outlier detection likelihood score set 19 | 20 | Key Attributes of a fitted model: 21 | 22 | - :attr:`pythresh.thresholds.base.BaseThresholder.thresh_`: threshold 23 | value from scores normalized between 0 and 1 24 | 25 | - :attr:`pythresh.thresholds.base.BaseThresholder.labels_`: A binary 26 | array of labels for the fitted thresholder on the fitted dataset 27 | 28 | - :attr:`pythresh.thresholds.base.BaseThresholder.confidence_interval_`: 29 | Return the lower and upper confidence interval of the contamination 30 | level. Only applies to the COMB thresholder 31 | 32 | - :attr:`pythresh.thresholds.base.BaseThresholder.dscores_`: 1D array of 33 | the TruncatedSVD decomposed decision scores if multiple outlier 34 | detector score sets are passed 35 | 36 | - :attr:`pythresh.thresholds.mixmod.MIXMOD.mixture_`: fitted mixture 37 | model class of the selected model used for thresholding. Only applies 38 | to MIXMOD. Attributes include: components, weights, params. Functions 39 | include: fit, loglikelihood, pdf, and posterior. 40 | 41 | See base class definition below: 42 | 43 | ********************************* 44 | pythresh.thresholds.base module 45 | ********************************* 46 | 47 | .. automodule:: pythresh.thresholds.base 48 | :members: 49 | :exclude-members: _data_setup, _set_norm, _set_attributes 50 | :undoc-members: 51 | :show-inheritance: 52 | :inherited-members: 53 | -------------------------------------------------------------------------------- /pythresh/test/test_conf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | from pyod.models.iforest import IForest 8 | from pyod.utils.data import generate_data 9 | 10 | from pythresh.thresholds.filter import FILTER 11 | from pythresh.thresholds.ocsvm import OCSVM 12 | from pythresh.utils.conf import CONF 13 | 14 | # temporary solution for relative imports in case pythresh is not installed 15 | # if pythresh is installed, no need to use the following line 16 | 17 | path = up(up(up(__file__))) 18 | sys.path.append(path) 19 | 20 | 21 | class TestCONF(unittest.TestCase): 22 | def setUp(self): 23 | self.n_train = 200 24 | self.n_test = 100 25 | self.contamination = 0.1 26 | self.X_train, self.X_test, self.y_train, self.y_test = generate_data( 27 | n_train=self.n_train, n_test=self.n_test, 28 | contamination=self.contamination, random_state=42) 29 | 30 | clf = IForest() 31 | clf.fit(self.X_train) 32 | 33 | self.scores = clf.decision_scores_ 34 | 35 | self.thres = [FILTER(), OCSVM()] 36 | 37 | self.alphas = [0.05, 0.1, 0.2] 38 | 39 | self.splits = [0.2, 0.5, 0.8] 40 | 41 | self.n_tests = [10, 100, 1000] 42 | 43 | def test_prediction_labels(self): 44 | 45 | params = product(self.thres, 46 | self.alphas, 47 | self.splits, 48 | self.n_tests) 49 | 50 | for thres, alpha, split, n_test in params: 51 | 52 | confidence = CONF(thres, alpha=alpha, 53 | split=split,
n_test=n_test) 54 | uncertains = confidence.eval(self.scores) 55 | 56 | assert (isinstance(uncertains, list)) 57 | assert (len(uncertains) <= len(self.scores)) 58 | 59 | if len(uncertains) > 0: 60 | 61 | assert (min(uncertains) > 0) 62 | assert (max(uncertains) < len(self.scores)) 63 | assert (len(set(uncertains)) == len(uncertains)) 64 | -------------------------------------------------------------------------------- /docs/tables/Benchmark2.csv: -------------------------------------------------------------------------------- 1 | Label,Method 2 | AUCP,AUCP() 3 | BOOT,BOOT() 4 | CHAU,CHAU() 5 | CLF1,CLF(method='simple') 6 | CLF2,CLF(method='complex') 7 | CLUST1,CLUST(method='agg') 8 | CLUST2,CLUST(method='birch') 9 | CLUST3,CLUST(method='bang') 10 | CLUST4,CLUST(method='bgm') 11 | CLUST5,CLUST(method='bsas') 12 | CLUST6,CLUST(method='dbscan') 13 | CLUST7,CLUST(method='ema') 14 | CLUST8,CLUST(method='kmeans') 15 | CLUST9,CLUST(method='mbsas') 16 | CLUST10,CLUST(method='mshift') 17 | CLUST11,CLUST(method='optics') 18 | CLUST12,CLUST(method='somsc') 19 | CLUST13,CLUST(method='spec') 20 | CLUST14,CLUST(method='xmeans') 21 | CPD1,CPD(method='Dynp') 22 | CPD2,CPD(method='KernelCPD') 23 | CPD3,CPD(method='Binseg') 24 | CPD4,CPD(method='BottomUp') 25 | DECOMP1,DECOMP(method='NMF') 26 | DECOMP2,DECOMP(method='PCA') 27 | DECOMP3,DECOMP(method='GRP') 28 | DECOMP4,DECOMP(method='SRP') 29 | DSN1,DSN(metric='JS') 30 | DSN2,DSN(metric='WS') 31 | DSN3,DSN(metric='ENG') 32 | DSN4,DSN(metric='BHT') 33 | DSN5,DSN(metric='HLL') 34 | DSN6,DSN(metric='HI') 35 | DSN7,DSN(metric='LK') 36 | DSN8,DSN(metric='MAH') 37 | DSN9,DSN(metric='TMT') 38 | DSN10,DSN(metric='RES') 39 | DSN11,DSN(metric='KS') 40 | DSN12,DSN(metric='INT') 41 | DSN13,DSN(metric='MMD') 42 | EB,EB() 43 | FGD,FGD() 44 | FILTER1,FILTER(method='gaussian') 45 | FILTER2,FILTER(method='savgol') 46 | FILTER3,FILTER(method='hilbert') 47 | FILTER4,FILTER(method='wiener') 48 | FILTER5,FILTER(method='medfilt') 49 | FILTER6,FILTER(method='decimate') 50 | FILTER7,FILTER(method='detrend') 51 | FILTER8,FILTER(method='resample') 52 | FWFM,FWFM() 53 | GESD,GESD() 54 | HIST1,HIST(method='otsu') 55 | HIST2,HIST(method='yen') 56 | HIST3,HIST(method='isodata') 57 | HIST4,HIST(method='li') 58 | HIST5,HIST(method='triangle') 59 | IQR,IQR() 60 | KARCH,KARCH() 61 | MAD,MAD() 62 | MCST,MCST() 63 | META1,META(method='LIN') 64 | META2,META(method='GNB') 65 | META3,META(method='GNBC') 66 | META4,META(method='GNBM') 67 | MOLL,MOLL() 68 | OCSVM1,OCSVM(method='poly') 69 | OCSVM2,OCSVM(method='sgd') 70 | QMCD1,QMCD(method='CD') 71 | QMCD2,QMCD(method='WD') 72 | QMCD3,QMCD(method='MD') 73 | QMCD4,QMCD(method='L2-star') 74 | REGR1,REGR(method='siegel') 75 | REGR2,REGR(method='theil') 76 | VAE,VAE() 77 | WIND,WIND() 78 | YJ,YJ() 79 | ZSCORE,ZSCORE() 80 | -------------------------------------------------------------------------------- /examples/eb_example.py: -------------------------------------------------------------------------------- 1 | """Example of using elliptical boundaries for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.eb import EB 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 |
os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = EB() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /pythresh/test/test_fastkde.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.dsn import DSN 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | # Test implementation of the fastkde interpolation method 23 | 24 | 25 | class TestFastKDE(unittest.TestCase): 26 | def setUp(self): 27 | self.n_train = 10000 28 | self.n_test = 100 29 | self.contamination = 0.1 30 | self.X_train, self.X_test, self.y_train, self.y_test = generate_data( 31 | n_train=self.n_train, n_test=self.n_test, 32 | contamination=self.contamination, random_state=42) 33 | 34 | clf = KNN() 35 | clf.fit(self.X_train) 36 | 37 | scores = clf.decision_scores_ 38 | 39 | clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 40 | 41 | multiple_scores = [ 42 | clf.fit(self.X_train).decision_scores_ for clf in clfs] 43 | multiple_scores = np.vstack(multiple_scores).T 44 | 45 | self.all_scores = [scores, multiple_scores] 46 | 47 | self.metrics = ['JS', 'MAH'] 48 | 49 | def test_prediction_labels(self): 50 | 51 | params = product(self.all_scores, self.metrics) 52 | 53 | for scores, metric in params: 54 | 55 | self.thres = DSN(metric=metric) 56 | pred_labels = self.thres.eval(scores) 57 | assert (self.thres.thresh_ is not None) 58 | assert 
(self.thres.dscores_ is not None) 59 | 60 | assert (self.thres.dscores_.min() == 0) 61 | assert (self.thres.dscores_.max() == 1) 62 | 63 | assert_equal(pred_labels.shape, self.y_train.shape) 64 | 65 | if (not np.all(pred_labels == 0)) & (not np.all(pred_labels == 1)): 66 | 67 | assert (pred_labels.min() == 0) 68 | assert (pred_labels.max() == 1) 69 | -------------------------------------------------------------------------------- /examples/boot_example.py: -------------------------------------------------------------------------------- 1 | """Example of using a bootstrapped method for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.boot import BOOT 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = BOOT() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/clf_example.py: -------------------------------------------------------------------------------- 1 | """Example of using a trained classifier for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.clf import CLF 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of
outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = CLF() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/cpd_example.py: -------------------------------------------------------------------------------- 1 | """Example of using change point detection for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.cpd import CPD 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = CPD() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 |
print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/fgd_example.py: -------------------------------------------------------------------------------- 1 | """Example of using fixed gradient descent for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.fgd import FGD 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = FGD() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/meta_example.py: -------------------------------------------------------------------------------- 1 | """Example of using a meta-model for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.meta import META 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 |
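# Note (editorial comment, not in the original script): META applies a
# pre-trained meta-model to the scores. The models bundled in
# pythresh/models/ are LIN, GNB, GNBC and GNBM, selected via e.g.
# META(method='GNB').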
26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = META() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/zscore_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the zscore for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.zscore import ZSCORE 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = ZSCORE() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 
66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/chau_example.py: -------------------------------------------------------------------------------- 1 | """Example of using Chauvenet's criterion for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.chau import CHAU 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = CHAU() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/clust_example.py: -------------------------------------------------------------------------------- 1 | """Example of using clustering based methods for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.clust import CLUST 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 
29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = CLUST() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/dsn_example.py: -------------------------------------------------------------------------------- 1 | """Example of using distance shift from normal for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.dsn import DSN 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = DSN() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, 
show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/gamgmm_example.py: -------------------------------------------------------------------------------- 1 | """Example of using gammaGMM for outlier thresholding.""" 2 | # Author: L Perini 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.gamgmm import GAMGMM 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = GAMGMM() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/hist_example.py: -------------------------------------------------------------------------------- 1 | """Example of using histogram based methods for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.hist import HIST 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | 
random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = HIST() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/iqr_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the inter-quartile region for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.iqr import IQR 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = IQR() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | 
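69 | # For reference, a rough commented-out sketch of the classic Tukey fence that 70 | # IQR-style thresholding is built on (illustrative only, not the exact 71 | # internals of the IQR class; assumes numpy is imported as np): 72 | # q1, q3 = np.percentile(y_train_scores, [25, 75]) 73 | # upper_fence = q3 + 1.5 * (q3 - q1) # Q3 + 1.5 * IQR 74 | # y_train_pred = (y_train_scores > upper_fence).astype(int) 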
-------------------------------------------------------------------------------- /examples/mad_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the median absolute deviation for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.mad import MAD 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = MAD() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/moll_example.py: -------------------------------------------------------------------------------- 1 | """Example of using Friedrichs' mollifier for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.moll import MOLL 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | 
clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = MOLL() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/regr_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the regression intercept for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.regr import REGR 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = REGR() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- 
/examples/yj_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the Yeo-Johnson transformation for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.yj import YJ 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = YJ() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/fwfm_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the full width at full minimum for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.fwfm import FWFM 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = FWFM()
39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/karch_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the Karcher mean for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.karch import KARCH 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = KARCH() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/mcst_example.py: 
-------------------------------------------------------------------------------- 1 | """Example of using Monte Carlo statistical tests for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.mcst import MCST 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = MCST() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/mixmod_example.py: -------------------------------------------------------------------------------- 1 | """Example of using mixture models for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.mixmod import MIXMOD 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = MIXMOD() 39 | 40 | # get the
prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/mtt_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the modified Thompson Tau test for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.mtt import MTT 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = MTT() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/wind_example.py: -------------------------------------------------------------------------------- 1 | 
"""Example of using the topological winding number for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.wind import WIND 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = WIND() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/filter_example.py: -------------------------------------------------------------------------------- 1 | """Example of using distance shift from normal for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.filter import FILTER 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = FILTER() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = 
clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/qmcd_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the quasi-Monte Carlo discrepancy for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.qmcd import QMCD 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = QMCD() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/ocsvm_example.py: -------------------------------------------------------------------------------- 1 | """Example of using a one-class SVM. 
2 | 3 | for outlier thresholding 4 | """ 5 | # Author: D Kulik 6 | # License: BSD 2 clause 7 | 8 | 9 | import os 10 | import sys 11 | 12 | from pyod.models.knn import KNN 13 | from pyod.utils.data import evaluate_print, generate_data 14 | from pyod.utils.example import visualize 15 | 16 | from pythresh.thresholds.ocsvm import OCSVM 17 | 18 | # temporary solution for relative imports in case pyod is not installed 19 | # if pyod is installed, no need to use the following line 20 | sys.path.append( 21 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 22 | 23 | 24 | if __name__ == '__main__': 25 | contamination = 0.1 # percentage of outliers 26 | n_train = 200 # number of training points 27 | n_test = 100 # number of testing points 28 | 29 | # Generate sample data 30 | X_train, X_test, y_train, y_test =\ 31 | generate_data(n_train=n_train, 32 | n_test=n_test, 33 | n_features=2, 34 | contamination=contamination, 35 | random_state=42) 36 | 37 | # train KNN detector 38 | clf_name = 'KNN' 39 | clf = KNN() 40 | clf.fit(X_train) 41 | thres = OCSVM() 42 | 43 | # get the prediction labels and outlier scores of the training data 44 | y_train_scores = clf.decision_scores_ # raw outlier scores 45 | 46 | # (Legacy method) 47 | # y_train_pred = thres.eval(y_train_scores) 48 | 49 | # binary labels (0: inliers, 1: outliers) 50 | thres.fit(y_train_scores) 51 | y_train_pred = thres.labels_ 52 | # or 53 | y_train_pred = thres.predict(y_train_scores) 54 | 55 | # get the prediction on the test data 56 | y_test_scores = clf.decision_function(X_test) # outlier scores 57 | 58 | # (Legacy method) 59 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 60 | y_test_pred = thres.predict(y_test_scores) 61 | 62 | # evaluate and print the results 63 | print('\nOn Training Data:') 64 | evaluate_print(clf_name, y_train, y_train_scores) 65 | print('\nOn Test Data:') 66 | evaluate_print(clf_name, y_test, y_test_scores) 67 | 68 | # visualize the results 69 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 70 | y_test_pred, show_figure=True, save_figure=False) 71 | -------------------------------------------------------------------------------- /examples/aucp_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the area under the curve percentage. 
2 | 3 | for outlier thresholding 4 | """ 5 | # Author: D Kulik 6 | # License: BSD 2 clause 7 | 8 | 9 | import os 10 | import sys 11 | 12 | from pyod.models.knn import KNN 13 | from pyod.utils.data import evaluate_print, generate_data 14 | from pyod.utils.example import visualize 15 | 16 | from pythresh.thresholds.aucp import AUCP 17 | 18 | # temporary solution for relative imports in case pyod is not installed 19 | # if pyod is installed, no need to use the following line 20 | sys.path.append( 21 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 22 | 23 | 24 | if __name__ == '__main__': 25 | contamination = 0.1 # percentage of outliers 26 | n_train = 200 # number of training points 27 | n_test = 100 # number of testing points 28 | 29 | # Generate sample data 30 | X_train, X_test, y_train, y_test =\ 31 | generate_data(n_train=n_train, 32 | n_test=n_test, 33 | n_features=2, 34 | contamination=contamination, 35 | random_state=42) 36 | 37 | # train KNN detector 38 | clf_name = 'KNN' 39 | clf = KNN() 40 | clf.fit(X_train) 41 | thres = AUCP() 42 | 43 | # get the prediction labels and outlier scores of the training data 44 | y_train_scores = clf.decision_scores_ # raw outlier scores 45 | 46 | # (Legacy method) 47 | # y_train_pred = thres.eval(y_train_scores) 48 | 49 | # binary labels (0: inliers, 1: outliers) 50 | thres.fit(y_train_scores) 51 | y_train_pred = thres.labels_ 52 | # or 53 | y_train_pred = thres.predict(y_train_scores) 54 | 55 | # get the prediction on the test data 56 | y_test_scores = clf.decision_function(X_test) # outlier scores 57 | 58 | # (Legacy method) 59 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 60 | y_test_pred = thres.predict(y_test_scores) 61 | 62 | # evaluate and print the results 63 | print('\nOn Training Data:') 64 | evaluate_print(clf_name, y_train, y_train_scores) 65 | print('\nOn Test Data:') 66 | evaluate_print(clf_name, y_test, y_test_scores) 67 | 68 | # visualize the results 69 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 70 | y_test_pred, show_figure=True, save_figure=False) 71 | -------------------------------------------------------------------------------- /examples/vae_example.py: -------------------------------------------------------------------------------- 1 | """Example of using a variational autoencoder. 
2 | 3 | for outlier thresholding 4 | """ 5 | # Author: D Kulik 6 | # License: BSD 2 clause 7 | 8 | 9 | import os 10 | import sys 11 | 12 | from pyod.models.knn import KNN 13 | from pyod.utils.data import evaluate_print, generate_data 14 | from pyod.utils.example import visualize 15 | 16 | from pythresh.thresholds.vae import VAE 17 | 18 | # temporary solution for relative imports in case pyod is not installed 19 | # if pyod is installed, no need to use the following line 20 | sys.path.append( 21 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 22 | 23 | 24 | if __name__ == '__main__': 25 | contamination = 0.1 # percentage of outliers 26 | n_train = 200 # number of training points 27 | n_test = 100 # number of testing points 28 | 29 | # Generate sample data 30 | X_train, X_test, y_train, y_test =\ 31 | generate_data(n_train=n_train, 32 | n_test=n_test, 33 | n_features=2, 34 | contamination=contamination, 35 | random_state=42) 36 | 37 | # train KNN detector 38 | clf_name = 'KNN' 39 | clf = KNN() 40 | clf.fit(X_train) 41 | thres = VAE() 42 | 43 | # get the prediction labels and outlier scores of the training data 44 | y_train_scores = clf.decision_scores_ # raw outlier scores 45 | 46 | # (Legacy method) 47 | # y_train_pred = thres.eval(y_train_scores) 48 | 49 | # binary labels (0: inliers, 1: outliers) 50 | thres.fit(y_train_scores) 51 | y_train_pred = thres.labels_ 52 | # or 53 | y_train_pred = thres.predict(y_train_scores) 54 | 55 | # get the prediction on the test data 56 | y_test_scores = clf.decision_function(X_test) # outlier scores 57 | 58 | # (Legacy method) 59 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 60 | y_test_pred = thres.predict(y_test_scores) 61 | 62 | # evaluate and print the results 63 | print('\nOn Training Data:') 64 | evaluate_print(clf_name, y_train, y_train_scores) 65 | print('\nOn Test Data:') 66 | evaluate_print(clf_name, y_test, y_test_scores) 67 | 68 | # visualize the results 69 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 70 | y_test_pred, show_figure=True, save_figure=False) 71 | -------------------------------------------------------------------------------- /examples/decomp_example.py: -------------------------------------------------------------------------------- 1 | """Example of using decomposition methods. 
2 | 3 | for outlier thresholding 4 | """ 5 | # Author: D Kulik 6 | # License: BSD 2 clause 7 | 8 | 9 | import os 10 | import sys 11 | 12 | from pyod.models.knn import KNN 13 | from pyod.utils.data import evaluate_print, generate_data 14 | from pyod.utils.example import visualize 15 | 16 | from pythresh.thresholds.decomp import DECOMP 17 | 18 | # temporary solution for relative imports in case pyod is not installed 19 | # if pyod is installed, no need to use the following line 20 | sys.path.append( 21 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 22 | 23 | 24 | if __name__ == '__main__': 25 | contamination = 0.1 # percentage of outliers 26 | n_train = 200 # number of training points 27 | n_test = 100 # number of testing points 28 | 29 | # Generate sample data 30 | X_train, X_test, y_train, y_test =\ 31 | generate_data(n_train=n_train, 32 | n_test=n_test, 33 | n_features=2, 34 | contamination=contamination, 35 | random_state=42) 36 | 37 | # train KNN detector 38 | clf_name = 'KNN' 39 | clf = KNN() 40 | clf.fit(X_train) 41 | thres = DECOMP() 42 | 43 | # get the prediction labels and outlier scores of the training data 44 | y_train_scores = clf.decision_scores_ # raw outlier scores 45 | 46 | # (Legacy method) 47 | # y_train_pred = thres.eval(y_train_scores) 48 | 49 | # binary labels (0: inliers, 1: outliers) 50 | thres.fit(y_train_scores) 51 | y_train_pred = thres.labels_ 52 | # or 53 | y_train_pred = thres.predict(y_train_scores) 54 | 55 | # get the prediction on the test data 56 | y_test_scores = clf.decision_function(X_test) # outlier scores 57 | 58 | # (Legacy method) 59 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 60 | y_test_pred = thres.predict(y_test_scores) 61 | 62 | # evaluate and print the results 63 | print('\nOn Training Data:') 64 | evaluate_print(clf_name, y_train, y_train_scores) 65 | print('\nOn Test Data:') 66 | evaluate_print(clf_name, y_test, y_test_scores) 67 | 68 | # visualize the results 69 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 70 | y_test_pred, show_figure=True, save_figure=False) 71 | -------------------------------------------------------------------------------- /examples/gesd_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the generalized extreme studentized deviate. 
2 | 3 | for outlier thresholding 4 | """ 5 | # Author: D Kulik 6 | # License: BSD 2 clause 7 | 8 | 9 | import os 10 | import sys 11 | 12 | from pyod.models.knn import KNN 13 | from pyod.utils.data import evaluate_print, generate_data 14 | from pyod.utils.example import visualize 15 | 16 | from pythresh.thresholds.gesd import GESD 17 | 18 | # temporary solution for relative imports in case pyod is not installed 19 | # if pyod is installed, no need to use the following line 20 | sys.path.append( 21 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 22 | 23 | 24 | if __name__ == '__main__': 25 | contamination = 0.1 # percentage of outliers 26 | n_train = 200 # number of training points 27 | n_test = 100 # number of testing points 28 | 29 | # Generate sample data 30 | X_train, X_test, y_train, y_test =\ 31 | generate_data(n_train=n_train, 32 | n_test=n_test, 33 | n_features=2, 34 | contamination=contamination, 35 | random_state=42) 36 | 37 | # train KNN detector 38 | clf_name = 'KNN' 39 | clf = KNN() 40 | clf.fit(X_train) 41 | thres = GESD() 42 | 43 | # get the prediction labels and outlier scores of the training data 44 | y_train_scores = clf.decision_scores_ # raw outlier scores 45 | 46 | # (Legacy method) 47 | # y_train_pred = thres.eval(y_train_scores) 48 | 49 | # binary labels (0: inliers, 1: outliers) 50 | thres.fit(y_train_scores) 51 | y_train_pred = thres.labels_ 52 | # or 53 | y_train_pred = thres.predict(y_train_scores) 54 | 55 | # get the prediction on the test data 56 | y_test_scores = clf.decision_function(X_test) # outlier scores 57 | 58 | # (Legacy method) 59 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 60 | y_test_pred = thres.predict(y_test_scores) 61 | 62 | # evaluate and print the results 63 | print('\nOn Training Data:') 64 | evaluate_print(clf_name, y_train, y_train_scores) 65 | print('\nOn Test Data:') 66 | evaluate_print(clf_name, y_test, y_test_scores) 67 | 68 | # visualize the results 69 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 70 | y_test_pred, show_figure=True, save_figure=False) 71 | -------------------------------------------------------------------------------- /pythresh/utils/rank_utility.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.decomposition import TruncatedSVD 3 | from sklearn.naive_bayes import GaussianNB 4 | 5 | 6 | def BREG_metric(x1, x2): 7 | """Calculate the Exponential Euclidean Bregman distance.""" 8 | 9 | gradient_x = np.exp(x1) - 1 10 | gradient_y = np.exp(x2) - 1 11 | 12 | distance = np.sum((x1 - x2) * (gradient_x - gradient_y)) 13 | 14 | return distance 15 | 16 | 17 | def mclain_rao_index(data, labels): 18 | """Calculate the McClain-Rao index.""" 19 | 20 | unique_labels = np.unique(labels) 21 | centroids = [] 22 | 23 | # Calculate the centroids of each cluster 24 | for label in unique_labels: 25 | cluster_data = data[labels == label] 26 | centroid = np.mean(cluster_data) 27 | centroids.append(centroid) 28 | 29 | num_clusters = len(centroids) 30 | mri = 0.0 31 | 32 | # Calculate the MRI 33 | for i in range(num_clusters): 34 | for j in range(i + 1, num_clusters): 35 | distance = (centroids[i] - centroids[j]) ** 2 36 | mri += distance 37 | 38 | # Normalize the MRI by the number of cluster pairs 39 | num_cluster_pairs = num_clusters * (num_clusters - 1) / 2 40 | mri /= num_cluster_pairs 41 | 42 | return mri 43 | 44 | 45 | def GNB_score(data, labels): 46 | """Calculate the Gaussian Naive-Bayes trained
consensus score.""" 47 | 48 | # Setup data for training 49 | X = np.tile(data, (len(labels), 1)) 50 | y = np.hstack(labels) 51 | 52 | # Fit model and predict 53 | model = GaussianNB() 54 | model.fit(X, y) 55 | 56 | pred = model.predict(data) 57 | 58 | # Find the deviation of each model from fitted GNB model 59 | dev = np.sum(np.abs(np.vstack(labels) - pred), axis=1) 60 | 61 | return dev.squeeze() 62 | 63 | 64 | def Contam_score(data, labels, contam): 65 | """Calculate the mean contamination deviation based on TruncatedSVD decomposed scores.""" 66 | 67 | # Fit model and transform data 68 | decomp = TruncatedSVD(n_components=1, random_state=1234) 69 | dat = decomp.fit_transform(np.vstack(data).T).squeeze() 70 | 71 | # Find the deviation of the contamination of each model from the decomposed model 72 | thr = np.zeros(len(labels[0])) 73 | thr[dat > np.percentile(dat, (1-np.mean(contam))*100)] = 1 74 | 75 | dev = np.sum(np.abs(np.vstack(labels) - thr), axis=1) 76 | 77 | return dev.squeeze() 78 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ${{ matrix.os }} 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | os: [ubuntu-latest, windows-latest, macos-latest] 21 | python-version: ["3.10", "3.11", "3.12", "3.13"] 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | with: 26 | fetch-depth: 0 27 | - name: Set up Python ${{ matrix.python-version }} 28 | uses: actions/setup-python@v6 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip wheel setuptools setuptools-scm 34 | python -m pip install flake8 pytest-cov mypy cython 35 | python -m pip install -r requirements-test.txt --use-pep517 36 | - name: Lint with flake8 37 | run: | 38 | # stop the build if there are Python syntax errors or undefined names 39 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 40 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 41 | flake8 . 
58 |   finalize:
59 |     needs: build
60 |     runs-on: ubuntu-latest
61 |     steps:
62 |     - name: Finalize Codecov uploads
63 |       uses: codecov/codecov-action@v5
64 |       with:
65 |         finalize: true
66 |       env:
67 |         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
68 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import re
2 | import sys
3 | # read the contents of README file
4 | from os import path
5 | 
6 | from setuptools import find_packages, setup
7 | 
8 | # get __version__ from version.py
9 | try:
10 |     verpath = path.join('pythresh', 'version.py')
11 |     with open(verpath) as version_file:
12 |         __version__ = str(re.findall(
13 |             r'\b\d+(?:\.\d+)+', version_file.read())[0])
14 | except Exception as error:
15 |     __version__ = '0.0.1'
16 |     sys.stderr.write("Warning: Could not open '%s' due to %s\n" %
17 |                      (verpath, error))
18 | 
19 | 
20 | this_directory = path.abspath(path.dirname(__file__))
21 | 
22 | 
23 | # read the contents of README.rst
24 | def readme():
25 |     with open(path.join(this_directory, 'README.rst'), encoding='utf-8') as f:
26 |         return f.read()
27 | 
28 | 
29 | # read the contents of requirements.txt
30 | with open(path.join(this_directory, 'requirements.txt'),
31 |           encoding='utf-8') as f:
32 |     requirements = f.read().splitlines()
33 | 
34 | setup(
35 |     name='pythresh',
36 |     version=__version__,
37 |     description='A Python Toolbox for Outlier Detection Thresholding',
38 |     long_description=readme(),
39 |     long_description_content_type='text/x-rst',
40 |     author='D Kulik',
41 |     url='https://github.com/KulikDM/pythresh',
42 |     download_url='https://github.com/KulikDM/pythresh/archive/master.zip',
43 |     keywords=['outlier detection', 'anomaly detection', 'thresholding', 'cutoff',
44 |               'contamination level', 'data science', 'machine learning'],
45 |     project_urls={
46 |         'Documentation': 'https://pythresh.readthedocs.io/en/latest/'},
47 |     packages=find_packages(exclude=['test']),
48 |     include_package_data=True,
49 |     install_requires=requirements,
50 |     setup_requires=['setuptools>=38.6.0'],
51 |     classifiers=[
52 |         'Development Status :: 5 - Production/Stable',
53 |         'Intended Audience :: Education',
54 |         'Intended Audience :: Financial and Insurance Industry',
55 |         'Intended Audience :: Science/Research',
56 |         'Intended Audience :: Developers',
57 |         'Intended Audience :: Information Technology',
58 |         'License :: OSI Approved :: BSD License',
59 |         'Programming Language :: Python :: 3.7',
60 |         'Programming Language :: Python :: 3.8',
61 |         'Programming Language :: Python :: 3.9',
62 |         'Programming Language :: Python :: 3.10',
63 |         'Programming Language :: Python :: 3.11',
64 |         'Programming Language :: Python :: 3.12',
65 |         'Programming Language :: Python :: 3.13',
66 |     ],
67 | )
68 | 
--------------------------------------------------------------------------------
/pythresh/test/test_rank.py:
-------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | from pyod.models.iforest import IForest 8 | from pyod.models.knn import KNN 9 | from pyod.models.pca import PCA 10 | from pyod.utils.data import generate_data 11 | 12 | from pythresh.thresholds.filter import FILTER 13 | from pythresh.thresholds.iqr import IQR 14 | from pythresh.thresholds.ocsvm import OCSVM 15 | from pythresh.utils.rank import RANK 16 | 17 | # temporary solution for relative imports in case pythresh is not installed 18 | # if pythresh is installed, no need to use the following line 19 | 20 | path = up(up(up(__file__))) 21 | sys.path.append(path) 22 | 23 | 24 | class TestRANK(unittest.TestCase): 25 | def setUp(self): 26 | self.n_train = 200 27 | self.n_test = 100 28 | self.contamination = 0.1 29 | self.X_train, self.X_test, self.y_train, self.y_test = generate_data( 30 | n_train=self.n_train, n_test=self.n_test, 31 | contamination=self.contamination, random_state=42) 32 | 33 | self.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 34 | 35 | self.thres = [FILTER(), self.contamination, 36 | [FILTER(), IQR(), OCSVM()]] 37 | 38 | self.method = ['model', 'native'] 39 | 40 | self.weights = [[0.5, 0.25, 0.25], 41 | [0.25, 0.5, 0.25], 42 | [0.25, 0.25, 0.5], 43 | None] 44 | 45 | def test_prediction_labels(self): 46 | 47 | params = product(self.thres, 48 | self.method, 49 | self.weights) 50 | 51 | for thres, method, weights in params: 52 | 53 | ranker = RANK(self.clfs, thres, method=method, weights=weights) 54 | rankings = ranker.eval(self.X_train) 55 | 56 | cdf_rank = ranker.cdf_rank_ 57 | clust_rank = ranker.clust_rank_ 58 | consensus_rank = ranker.consensus_rank_ 59 | 60 | assert (cdf_rank is not None) 61 | assert (clust_rank is not None) 62 | assert (consensus_rank is not None) 63 | assert (rankings is not None) 64 | 65 | n_clfs = len(self.clfs) 66 | n_thres = len(thres) if isinstance(thres, list) else 1 67 | len_models = n_clfs * n_thres 68 | 69 | assert (len(cdf_rank) == len_models) 70 | assert (len(clust_rank) == len_models) 71 | assert (len(consensus_rank) == len_models) 72 | assert (len(rankings) == len_models) 73 | 74 | assert (len(set(rankings)) == len_models) 75 | -------------------------------------------------------------------------------- /pythresh/thresholds/zscore.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as stats 3 | 4 | from .base import BaseThresholder 5 | 6 | 7 | class ZSCORE(BaseThresholder): 8 | r"""ZSCORE class for ZSCORE thresholder. 9 | 10 | Use the zscore to evaluate a non-parametric means to threshold 11 | scores generated by the decision_scores where outliers are set 12 | to any value beyond a zscore of one. 13 | See :cite:`bagdonavicius2020zscore` for details. 14 | 15 | Parameters 16 | ---------- 17 | 18 | factor : int, optional (default=1) 19 | The factor to multiply the zscore by to set the threshold. 20 | The default is 1. 21 | 22 | random_state : int, optional (default=1234) 23 | Random seed for the random number generators of the thresholders. Can also 24 | be set to None. 
25 | 
26 |     Attributes
27 |     ----------
28 | 
29 |     thresh_ : threshold value that separates inliers from outliers
30 | 
31 |     dscores_ : 1D array of decomposed decision scores
32 | 
33 |     Notes
34 |     -----
35 | 
36 |     The z-score can be calculated as follows:
37 | 
38 |     .. math::
39 | 
40 |        Z = \frac{x-\bar{x}}{\sigma} \mathrm{,}
41 | 
42 |     where :math:`\bar{x}` and :math:`\sigma` are the mean and the
43 |     standard deviation of the decision scores respectively. The threshold
44 |     is set such that any value beyond a z-score of the factor (1 by
45 |     default) is considered an outlier.
46 | 
47 |     """
48 | 
49 |     def __init__(self, factor=1, random_state=1234):
50 | 
51 |         super().__init__()
52 |         self.factor = factor
53 |         self.random_state = random_state
54 |         np.random.seed(random_state)
55 | 
56 |     def eval(self, decision):
57 |         """Outlier/inlier evaluation process for decision scores.
58 | 
59 |         Parameters
60 |         ----------
61 |         decision : np.array or list of shape (n_samples)
62 |                    or np.array of shape (n_samples, n_detectors)
63 |                    which are the decision scores from an
64 |                    outlier detection.
65 | 
66 |         Returns
67 |         -------
68 |         outlier_labels : numpy array of shape (n_samples,)
69 |             For each observation, tells whether or not
70 |             it should be considered as an outlier according to the
71 |             fitted model. 0 stands for inliers and 1 for outliers.
72 |         """
73 | 
74 |         decision = self._data_setup(decision)
75 | 
76 |         # Get the zscore of the decision scores
77 |         zscore = stats.zscore(decision)
78 | 
79 |         # Set the limit to where the zscore is greater than the factor
80 |         labels = np.zeros(len(decision), dtype=int)
81 |         mask = np.where(zscore >= self.factor)
82 |         labels[mask] = 1
83 | 
84 |         # Guard against the edge case where no score reaches the factor
85 |         eps = np.finfo(decision.dtype).eps
86 |         self.thresh_ = (np.min(decision[labels == 1]) if labels.any()
87 |                         else 1.0 + eps)
88 | 
89 |         return labels
90 | 
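91 | 
92 | # A minimal usage sketch (illustrative only; the synthetic scores are an
93 | # assumption for demonstration, not library test data). Run with:
94 | #   python -m pythresh.thresholds.zscore
95 | if __name__ == '__main__':  # pragma: no cover
96 |     np.random.seed(1234)
97 |     scores = np.hstack([np.random.normal(0.0, 1.0, 95),
98 |                         np.random.normal(6.0, 1.0, 5)])
99 |     print(ZSCORE(factor=2).eval(scores))  # should flag the five shifted scores
100 | 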
--------------------------------------------------------------------------------
/pythresh/thresholds/mad.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.stats as stats
3 | 
4 | from .base import BaseThresholder
5 | from .thresh_utility import cut
6 | 
7 | 
8 | class MAD(BaseThresholder):
9 |     r"""MAD class for Median Absolute Deviation thresholder.
10 | 
11 |     Use the median absolute deviation to evaluate a non-parametric
12 |     means to threshold scores generated by the decision_scores
13 |     where outliers are set to any value beyond the mean plus the
14 |     median absolute deviation over the standard deviation.
15 |     See :cite:`archana2015mad` for details.
16 | 
17 |     Parameters
18 |     ----------
19 | 
20 |     factor : int, optional (default=1)
21 |         The factor to multiply the MAD by to set the threshold.
22 |         The default is 1.
23 | 
24 |     random_state : int, optional (default=1234)
25 |         Random seed for the random number generators of the thresholders. Can also
26 |         be set to None.
27 | 
28 |     Attributes
29 |     ----------
30 | 
31 |     thresh_ : threshold value that separates inliers from outliers
32 | 
33 |     dscores_ : 1D array of decomposed decision scores
34 | 
35 |     Notes
36 |     -----
37 | 
38 |     The median absolute deviation is defined as:
39 | 
40 |     .. math::
41 | 
42 |        MAD = med\lvert x - med(x)\rvert \mathrm{.}
43 | 
44 |     And the threshold is set such that:
45 | 
46 |     .. math::
47 | 
48 |        \mathrm{lim} = \bar{x} + \frac{MAD}{\sigma} \mathrm{,}
49 | 
50 |     where :math:`\bar{x}` and :math:`\sigma` are the mean and
51 |     standard deviation of the scores respectively.
52 | 
53 |     """
54 | 
55 |     def __init__(self, factor=1, random_state=1234):
56 | 
57 |         super().__init__()
58 |         self.factor = factor
59 |         self.random_state = random_state
60 |         np.random.seed(random_state)
61 | 
62 |     def eval(self, decision):
63 |         """Outlier/inlier evaluation process for decision scores.
64 | 
65 |         Parameters
66 |         ----------
67 |         decision : np.array or list of shape (n_samples)
68 |                    or np.array of shape (n_samples, n_detectors)
69 |                    which are the decision scores from an
70 |                    outlier detection.
71 | 
72 |         Returns
73 |         -------
74 |         outlier_labels : numpy array of shape (n_samples,)
75 |             For each observation, tells whether or not
76 |             it should be considered as an outlier according to the
77 |             fitted model. 0 stands for inliers and 1 for outliers.
78 |         """
79 | 
80 |         decision = self._data_setup(decision)
81 | 
82 |         # Set limit
83 |         mean = np.mean(decision)
84 |         mad = stats.median_abs_deviation(decision, scale=np.std(decision))
85 |         limit = mean + self.factor * mad
86 | 
87 |         self.thresh_ = limit
88 | 
89 |         return cut(decision, limit)
90 | 
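91 | 
92 | # A minimal usage sketch (illustrative only; the synthetic scores are an
93 | # assumption for demonstration, not library test data). Run with:
94 | #   python -m pythresh.thresholds.mad
95 | if __name__ == '__main__':  # pragma: no cover
96 |     np.random.seed(1234)
97 |     scores = np.hstack([np.random.normal(0.0, 1.0, 95),
98 |                         np.random.normal(8.0, 0.5, 5)])
99 |     mad = MAD()
100 |     print(mad.eval(scores), mad.thresh_)
101 | 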
72 | """ 73 | 74 | decision = self._data_setup(decision) 75 | 76 | arg_map = {'old': 'interpolation', 'new': 'method'} 77 | arg_name = (arg_map['new'] if 'method' in 78 | inspect.signature(np.percentile).parameters 79 | else arg_map['old']) 80 | 81 | # First quartile (Q1) 82 | P1 = np.percentile(decision, 25, **{arg_name: 'midpoint'}) 83 | 84 | # Third quartile (Q3) 85 | P3 = np.percentile(decision, 75, **{arg_name: 'midpoint'}) 86 | 87 | # Calculate IQR and generate limit 88 | iqr = abs(P3-P1) 89 | limit = P3 + 1.5*iqr 90 | 91 | self.thresh_ = limit 92 | 93 | return cut(decision, limit) 94 | -------------------------------------------------------------------------------- /pythresh/thresholds/fwfm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.signal import find_peaks, peak_widths 3 | 4 | from .base import BaseThresholder 5 | from .thresh_utility import cut, gen_kde, normalize 6 | 7 | 8 | class FWFM(BaseThresholder): 9 | """FWFM class for Full Width at Full Minimum thresholder. 10 | 11 | Use the full width at full minimum (aka base width) to evaluate 12 | a non-parametric means to threshold scores generated by the 13 | decision_scores where outliers are set to any value beyond the base 14 | width. See :cite:`joneidi2013fwfm` for details. 15 | 16 | Parameters 17 | ---------- 18 | 19 | random_state : int, optional (default=1234) 20 | Random seed for the random number generators of the thresholders. Can also 21 | be set to None. 22 | 23 | Attributes 24 | ---------- 25 | 26 | thresh_ : threshold value that separates inliers from outliers 27 | 28 | dscores_ : 1D array of decomposed decision scores 29 | 30 | Notes 31 | ----- 32 | 33 | The outlier detection scores are assumed to be a mixture of Gaussian 34 | distributions. The probability density function of this Gaussian mixture 35 | is approximated using kernel density estimation. The highest peak within the 36 | PDF is used to find the base width of the mixture and the threshold is set 37 | to the base width divided by the number of scores. 38 | """ 39 | 40 | def __init__(self, random_state=1234): 41 | 42 | super().__init__() 43 | self.random_state = random_state 44 | np.random.seed(random_state) 45 | 46 | def eval(self, decision): 47 | """Outlier/inlier evaluation process for decision scores. 48 | 49 | Parameters 50 | ---------- 51 | decision : np.array or list of shape (n_samples) 52 | or np.array of shape (n_samples, n_detectors) 53 | which are the decision scores from a 54 | outlier detection. 55 | 56 | Returns 57 | ------- 58 | outlier_labels : numpy array of shape (n_samples,) 59 | For each observation, tells whether or not 60 | it should be considered as an outlier according to the 61 | fitted model. 0 stands for inliers and 1 for outliers. 
62 | """ 63 | 64 | decision = self._data_setup(decision) 65 | 66 | # Generate KDE 67 | val, _ = gen_kde(decision, -1, 1, len(decision)*3) 68 | val = normalize(val) 69 | 70 | # Find the greatest peak of the KDE 71 | peaks, _ = find_peaks(val, prominence=0.75) 72 | 73 | # Find the base width of the peak 74 | base_width = peak_widths(val, peaks, rel_height=0.99)[0] 75 | 76 | # Normalize and set limit 77 | eps = np.finfo(decision.dtype).eps 78 | limit = base_width[0]/len(val) if len(base_width) > 0 else 1.0 + eps 79 | 80 | self.thresh_ = limit 81 | 82 | return cut(decision, limit) 83 | -------------------------------------------------------------------------------- /pythresh/thresholds/fgd.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .base import BaseThresholder 4 | from .thresh_utility import cut, gen_kde 5 | 6 | 7 | class FGD(BaseThresholder): 8 | """FGD class for Fixed Gradient Descent thresholder. 9 | 10 | Use the fixed gradient descent to evaluate a non-parametric means 11 | to threshold scores generated by the decision_scores where outliers 12 | are set to any value beyond where the first derivative of the kde 13 | with respect to the decision scores passes the mean of the first 14 | and second inflection points. See :cite:`qi2021fgd` for details. 15 | 16 | Parameters 17 | ---------- 18 | 19 | random_state : int, optional (default=1234) 20 | Random seed for the random number generators of the thresholders. Can also 21 | be set to None. 22 | 23 | Attributes 24 | ---------- 25 | 26 | thresh_ : threshold value that separates inliers from outliers 27 | 28 | dscores_ : 1D array of decomposed decision scores 29 | 30 | Notes 31 | ----- 32 | 33 | A probability distribution of the decision scores is generated using 34 | kernel density estimation. The first derivative of the pdf is 35 | calculated, and the threshold is set as the middle point between the 36 | first and second inflection points starting from the left side of the 37 | data range. 38 | """ 39 | 40 | def __init__(self, random_state=1234): 41 | 42 | super().__init__() 43 | self.random_state = random_state 44 | np.random.seed(random_state) 45 | 46 | def eval(self, decision): 47 | """Outlier/inlier evaluation process for decision scores. 48 | 49 | Parameters 50 | ---------- 51 | decision : np.array or list of shape (n_samples) 52 | or np.array of shape (n_samples, n_detectors) 53 | which are the decision scores from a 54 | outlier detection. 55 | 56 | Returns 57 | ------- 58 | outlier_labels : numpy array of shape (n_samples,) 59 | For each observation, tells whether or not 60 | it should be considered as an outlier according to the 61 | fitted model. 0 stands for inliers and 1 for outliers. 
62 | """ 63 | 64 | decision = self._data_setup(decision) 65 | 66 | # Generate KDE 67 | val, dat_range = gen_kde(decision, 0, 1, len(decision)*3) 68 | 69 | # Calculate the first derivative of the KDE with respect 70 | # to the data range 71 | deriv = np.gradient(val, dat_range[1]-dat_range[0]) 72 | 73 | count = 0 74 | ind = [] 75 | 76 | # Find the first two inflection points 77 | for i in range(len(deriv)-1): 78 | 79 | if (deriv[i] > 0) & (deriv[i+1] <= 0): 80 | count += 1 81 | ind.append(i) 82 | if count == 2: 83 | break 84 | 85 | eps = np.finfo(decision.dtype).eps 86 | 87 | limit = ((dat_range[ind[0]]+dat_range[ind[1]])/2 if 88 | len(ind) > 1 else 1.0 + eps) 89 | 90 | self.thresh_ = limit 91 | 92 | return cut(decision, limit) 93 | -------------------------------------------------------------------------------- /pythresh/thresholds/thresh_utility.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as stats 3 | from scipy.interpolate import interp1d 4 | from scipy.special import ndtr 5 | from sklearn.decomposition import TruncatedSVD 6 | from sklearn.utils import check_array 7 | 8 | 9 | def get_min_max(data): 10 | 11 | min_val = np.min(data, axis=0) 12 | max_val = np.max(data, axis=0) 13 | 14 | return min_val, max_val 15 | 16 | 17 | def normalize(data, min_val=None, max_val=None): 18 | 19 | if min_val is None or max_val is None: 20 | min_val, max_val = get_min_max(data) 21 | 22 | normed = (data - min_val) / (max_val - min_val) 23 | 24 | return normed 25 | 26 | 27 | def cut(decision, limit): 28 | 29 | labels = np.zeros(len(decision), dtype=int) 30 | 31 | labels[decision >= limit] = 1 32 | 33 | return labels 34 | 35 | 36 | def gen_interp(x, y): 37 | 38 | interpolator = interp1d(x, y, kind='cubic', 39 | fill_value='extrapolate') 40 | 41 | return interpolator 42 | 43 | 44 | def gen_kde(data, lower, upper, size): 45 | 46 | insize = min(size, 5000) 47 | 48 | # Create a KDE of the data 49 | kde = stats.gaussian_kde(data) 50 | dat_range = np.linspace(lower, upper, insize) 51 | dat_eval = np.linspace(lower, upper, size) 52 | 53 | # Use interpolation for fast KDE upsampling 54 | if size > insize: 55 | interpolator = gen_interp(dat_range, kde(dat_range)) 56 | return interpolator(dat_eval), dat_eval 57 | 58 | return kde(dat_eval), dat_eval 59 | 60 | 61 | def gen_cdf(data, lower, upper, size): 62 | 63 | insize = min(size, 5000) 64 | 65 | # Create a KDE & CDF of the data 66 | kde = stats.gaussian_kde(data) 67 | dat_range = np.linspace(lower, upper, insize) 68 | dat_eval = np.linspace(lower, upper, size) 69 | 70 | cdf = np.array(tuple(ndtr(np.ravel(item - kde.dataset) / kde.factor).mean() 71 | for item in dat_range)) 72 | 73 | # Use interpolation for fast CDF upsampling 74 | if size > insize: 75 | interpolator = gen_interp(dat_range, cdf) 76 | return interpolator(dat_eval), dat_eval 77 | 78 | return cdf, dat_eval 79 | 80 | 81 | def check_scores(decision, decomp=None, min_val=None, max_val=None, random_state=1234): 82 | 83 | # Check decision scores dimensionality and pre-process 84 | if (np.asarray(decision).ndim == 2) & (np.atleast_2d(decision).shape[1] > 1): 85 | 86 | decision = check_array(decision, ensure_2d=True) 87 | decision = normalize(decision, min_val, max_val) 88 | decision, decomp = decompose(decision, decomp, random_state) 89 | 90 | else: 91 | decision = check_array(decision, ensure_2d=False) 92 | 93 | return decision.squeeze(), decomp 94 | 95 | 96 | def decompose(data, decomp=None, random_state=1234): 97 | 98 | # 
Decompose decision scores to 1D array for thresholding
99 |     if decomp is None:
100 |         decomp = TruncatedSVD(n_components=1, random_state=random_state)
101 |         data = decomp.fit_transform(data)
102 |     else:
103 |         data = decomp.transform(data)
104 | 
105 |     return data, decomp
106 | 
--------------------------------------------------------------------------------
/pythresh/thresholds/cpd.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import ruptures as rpt
3 | 
4 | from .base import BaseThresholder
5 | from .thresh_utility import cut, gen_cdf, gen_kde
6 | 
7 | 
8 | class CPD(BaseThresholder):
9 |     r"""CPD class for Change Point Detection thresholder.
10 | 
11 |     Use change point detection to find a non-parametric means
12 |     to threshold scores generated by the decision_scores where outliers
13 |     are set to any value beyond the detected change point.
14 |     See :cite:`fearnhead2016cpd` for details.
15 | 
16 |     Parameters
17 |     ----------
18 | 
19 |     method : {'Dynp', 'KernelCPD', 'Binseg', 'BottomUp'}, optional (default='Dynp')
20 |         Method for change point detection
21 | 
22 |         - 'Dynp': Dynamic programming (optimal minimum sum of errors per partition)
23 |         - 'KernelCPD': RBF kernel function (optimal minimum sum of errors per partition)
24 |         - 'Binseg': Binary segmentation
25 |         - 'BottomUp': Bottom-up segmentation
26 | 
27 |     transform : {'cdf', 'kde'}, optional (default='cdf')
28 |         Data transformation method prior to fit
29 | 
30 |         - 'cdf': Use the cumulative distribution function
31 |         - 'kde': Use the kernel density estimation
32 | 
33 |     random_state : int, optional (default=1234)
34 |         Random seed for the random number generators of the thresholders. Can also
35 |         be set to None.
36 | 
37 |     Attributes
38 |     ----------
39 | 
40 |     thresh_ : threshold value that separates inliers from outliers
41 | 
42 |     dscores_ : 1D array of decomposed decision scores
43 | 
44 |     """
45 | 
46 |     def __init__(self, method='Dynp', transform='cdf', random_state=1234):
47 | 
48 |         super().__init__()
49 |         self.method = method
50 |         self.transform = transform
51 |         self.method_func = {'Dynp': rpt.Dynp(), 'KernelCPD': rpt.KernelCPD(kernel='rbf'),
52 |                             'Binseg': rpt.Binseg(), 'BottomUp': rpt.BottomUp()}
53 |         self.random_state = random_state
54 |         np.random.seed(random_state)
55 | 
56 |     def eval(self, decision):
57 |         """Outlier/inlier evaluation process for decision scores.
58 | 
59 |         Parameters
60 |         ----------
61 |         decision : np.array or list of shape (n_samples)
62 |                    or np.array of shape (n_samples, n_detectors)
63 |                    which are the decision scores from an
64 |                    outlier detection.
65 | 
66 |         Returns
67 |         -------
68 |         outlier_labels : numpy array of shape (n_samples,)
69 |             For each observation, tells whether or not
70 |             it should be considered as an outlier according to the
71 |             fitted model. 0 stands for inliers and 1 for outliers.
72 |         """
73 | 
74 |         decision = self._data_setup(decision)
75 | 
76 |         # Transform data prior to fit
77 |         if self.transform == 'cdf':
78 |             val_data, data_range = gen_cdf(decision, 0, 1, len(decision)*3)
79 |         else:
80 |             val_data, data_range = gen_kde(decision, 0, 1, len(decision)*3)
81 | 
82 |         # Change point detection
83 |         det = self.method_func[self.method].fit(val_data)
84 |         change = det.predict(n_bkps=1)
85 | 
86 |         # Set limit at change point
87 |         limit = data_range[change[0]]
88 |         self.thresh_ = limit
89 | 
90 |         return cut(decision, limit)
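91 | 
92 | 
93 | # A minimal usage sketch (illustrative only; the synthetic scores are an
94 | # assumption for demonstration, not library test data, and ruptures must
95 | # be installed). Run with:
96 | #   python -m pythresh.thresholds.cpd
97 | if __name__ == '__main__':  # pragma: no cover
98 |     np.random.seed(1234)
99 |     scores = np.hstack([np.random.normal(0.0, 1.0, 95),
100 |                         np.random.normal(8.0, 0.5, 5)])
101 |     print(CPD(method='Binseg').eval(scores))
102 | 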
72 | """ 73 | 74 | decision = self._data_setup(decision) 75 | 76 | # Transform data prior to fit 77 | if self.transform == 'cdf': 78 | val_data, data_range = gen_cdf(decision, 0, 1, len(decision)*3) 79 | else: 80 | val_data, data_range = gen_kde(decision, 0, 1, len(decision)*3) 81 | 82 | # Change point detection 83 | det = self.method_func[self.method].fit(val_data) 84 | change = det.predict(n_bkps=1) 85 | 86 | # Set limit at change point 87 | limit = data_range[change[0]] 88 | self.thresh_ = limit 89 | 90 | return cut(decision, limit) 91 | -------------------------------------------------------------------------------- /pythresh/thresholds/mtt.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as stats 3 | 4 | from .base import BaseThresholder 5 | from .thresh_utility import cut 6 | 7 | # https://github.com/vvaezian/modified_thompson_tau_test/blob/main/src/Modified_Thompson_Tau_Test/modified_thompson_tau_test.py 8 | 9 | 10 | class MTT(BaseThresholder): 11 | r"""MTT class for Modified Thompson Tau test thresholder. 12 | 13 | Use the modified Thompson Tau test to evaluate a non-parametric means 14 | to threshold scores generated by the decision_scores where outliers 15 | are set to any value beyond the smallest outlier detected by the test. 16 | See :cite:`rengasamy2020mtt` for details. 17 | 18 | Parameters 19 | ---------- 20 | 21 | alpha : float, optional (default=0.01) 22 | Confidence level corresponding to the t-Student distribution map to sample 23 | 24 | random_state : int, optional (default=1234) 25 | Random seed for the random number generators of the thresholders. Can also 26 | be set to None. 27 | 28 | Attributes 29 | ---------- 30 | 31 | thresh_ : threshold value that separates inliers from outliers 32 | 33 | dscores_ : 1D array of decomposed decision scores 34 | 35 | Notes 36 | ----- 37 | 38 | The Modified Thompson Tau test is a modified univariate t-test that eliminates outliers 39 | that are more than a number of standard deviations away from the mean. This method is 40 | done iteratively with the Tau critical value being recalculated after each outlier removal 41 | until the dataset no longer has data points that fall outside of the criterion. The Tau 42 | critical value can be obtained by, 43 | 44 | .. math:: 45 | 46 | \tau = \frac{t \cdot (n-1)}{\sqrt{n}\sqrt{n-2+t^2}} \mathrm{,} 47 | 48 | where :math:`n` is the number of data points and :math:`t` is the student t-value 49 | 50 | """ 51 | 52 | def __init__(self, alpha=0.01, random_state=1234): 53 | 54 | super().__init__() 55 | self.alpha = alpha if alpha <= 0.5 else 1 - alpha 56 | self.random_state = random_state 57 | np.random.seed(random_state) 58 | 59 | def eval(self, decision): 60 | """Outlier/inlier evaluation process for decision scores. 61 | 62 | Parameters 63 | ---------- 64 | decision : np.array or list of shape (n_samples) 65 | or np.array of shape (n_samples, n_detectors) 66 | which are the decision scores from a 67 | outlier detection. 68 | 69 | Returns 70 | ------- 71 | outlier_labels : numpy array of shape (n_samples,) 72 | For each observation, tells whether or not 73 | it should be considered as an outlier according to the 74 | fitted model. 0 stands for inliers and 1 for outliers. 
75 | """ 76 | 77 | decision = self._data_setup(decision) 78 | 79 | arr = np.sort(decision.copy()) 80 | 81 | eps = np.finfo(decision.dtype).eps 82 | limit = 1.0 + eps 83 | 84 | while True: 85 | 86 | # Calculate the rejection threshold 87 | n = len(arr) 88 | t = stats.t.ppf(1-self.alpha, df=n-2) 89 | thres = (t * (n - 1))/(np.sqrt(n) * np.sqrt(n - 2 + t**2)) 90 | delta = np.abs(arr[-1] - arr.mean())/arr.std() 91 | 92 | if delta > thres: 93 | limit = arr[-1] 94 | arr = np.delete(arr, n-1) 95 | 96 | else: 97 | break 98 | 99 | self.thresh_ = limit 100 | 101 | return cut(decision, limit) 102 | -------------------------------------------------------------------------------- /pythresh/test/test_eb.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.eb import EB 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestEB(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = EB() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /pythresh/test/test_yj.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.yj import YJ 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestYJ(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = YJ() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /pythresh/test/test_fgd.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.fgd import FGD 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestFGD(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = FGD() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /pythresh/test/test_iqr.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.iqr import IQR 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestIQR(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = IQR() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /pythresh/test/test_aucp.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.aucp import AUCP 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestAUCP(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = AUCP() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /pythresh/test/test_boot.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.boot import BOOT 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestBOOT(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = BOOT() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /pythresh/test/test_fwfm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.fwfm import FWFM 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestFWFM(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = FWFM() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /pythresh/test/test_mcst.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.mcst import MCST 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestMCST(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = MCST() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /pythresh/test/test_moll.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.moll import MOLL 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestMOLL(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = MOLL() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /pythresh/test/test_wind.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.wind import WIND 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestWIND(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = WIND() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /pythresh/thresholds/boot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as stats 3 | 4 | from .base import BaseThresholder 5 | from .thresh_utility import cut 6 | 7 | 8 | class BOOT(BaseThresholder): 9 | r"""BOOT class for Bootstrapping thresholder. 10 | 11 | Use a bootstrapping-based method as a non-parametric means 12 | to threshold the decision scores, where outliers 13 | are set to any value beyond the mean of the confidence interval bounds. 14 | See :cite:`martin2006boot` for details. 15 | 16 | Parameters 17 | ---------- 18 | random_state : int, optional (default=1234) 19 | Random seed for bootstrapping a confidence interval. Can also be set to None. 20 | 21 | Attributes 22 | ---------- 23 | 24 | thresh_ : threshold value that separates inliers from outliers 25 | 26 | dscores_ : 1D array of decomposed decision scores 27 | 28 | Notes 29 | ----- 30 | 31 | The two-sided bias-corrected and accelerated (BCa) bootstrap confidence interval 32 | is calculated with a confidence level of 0.95. The statistic used for 33 | the confidence interval is the standard deviation of the decision 34 | scores, with the statistic treating corresponding elements of the 35 | samples in the decision scores as paired. 36 | 37 | The returned upper and lower confidence intervals are used to threshold 38 | the decision scores. Outliers are set to any value above the mean of the 39 | upper and lower confidence intervals. 40 | 41 | Examples 42 | -------- 43 | Randomness can significantly affect the thresholder's output performance. 44 | Therefore, to alleviate the effects of randomness on the 45 | thresholder, a combined model can be used with different random_state values. 46 | E.g. 47 | 48 | .. 
code:: python 49 | 50 | # train the KNN detector 51 | from pyod.models.knn import KNN 52 | from pythresh.thresholds.comb import COMB 53 | from pythresh.thresholds.boot import BOOT 54 | 55 | clf = KNN() 56 | clf.fit(X_train) 57 | 58 | # get outlier scores 59 | decision_scores = clf.decision_scores_ # raw outlier scores 60 | 61 | # get outlier labels with combined model 62 | thres = COMB(thresholders=[BOOT(random_state=1234), 63 | BOOT(random_state=42), BOOT(random_state=9685), 64 | BOOT(random_state=111222)]) 65 | labels = thres.eval(decision_scores) 66 | 67 | """ 68 | 69 | def __init__(self, random_state=1234): 70 | 71 | super().__init__() 72 | self.random_state = random_state 73 | np.random.seed(random_state) 74 | 75 | def eval(self, decision): 76 | """Outlier/inlier evaluation process for decision scores. 77 | 78 | Parameters 79 | ---------- 80 | decision : np.array or list of shape (n_samples) 81 | or np.array of shape (n_samples, n_detectors) 82 | which are the decision scores from an 83 | outlier detector. 84 | 85 | Returns 86 | ------- 87 | outlier_labels : numpy array of shape (n_samples,) 88 | For each observation, tells whether or not 89 | it should be considered as an outlier according to the 90 | fitted model. 0 stands for inliers and 1 for outliers. 91 | """ 92 | 93 | decision = self._data_setup(decision) 94 | 95 | limit1, limit2 = stats.bootstrap( 96 | decision.reshape(1, -1), 97 | np.std, 98 | paired=True, 99 | random_state=self.random_state 100 | ).confidence_interval 101 | 102 | self.thresh_ = (limit1+limit2)/2 103 | 104 | return cut(decision, (limit1+limit2)/2) 105 | 
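For clarity, the interval-to-threshold logic described in BOOT's Notes can also be sketched outside the class. The snippet below is a minimal, self-contained illustration rather than part of the package; the synthetic scores stand in for a detector's decision scores:

.. code:: python

    import numpy as np
    import scipy.stats as stats

    rng = np.random.default_rng(1234)
    scores = rng.normal(size=200)  # stand-in for decision scores

    # two-sided BCa interval (default confidence level 0.95) of the
    # standard deviation of the scores, mirroring BOOT.eval above
    low, high = stats.bootstrap(scores.reshape(1, -1), np.std,
                                paired=True,
                                random_state=1234).confidence_interval

    # any score above the mean of the interval bounds is an outlier
    thresh = (low + high) / 2
    labels = (scores > thresh).astype(int)
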
-------------------------------------------------------------------------------- /pythresh/thresholds/regr.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as stats 3 | 4 | from .base import BaseThresholder 5 | from .thresh_utility import cut, normalize 6 | 7 | 8 | class REGR(BaseThresholder): 9 | """REGR class for Regression based thresholder. 10 | 11 | Use robust regression as a non-parametric means 12 | to threshold the decision scores, where outliers 13 | are set to any value beyond the y-intercept of the linear fit. 14 | See :cite:`aggarwal2017clf` for details. 15 | 16 | Parameters 17 | ---------- 18 | 19 | method : {'siegel', 'theil'}, optional (default='siegel') 20 | Regression-based method to calculate the y-intercept 21 | 22 | - 'siegel': implements a method for robust linear regression using repeated medians 23 | - 'theil': implements a method for robust linear regression using paired values 24 | 25 | random_state : int, optional (default=1234) 26 | Random seed for the normal distribution. Can also be set to None. 27 | 28 | Attributes 29 | ---------- 30 | 31 | thresh_ : threshold value that separates inliers from outliers 32 | 33 | Examples 34 | -------- 35 | Randomness can significantly affect the thresholder's output performance. 36 | Therefore, to alleviate the effects of randomness on the 37 | thresholder, a combined model can be used with different random_state values. 38 | E.g. 39 | 40 | .. code:: python 41 | 42 | # train the KNN detector 43 | from pyod.models.knn import KNN 44 | from pythresh.thresholds.comb import COMB 45 | from pythresh.thresholds.regr import REGR 46 | 47 | clf = KNN() 48 | clf.fit(X_train) 49 | 50 | # get outlier scores 51 | decision_scores = clf.decision_scores_ # raw outlier scores 52 | 53 | # get outlier labels with combined model 54 | thres = COMB(thresholders=[REGR(random_state=1234), 55 | REGR(random_state=42), REGR(random_state=9685), 56 | REGR(random_state=111222)]) 57 | labels = thres.eval(decision_scores) 58 | """ 59 | 60 | def __init__(self, method='siegel', random_state=1234): 61 | 62 | super().__init__() 63 | self.method = method 64 | self.random_state = random_state 65 | np.random.seed(random_state) 66 | 67 | def eval(self, decision): 68 | """Outlier/inlier evaluation process for decision scores. 69 | 70 | Parameters 71 | ---------- 72 | decision : np.array or list of shape (n_samples) 73 | or np.array of shape (n_samples, n_detectors) 74 | which are the decision scores from an 75 | outlier detector. 76 | 77 | Returns 78 | ------- 79 | outlier_labels : numpy array of shape (n_samples,) 80 | For each observation, tells whether or not 81 | it should be considered as an outlier according to the 82 | fitted model. 0 stands for inliers and 1 for outliers. 83 | """ 84 | decision = self._data_setup(decision) 85 | 86 | # Create a normal distribution and normalize 87 | norm = np.random.default_rng(self.random_state).normal( 88 | loc=0.0, scale=1.0, size=decision.shape) 89 | norm = normalize(norm) 90 | 91 | # Set limit to the y-intercept 92 | try: 93 | if self.method == 'siegel': 94 | res = stats.siegelslopes(norm, decision) 95 | elif self.method == 'theil': 96 | res = stats.theilslopes(norm, decision) 97 | except MemoryError: 98 | res = [0.0, 1.0] 99 | 100 | limit = res[1] 101 | 102 | self.thresh_ = limit 103 | 104 | return cut(decision, limit) 105 | 
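The y-intercept cutoff in REGR is easier to see in isolation: a normalized normal sample is regressed onto the (normalized) decision scores, and the intercept of the robust fit becomes the threshold. The following is a minimal sketch, with synthetic scores and min-max normalization standing in for `thresh_utility.normalize`:

.. code:: python

    import numpy as np
    import scipy.stats as stats

    rng = np.random.default_rng(1234)
    scores = rng.random(200)  # stand-in for normalized decision scores

    # normalized normal sample used as the regression target
    norm = rng.normal(loc=0.0, scale=1.0, size=scores.shape)
    norm = (norm - norm.min()) / (norm.max() - norm.min())

    # robust linear fit via repeated medians (the 'siegel' method)
    slope, intercept = stats.siegelslopes(norm, scores)

    # any score beyond the y-intercept is an outlier
    labels = (scores > intercept).astype(int)
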
-------------------------------------------------------------------------------- /pythresh/thresholds/yj.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as stats 3 | 4 | from .base import BaseThresholder 5 | from .thresh_utility import cut, gen_kde 6 | 7 | 8 | class YJ(BaseThresholder): 9 | r"""YJ class for Yeo-Johnson transformation thresholder. 10 | 11 | Use the Yeo-Johnson transformation as 12 | a non-parametric means to threshold the 13 | decision scores, where outliers are set to any value beyond the 14 | max value in the YJ-transformed data. 15 | See :cite:`raymaekers2021yj` for details. 16 | 17 | Parameters 18 | ---------- 19 | 20 | random_state : int, optional (default=1234) 21 | Random seed for the random number generators of the thresholder. Can also 22 | be set to None. 23 | 24 | Attributes 25 | ---------- 26 | 27 | thresh_ : threshold value that separates inliers from outliers 28 | 29 | dscores_ : 1D array of decomposed decision scores 30 | 31 | Notes 32 | ----- 33 | 34 | The Yeo-Johnson transformation is a power transform which is a 35 | set of power functions that apply a monotonic transformation to 36 | the dataset. For the decision scores this makes their distribution 37 | more normal-like. The transformation is given by: 38 | 39 | .. math:: 40 | 41 | \psi_{(y, \lambda)} = \begin{cases} 42 | \left((y+1)^\lambda-1\right)/\lambda & \text{if } \lambda \neq 0 \text{, } y \geq 0 \\ 43 | \text{log}(y+1) & \text{if } \lambda = 0 \text{, } y \geq 0 \\ 44 | -\left((-y+1)^{(2-\lambda)}-1\right)/{(2-\lambda)} & \text{if } \lambda \neq 2 \text{, } y < 0 \\ 45 | -\text{log}(-y+1) & \text{if } \lambda = 2 \text{, } y < 0 46 | \end{cases} \mathrm{,} 47 | 48 | 49 | where :math:`\lambda` is a power parameter that is chosen via maximum 50 | likelihood estimation. Therefore, any values from the original decision 51 | scores that are beyond the maximum value after this transformation are 52 | considered outliers. However, the closer the original decision scores are 53 | to a normal distribution, the smaller the probability that this 54 | thresholder will be able to identify outliers. 55 | 56 | """ 57 | 58 | def __init__(self, random_state=1234): 59 | 60 | super().__init__() 61 | self.random_state = random_state 62 | np.random.seed(random_state) 63 | 64 | def eval(self, decision): 65 | """Outlier/inlier evaluation process for decision scores. 66 | 67 | Parameters 68 | ---------- 69 | decision : np.array or list of shape (n_samples) 70 | or np.array of shape (n_samples, n_detectors) 71 | which are the decision scores from an 72 | outlier detector. 73 | 74 | Returns 75 | ------- 76 | outlier_labels : numpy array of shape (n_samples,) 77 | For each observation, tells whether or not 78 | it should be considered as an outlier according to the 79 | fitted model. 0 stands for inliers and 1 for outliers. 80 | """ 81 | 82 | decision = self._data_setup(decision) 83 | 84 | # Generate KDE 85 | val, _ = gen_kde(decision, 0, 1, len(decision)*3) 86 | 87 | # Use Yeo-Johnson transformation to reshape the distribution; 88 | # iterate to get an average transformation 89 | mean_s = np.zeros(len(val)) 90 | for _ in range(50): 91 | scores = stats.yeojohnson(val)[0] 92 | mean_s += scores 93 | mean_s = mean_s/50 94 | 95 | # Set limit to the max value from the transformation 96 | limit = np.max(mean_s) 97 | 98 | self.thresh_ = limit 99 | 100 | return cut(decision, limit) 101 | 
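The piecewise transform in YJ's math block maps directly to code. The helper below is an illustrative sketch, not part of the package: it reimplements :math:`\psi` and checks it against `scipy.stats.yeojohnson`, which also estimates :math:`\lambda` by maximum likelihood when no lambda is given:

.. code:: python

    import numpy as np
    import scipy.stats as stats

    def psi(y, lmbda):
        # piecewise Yeo-Johnson transform from the docstring above
        y = np.asarray(y, dtype=float)
        out = np.empty_like(y)
        pos = y >= 0
        if lmbda != 0:
            out[pos] = ((y[pos] + 1) ** lmbda - 1) / lmbda
        else:
            out[pos] = np.log1p(y[pos])
        if lmbda != 2:
            out[~pos] = -((1 - y[~pos]) ** (2 - lmbda) - 1) / (2 - lmbda)
        else:
            out[~pos] = -np.log1p(-y[~pos])
        return out

    scores = np.array([-1.5, -0.2, 0.0, 0.3, 2.7])
    transformed, lmbda = stats.yeojohnson(scores)
    assert np.allclose(psi(scores, lmbda), transformed)
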
-------------------------------------------------------------------------------- /pythresh/test/test_chau.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | import joblib 8 | import numpy as np 9 | from numpy.testing import assert_equal 10 | from pyod.models.iforest import IForest 11 | from pyod.models.knn import KNN 12 | from pyod.models.pca import PCA 13 | from pyod.utils.data import generate_data 14 | 15 | from pythresh.thresholds.chau import CHAU 16 | 17 | # temporary solution for relative imports in case pythresh is not installed 18 | # if pythresh is installed, no need to use the following line 19 | 20 | path = up(up(up(__file__))) 21 | sys.path.append(path) 22 | 23 | 24 | class TestCHAU(unittest.TestCase): 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.n_train = 200 29 | cls.n_test = 100 30 | cls.contamination = 0.1 31 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 32 | n_train=cls.n_train, n_test=cls.n_test, 33 | contamination=cls.contamination, random_state=42) 34 | 35 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 36 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 37 | cls.multiple_scores = np.vstack([ 38 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 39 | ]).T 40 | cls.all_scores = [cls.single_score, cls.multiple_scores] 41 | 42 | cls.methods = ['mean', 'median', 'gmean'] 43 | 44 | cls.params = list(product(cls.all_scores, cls.methods)) 45 | 46 | def setUp(self): 47 | self.thres = CHAU() 48 | 49 | def check_labels(self, labels, scores_shape): 50 | self.assertEqual(labels.shape, scores_shape[:1]) 51 | self.assertIn(labels.min(), [0, 1]) 52 | self.assertIn(labels.max(), [0, 1]) 53 | 54 | def check_fitted_attributes(self, thres): 55 | self.assertTrue(thres.__sklearn_is_fitted__()) 56 | self.assertIsNotNone(thres.labels_) 57 | self.assertIsNotNone(thres.thresh_) 58 | 59 | def test_eval(self): 60 | for scores, method in self.params: 61 | thres = CHAU(method=method) 62 | pred_labels = thres.eval(scores) 63 | 64 | self.assertIsNotNone(thres.thresh_) 65 | self.assertIsNotNone(thres.dscores_) 66 | self.assertGreaterEqual(thres.dscores_.min(), 0) 67 | self.assertLessEqual(thres.dscores_.max(), 1) 68 | self.check_labels(pred_labels, scores.shape) 69 | 70 | def test_fit(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | self.check_fitted_attributes(self.thres) 74 | self.check_labels(self.thres.labels_, scores.shape) 75 | 76 | def test_predict(self): 77 | for scores in self.all_scores: 78 | self.thres.fit(scores) 79 | pred_labels = self.thres.predict(scores) 80 | self.check_fitted_attributes(self.thres) 81 | self.check_labels(pred_labels, scores.shape) 82 | assert_equal(self.thres.labels_, pred_labels) 83 | 84 | def test_test_data(self): 85 | for scores, test_scores in zip(self.all_scores, [ 86 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 87 | np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 88 | for clf in self.clfs]).T 89 | ]): 90 | self.thres.fit(scores) 91 | pred_labels = self.thres.predict(test_scores) 92 | self.check_fitted_attributes(self.thres) 93 | self.check_labels(pred_labels, test_scores.shape) 94 | 95 | def test_save_and_load(self): 96 | for scores in self.all_scores: 97 | self.thres.fit(scores) 98 | joblib.dump(self.thres, 'model.pkl') 99 | loaded_thres = joblib.load('model.pkl') 100 | 101 | assert_equal(self.thres.predict(scores), 102 | loaded_thres.predict(scores)) 103 | -------------------------------------------------------------------------------- /pythresh/test/test_mad.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | import joblib 8 | import numpy as np 9 | from numpy.testing import assert_equal 10 | from pyod.models.iforest import IForest 11 | from pyod.models.knn import KNN 12 | from pyod.models.pca import PCA 13 | from pyod.utils.data import generate_data 14 | 15 | from pythresh.thresholds.mad import MAD 16 | 17 | # temporary solution for relative imports in case pythresh is not installed 18 | # if pythresh is installed, no need to use the following line 19 | 20 | path = up(up(up(__file__))) 21 | sys.path.append(path) 22 | 23 | 24 | class TestMAD(unittest.TestCase): 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.n_train = 200 29 | cls.n_test = 100 30 | cls.contamination = 0.1 31 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 32 | n_train=cls.n_train, n_test=cls.n_test, 33 | contamination=cls.contamination, random_state=42) 34 | 35 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 36 | cls.single_score = 
cls.clfs[0].fit(cls.X_train).decision_scores_ 37 | cls.multiple_scores = np.vstack([ 38 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 39 | ]).T 40 | cls.all_scores = [cls.single_score, cls.multiple_scores] 41 | 42 | cls.factors = [0.5, 1, 2] 43 | 44 | cls.params = list(product(cls.all_scores, cls.factors)) 45 | 46 | def setUp(self): 47 | self.thres = MAD() 48 | 49 | def check_labels(self, labels, scores_shape): 50 | self.assertEqual(labels.shape, scores_shape[:1]) 51 | self.assertIn(labels.min(), [0, 1]) 52 | self.assertIn(labels.max(), [0, 1]) 53 | 54 | def check_fitted_attributes(self, thres): 55 | self.assertTrue(thres.__sklearn_is_fitted__()) 56 | self.assertIsNotNone(thres.labels_) 57 | self.assertIsNotNone(thres.thresh_) 58 | 59 | def test_eval(self): 60 | for scores, factor in self.params: 61 | thres = MAD(factor=factor) 62 | pred_labels = thres.eval(scores) 63 | 64 | self.assertIsNotNone(thres.thresh_) 65 | self.assertIsNotNone(thres.dscores_) 66 | self.assertGreaterEqual(thres.dscores_.min(), 0) 67 | self.assertLessEqual(thres.dscores_.max(), 1) 68 | self.check_labels(pred_labels, scores.shape) 69 | 70 | def test_fit(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | self.check_fitted_attributes(self.thres) 74 | self.check_labels(self.thres.labels_, scores.shape) 75 | 76 | def test_predict(self): 77 | for scores in self.all_scores: 78 | self.thres.fit(scores) 79 | pred_labels = self.thres.predict(scores) 80 | self.check_fitted_attributes(self.thres) 81 | self.check_labels(pred_labels, scores.shape) 82 | assert_equal(self.thres.labels_, pred_labels) 83 | 84 | def test_test_data(self): 85 | for scores, test_scores in zip(self.all_scores, [ 86 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 87 | np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 88 | for clf in self.clfs]).T 89 | ]): 90 | self.thres.fit(scores) 91 | pred_labels = self.thres.predict(test_scores) 92 | self.check_fitted_attributes(self.thres) 93 | self.check_labels(pred_labels, test_scores.shape) 94 | 95 | def test_save_and_load(self): 96 | for scores in self.all_scores: 97 | self.thres.fit(scores) 98 | joblib.dump(self.thres, 'model.pkl') 99 | loaded_thres = joblib.load('model.pkl') 100 | 101 | assert_equal(self.thres.predict(scores), 102 | loaded_thres.predict(scores)) 103 | -------------------------------------------------------------------------------- /pythresh/test/test_regr.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | import joblib 8 | import numpy as np 9 | from numpy.testing import assert_equal 10 | from pyod.models.iforest import IForest 11 | from pyod.models.knn import KNN 12 | from pyod.models.pca import PCA 13 | from pyod.utils.data import generate_data 14 | 15 | from pythresh.thresholds.regr import REGR 16 | 17 | # temporary solution for relative imports in case pythresh is not installed 18 | # if pythresh is installed, no need to use the following line 19 | 20 | path = up(up(up(__file__))) 21 | sys.path.append(path) 22 | 23 | 24 | class TestREGR(unittest.TestCase): 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.n_train = 200 29 | cls.n_test = 100 30 | cls.contamination = 0.1 31 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 32 | n_train=cls.n_train, n_test=cls.n_test, 33 | contamination=cls.contamination, 
random_state=42) 34 | 35 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 36 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 37 | cls.multiple_scores = np.vstack([ 38 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 39 | ]).T 40 | cls.all_scores = [cls.single_score, cls.multiple_scores] 41 | 42 | cls.methods = ['siegel', 'theil'] 43 | 44 | cls.params = list(product(cls.all_scores, cls.methods)) 45 | 46 | def setUp(self): 47 | self.thres = REGR() 48 | 49 | def check_labels(self, labels, scores_shape): 50 | self.assertEqual(labels.shape, scores_shape[:1]) 51 | self.assertIn(labels.min(), [0, 1]) 52 | self.assertIn(labels.max(), [0, 1]) 53 | 54 | def check_fitted_attributes(self, thres): 55 | self.assertTrue(thres.__sklearn_is_fitted__()) 56 | self.assertIsNotNone(thres.labels_) 57 | self.assertIsNotNone(thres.thresh_) 58 | 59 | def test_eval(self): 60 | for scores, method in self.params: 61 | thres = REGR(method=method) 62 | pred_labels = thres.eval(scores) 63 | 64 | self.assertIsNotNone(thres.thresh_) 65 | self.assertIsNotNone(thres.dscores_) 66 | self.assertGreaterEqual(thres.dscores_.min(), 0) 67 | self.assertLessEqual(thres.dscores_.max(), 1) 68 | self.check_labels(pred_labels, scores.shape) 69 | 70 | def test_fit(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | self.check_fitted_attributes(self.thres) 74 | self.check_labels(self.thres.labels_, scores.shape) 75 | 76 | def test_predict(self): 77 | for scores in self.all_scores: 78 | self.thres.fit(scores) 79 | pred_labels = self.thres.predict(scores) 80 | self.check_fitted_attributes(self.thres) 81 | self.check_labels(pred_labels, scores.shape) 82 | assert_equal(self.thres.labels_, pred_labels) 83 | 84 | def test_test_data(self): 85 | for scores, test_scores in zip(self.all_scores, [ 86 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 87 | np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 88 | for clf in self.clfs]).T 89 | ]): 90 | self.thres.fit(scores) 91 | pred_labels = self.thres.predict(test_scores) 92 | self.check_fitted_attributes(self.thres) 93 | self.check_labels(pred_labels, test_scores.shape) 94 | 95 | def test_save_and_load(self): 96 | for scores in self.all_scores: 97 | self.thres.fit(scores) 98 | joblib.dump(self.thres, 'model.pkl') 99 | loaded_thres = joblib.load('model.pkl') 100 | 101 | assert_equal(self.thres.predict(scores), 102 | loaded_thres.predict(scores)) 103 | -------------------------------------------------------------------------------- /pythresh/test/test_mtt.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | import joblib 8 | import numpy as np 9 | from numpy.testing import assert_equal 10 | from pyod.models.iforest import IForest 11 | from pyod.models.knn import KNN 12 | from pyod.models.pca import PCA 13 | from pyod.utils.data import generate_data 14 | 15 | from pythresh.thresholds.mtt import MTT 16 | 17 | # temporary solution for relative imports in case pythresh is not installed 18 | # if pythresh is installed, no need to use the following line 19 | 20 | path = up(up(up(__file__))) 21 | sys.path.append(path) 22 | 23 | 24 | class TestMTT(unittest.TestCase): 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.n_train = 200 29 | cls.n_test = 100 30 | cls.contamination = 0.1 31 | cls.X_train, cls.X_test, 
cls.y_train, cls.y_test = generate_data( 32 | n_train=cls.n_train, n_test=cls.n_test, 33 | contamination=cls.contamination, random_state=42) 34 | 35 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 36 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 37 | cls.multiple_scores = np.vstack([ 38 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 39 | ]).T 40 | cls.all_scores = [cls.single_score, cls.multiple_scores] 41 | 42 | cls.alphas = [0.1, 0.05, 0.025, 0.01, 0.005] 43 | 44 | cls.params = list(product(cls.all_scores, cls.alphas)) 45 | 46 | def setUp(self): 47 | self.thres = MTT() 48 | 49 | def check_labels(self, labels, scores_shape): 50 | self.assertEqual(labels.shape, scores_shape[:1]) 51 | self.assertIn(labels.min(), [0, 1]) 52 | self.assertIn(labels.max(), [0, 1]) 53 | 54 | def check_fitted_attributes(self, thres): 55 | self.assertTrue(thres.__sklearn_is_fitted__()) 56 | self.assertIsNotNone(thres.labels_) 57 | self.assertIsNotNone(thres.thresh_) 58 | 59 | def test_eval(self): 60 | for scores, alpha in self.params: 61 | thres = MTT(alpha=alpha) 62 | pred_labels = thres.eval(scores) 63 | 64 | self.assertIsNotNone(thres.thresh_) 65 | self.assertIsNotNone(thres.dscores_) 66 | self.assertGreaterEqual(thres.dscores_.min(), 0) 67 | self.assertLessEqual(thres.dscores_.max(), 1) 68 | self.check_labels(pred_labels, scores.shape) 69 | 70 | def test_fit(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | self.check_fitted_attributes(self.thres) 74 | self.check_labels(self.thres.labels_, scores.shape) 75 | 76 | def test_predict(self): 77 | for scores in self.all_scores: 78 | self.thres.fit(scores) 79 | pred_labels = self.thres.predict(scores) 80 | self.check_fitted_attributes(self.thres) 81 | self.check_labels(pred_labels, scores.shape) 82 | assert_equal(self.thres.labels_, pred_labels) 83 | 84 | def test_test_data(self): 85 | for scores, test_scores in zip(self.all_scores, [ 86 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 87 | np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 88 | for clf in self.clfs]).T 89 | ]): 90 | self.thres.fit(scores) 91 | pred_labels = self.thres.predict(test_scores) 92 | self.check_fitted_attributes(self.thres) 93 | self.check_labels(pred_labels, test_scores.shape) 94 | 95 | def test_save_and_load(self): 96 | for scores in self.all_scores: 97 | self.thres.fit(scores) 98 | joblib.dump(self.thres, 'model.pkl') 99 | loaded_thres = joblib.load('model.pkl') 100 | 101 | assert_equal(self.thres.predict(scores), 102 | loaded_thres.predict(scores)) 103 | -------------------------------------------------------------------------------- /pythresh/test/test_zscore.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | import joblib 8 | import numpy as np 9 | from numpy.testing import assert_equal 10 | from pyod.models.iforest import IForest 11 | from pyod.models.knn import KNN 12 | from pyod.models.pca import PCA 13 | from pyod.utils.data import generate_data 14 | 15 | from pythresh.thresholds.zscore import ZSCORE 16 | 17 | # temporary solution for relative imports in case pythresh is not installed 18 | # if pythresh is installed, no need to use the following line 19 | 20 | path = up(up(up(__file__))) 21 | sys.path.append(path) 22 | 23 | 24 | class TestZSCORE(unittest.TestCase): 25 | 26 | 
@classmethod 27 | def setUpClass(cls): 28 | cls.n_train = 200 29 | cls.n_test = 100 30 | cls.contamination = 0.1 31 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 32 | n_train=cls.n_train, n_test=cls.n_test, 33 | contamination=cls.contamination, random_state=42) 34 | 35 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 36 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 37 | cls.multiple_scores = np.vstack([ 38 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 39 | ]).T 40 | cls.all_scores = [cls.single_score, cls.multiple_scores] 41 | 42 | cls.factors = [0.5, 1, 2] 43 | 44 | cls.params = list(product(cls.all_scores, cls.factors)) 45 | 46 | def setUp(self): 47 | self.thres = ZSCORE() 48 | 49 | def check_labels(self, labels, scores_shape): 50 | self.assertEqual(labels.shape, scores_shape[:1]) 51 | self.assertIn(labels.min(), [0, 1]) 52 | self.assertIn(labels.max(), [0, 1]) 53 | 54 | def check_fitted_attributes(self, thres): 55 | self.assertTrue(thres.__sklearn_is_fitted__()) 56 | self.assertIsNotNone(thres.labels_) 57 | self.assertIsNotNone(thres.thresh_) 58 | 59 | def test_eval(self): 60 | for scores, factor in self.params: 61 | thres = ZSCORE(factor=factor) 62 | pred_labels = thres.eval(scores) 63 | 64 | self.assertIsNotNone(thres.thresh_) 65 | self.assertIsNotNone(thres.dscores_) 66 | self.assertGreaterEqual(thres.dscores_.min(), 0) 67 | self.assertLessEqual(thres.dscores_.max(), 1) 68 | self.check_labels(pred_labels, scores.shape) 69 | 70 | def test_fit(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | self.check_fitted_attributes(self.thres) 74 | self.check_labels(self.thres.labels_, scores.shape) 75 | 76 | def test_predict(self): 77 | for scores in self.all_scores: 78 | self.thres.fit(scores) 79 | pred_labels = self.thres.predict(scores) 80 | self.check_fitted_attributes(self.thres) 81 | self.check_labels(pred_labels, scores.shape) 82 | assert_equal(self.thres.labels_, pred_labels) 83 | 84 | def test_test_data(self): 85 | for scores, test_scores in zip(self.all_scores, [ 86 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 87 | np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 88 | for clf in self.clfs]).T 89 | ]): 90 | self.thres.fit(scores) 91 | pred_labels = self.thres.predict(test_scores) 92 | self.check_fitted_attributes(self.thres) 93 | self.check_labels(pred_labels, test_scores.shape) 94 | 95 | def test_save_and_load(self): 96 | for scores in self.all_scores: 97 | self.thres.fit(scores) 98 | joblib.dump(self.thres, 'model.pkl') 99 | loaded_thres = joblib.load('model.pkl') 100 | 101 | assert_equal(self.thres.predict(scores), 102 | loaded_thres.predict(scores)) 103 | -------------------------------------------------------------------------------- /pythresh/test/test_decomp.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | import joblib 8 | import numpy as np 9 | from numpy.testing import assert_equal 10 | from pyod.models.iforest import IForest 11 | from pyod.models.knn import KNN 12 | from pyod.models.pca import PCA 13 | from pyod.utils.data import generate_data 14 | 15 | from pythresh.thresholds.decomp import DECOMP 16 | 17 | # temporary solution for relative imports in case pythresh is not installed 18 | # if pythresh is installed, no need to use the following line 19 | 20 
| path = up(up(up(__file__))) 21 | sys.path.append(path) 22 | 23 | 24 | class TestDECOMP(unittest.TestCase): 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.n_train = 200 29 | cls.n_test = 100 30 | cls.contamination = 0.1 31 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 32 | n_train=cls.n_train, n_test=cls.n_test, 33 | contamination=cls.contamination, random_state=42) 34 | 35 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 36 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 37 | cls.multiple_scores = np.vstack([ 38 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 39 | ]).T 40 | cls.all_scores = [cls.single_score, cls.multiple_scores] 41 | 42 | cls.methods = ['NMF', 'PCA', 'GRP', 'SRP'] 43 | 44 | cls.params = list(product(cls.all_scores, cls.methods)) 45 | 46 | def setUp(self): 47 | self.thres = DECOMP() 48 | 49 | def check_labels(self, labels, scores_shape): 50 | self.assertEqual(labels.shape, scores_shape[:1]) 51 | self.assertIn(labels.min(), [0, 1]) 52 | self.assertIn(labels.max(), [0, 1]) 53 | 54 | def check_fitted_attributes(self, thres): 55 | self.assertTrue(thres.__sklearn_is_fitted__()) 56 | self.assertIsNotNone(thres.labels_) 57 | self.assertIsNotNone(thres.thresh_) 58 | 59 | def test_eval(self): 60 | for scores, method in self.params: 61 | thres = DECOMP(method=method) 62 | pred_labels = thres.eval(scores) 63 | 64 | self.assertIsNotNone(thres.thresh_) 65 | self.assertIsNotNone(thres.dscores_) 66 | self.assertGreaterEqual(thres.dscores_.min(), 0) 67 | self.assertLessEqual(thres.dscores_.max(), 1) 68 | self.check_labels(pred_labels, scores.shape) 69 | 70 | def test_fit(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | self.check_fitted_attributes(self.thres) 74 | self.check_labels(self.thres.labels_, scores.shape) 75 | 76 | def test_predict(self): 77 | for scores in self.all_scores: 78 | self.thres.fit(scores) 79 | pred_labels = self.thres.predict(scores) 80 | self.check_fitted_attributes(self.thres) 81 | self.check_labels(pred_labels, scores.shape) 82 | assert_equal(self.thres.labels_, pred_labels) 83 | 84 | def test_test_data(self): 85 | for scores, test_scores in zip(self.all_scores, [ 86 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 87 | np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 88 | for clf in self.clfs]).T 89 | ]): 90 | self.thres.fit(scores) 91 | pred_labels = self.thres.predict(test_scores) 92 | self.check_fitted_attributes(self.thres) 93 | self.check_labels(pred_labels, test_scores.shape) 94 | 95 | def test_save_and_load(self): 96 | for scores in self.all_scores: 97 | self.thres.fit(scores) 98 | joblib.dump(self.thres, 'model.pkl') 99 | loaded_thres = joblib.load('model.pkl') 100 | 101 | assert_equal(self.thres.predict(scores), 102 | loaded_thres.predict(scores)) 103 | -------------------------------------------------------------------------------- /pythresh/test/test_karch.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | import joblib 8 | import numpy as np 9 | from numpy.testing import assert_equal 10 | from pyod.models.iforest import IForest 11 | from pyod.models.knn import KNN 12 | from pyod.models.pca import PCA 13 | from pyod.utils.data import generate_data 14 | 15 | from pythresh.thresholds.karch import KARCH 16 | 17 | # temporary solution for 
relative imports in case pythresh is not installed 18 | # if pythresh is installed, no need to use the following line 19 | 20 | path = up(up(up(__file__))) 21 | sys.path.append(path) 22 | 23 | 24 | class TestKARCH(unittest.TestCase): 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.n_train = 200 29 | cls.n_test = 100 30 | cls.contamination = 0.1 31 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 32 | n_train=cls.n_train, n_test=cls.n_test, 33 | contamination=cls.contamination, random_state=42) 34 | 35 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 36 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 37 | cls.multiple_scores = np.vstack([ 38 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 39 | ]).T 40 | cls.all_scores = [cls.single_score, cls.multiple_scores] 41 | 42 | cls.methods = ['simple', 'complex'] 43 | cls.ndims = range(1, 10) 44 | 45 | cls.params = list(product(cls.all_scores, cls.methods, cls.ndims)) 46 | 47 | def setUp(self): 48 | self.thres = KARCH() 49 | 50 | def check_labels(self, labels, scores_shape): 51 | self.assertEqual(labels.shape, scores_shape[:1]) 52 | self.assertIn(labels.min(), [0, 1]) 53 | self.assertIn(labels.max(), [0, 1]) 54 | 55 | def check_fitted_attributes(self, thres): 56 | self.assertTrue(thres.__sklearn_is_fitted__()) 57 | self.assertIsNotNone(thres.labels_) 58 | self.assertIsNotNone(thres.thresh_) 59 | 60 | def test_eval(self): 61 | for scores, method, ndim in self.params: 62 | thres = KARCH(method=method, ndim=ndim) 63 | pred_labels = thres.eval(scores) 64 | 65 | self.assertIsNotNone(thres.thresh_) 66 | self.assertIsNotNone(thres.dscores_) 67 | self.assertGreaterEqual(thres.dscores_.min(), 0) 68 | self.assertLessEqual(thres.dscores_.max(), 1) 69 | self.check_labels(pred_labels, scores.shape) 70 | 71 | def test_fit(self): 72 | for scores in self.all_scores: 73 | self.thres.fit(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(self.thres.labels_, scores.shape) 76 | 77 | def test_predict(self): 78 | for scores in self.all_scores: 79 | self.thres.fit(scores) 80 | pred_labels = self.thres.predict(scores) 81 | self.check_fitted_attributes(self.thres) 82 | self.check_labels(pred_labels, scores.shape) 83 | assert_equal(self.thres.labels_, pred_labels) 84 | 85 | def test_test_data(self): 86 | for scores, test_scores in zip(self.all_scores, [ 87 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 88 | np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 89 | for clf in self.clfs]).T 90 | ]): 91 | self.thres.fit(scores) 92 | pred_labels = self.thres.predict(test_scores) 93 | self.check_fitted_attributes(self.thres) 94 | self.check_labels(pred_labels, test_scores.shape) 95 | 96 | def test_save_and_load(self): 97 | for scores in self.all_scores: 98 | self.thres.fit(scores) 99 | joblib.dump(self.thres, 'model.pkl') 100 | loaded_thres = joblib.load('model.pkl') 101 | 102 | assert_equal(self.thres.predict(scores), 103 | loaded_thres.predict(scores)) 104 | --------------------------------------------------------------------------------