├── pythresh ├── thresholds │ ├── __init__.py │ ├── zscore.py │ ├── mad.py │ ├── iqr.py │ ├── fwfm.py │ ├── fgd.py │ ├── thresh_utility.py │ ├── cpd.py │ ├── mtt.py │ ├── boot.py │ ├── regr.py │ └── yj.py ├── models │ ├── meta_model_GNB.pkl │ ├── meta_model_GNBC.pkl │ ├── meta_model_GNBM.pkl │ └── meta_model_LIN.pkl ├── __init__.py ├── version.py ├── test │ ├── test_conf.py │ ├── test_fastkde.py │ ├── test_rank.py │ ├── test_eb.py │ ├── test_yj.py │ ├── test_fgd.py │ ├── test_iqr.py │ ├── test_aucp.py │ ├── test_boot.py │ ├── test_fwfm.py │ ├── test_mcst.py │ ├── test_moll.py │ ├── test_wind.py │ ├── test_chau.py │ ├── test_mad.py │ ├── test_regr.py │ ├── test_mtt.py │ ├── test_zscore.py │ ├── test_decomp.py │ └── test_karch.py └── utils │ └── rank_utility.py ├── imgs └── All.png ├── requirements.txt ├── docs ├── figs │ ├── All.png │ ├── Comb1.png │ ├── Comb2.png │ ├── Conf1.png │ ├── Conf2.png │ ├── Multi1.png │ ├── Multi2.png │ ├── Rank1.png │ ├── Rank2.png │ ├── Rank3.png │ ├── Rank4.png │ ├── Rank5.png │ ├── Rank6.png │ ├── Rank7.png │ ├── KNN_KARCH.png │ ├── Overpred.png │ ├── Benchmark1.png │ ├── Benchmark2.png │ ├── Randomness.png │ └── Overpred_best.png ├── command.txt ├── rebuild.bat ├── requirements.txt ├── pythresh.rst ├── Makefile ├── pythresh.utils.rst ├── tables │ ├── TimeComplexity.csv │ ├── RankingCorr.csv │ └── Benchmark2.csv ├── make.bat ├── install.rst └── api_cc.rst ├── notebooks └── data │ ├── musk.mat │ ├── pima.mat │ ├── wbc.mat │ ├── cardio.mat │ ├── glass.mat │ ├── letter.mat │ ├── lympho.mat │ ├── mnist.mat │ ├── vowels.mat │ ├── optdigits.mat │ ├── pendigits.mat │ ├── satellite.mat │ ├── shuttle.mat │ ├── vertebral.mat │ ├── arrhythmia.mat │ ├── ionosphere.mat │ ├── satimage-2.mat │ └── README.md ├── setup.cfg ├── requirements-test.txt ├── MANIFEST.in ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── python-package.yml ├── .readthedocs.yaml ├── .codeclimate.yml ├── LICENSE ├── .pre-commit-config.yaml ├── examples ├── eb_example.py ├── boot_example.py ├── clf_example.py ├── cpd_example.py ├── fgd_example.py ├── meta_example.py ├── zscore_example.py ├── chau_example.py ├── clust_example.py ├── dsn_example.py ├── gamgmm_example.py ├── hist_example.py ├── iqr_example.py ├── mad_example.py ├── moll_example.py ├── regr_example.py ├── yj_example.py ├── fwfm_example.py ├── karch_example.py ├── mcst_example.py ├── mixmod_example.py ├── mtt_example.py ├── wind_example.py ├── filter_example.py ├── qmcd_example.py ├── ocsvm_example.py ├── aucp_example.py ├── vae_example.py ├── decomp_example.py └── gesd_example.py ├── setup.py └── .gitignore /pythresh/thresholds/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /imgs/All.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/imgs/All.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.13 2 | pyod 3 | scikit-learn>=0.20.0 4 | scipy>=1.3.1 5 | -------------------------------------------------------------------------------- /docs/figs/All.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/All.png 
-------------------------------------------------------------------------------- /docs/figs/Comb1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Comb1.png -------------------------------------------------------------------------------- /docs/figs/Comb2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Comb2.png -------------------------------------------------------------------------------- /docs/figs/Conf1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Conf1.png -------------------------------------------------------------------------------- /docs/figs/Conf2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Conf2.png -------------------------------------------------------------------------------- /docs/figs/Multi1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Multi1.png -------------------------------------------------------------------------------- /docs/figs/Multi2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Multi2.png -------------------------------------------------------------------------------- /docs/figs/Rank1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Rank1.png -------------------------------------------------------------------------------- /docs/figs/Rank2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Rank2.png -------------------------------------------------------------------------------- /docs/figs/Rank3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Rank3.png -------------------------------------------------------------------------------- /docs/figs/Rank4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Rank4.png -------------------------------------------------------------------------------- /docs/figs/Rank5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Rank5.png -------------------------------------------------------------------------------- /docs/figs/Rank6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Rank6.png -------------------------------------------------------------------------------- /docs/figs/Rank7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Rank7.png -------------------------------------------------------------------------------- /docs/figs/KNN_KARCH.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/KNN_KARCH.png -------------------------------------------------------------------------------- /docs/figs/Overpred.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Overpred.png -------------------------------------------------------------------------------- /notebooks/data/musk.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/musk.mat -------------------------------------------------------------------------------- /notebooks/data/pima.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/pima.mat -------------------------------------------------------------------------------- /notebooks/data/wbc.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/wbc.mat -------------------------------------------------------------------------------- /docs/figs/Benchmark1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Benchmark1.png -------------------------------------------------------------------------------- /docs/figs/Benchmark2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Benchmark2.png -------------------------------------------------------------------------------- /docs/figs/Randomness.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Randomness.png -------------------------------------------------------------------------------- /notebooks/data/cardio.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/cardio.mat -------------------------------------------------------------------------------- /notebooks/data/glass.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/glass.mat -------------------------------------------------------------------------------- /notebooks/data/letter.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/letter.mat -------------------------------------------------------------------------------- /notebooks/data/lympho.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/lympho.mat -------------------------------------------------------------------------------- /notebooks/data/mnist.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/mnist.mat -------------------------------------------------------------------------------- /notebooks/data/vowels.mat: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/vowels.mat -------------------------------------------------------------------------------- /docs/figs/Overpred_best.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/docs/figs/Overpred_best.png -------------------------------------------------------------------------------- /notebooks/data/optdigits.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/optdigits.mat -------------------------------------------------------------------------------- /notebooks/data/pendigits.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/pendigits.mat -------------------------------------------------------------------------------- /notebooks/data/satellite.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/satellite.mat -------------------------------------------------------------------------------- /notebooks/data/shuttle.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/shuttle.mat -------------------------------------------------------------------------------- /notebooks/data/vertebral.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/vertebral.mat -------------------------------------------------------------------------------- /notebooks/data/arrhythmia.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/arrhythmia.mat -------------------------------------------------------------------------------- /notebooks/data/ionosphere.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/ionosphere.mat -------------------------------------------------------------------------------- /notebooks/data/satimage-2.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/notebooks/data/satimage-2.mat -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description_file = README.rst 3 | 4 | [egg_info] 5 | tag_build = 6 | tag_date = 0 7 | -------------------------------------------------------------------------------- /pythresh/models/meta_model_GNB.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/pythresh/models/meta_model_GNB.pkl -------------------------------------------------------------------------------- /pythresh/models/meta_model_GNBC.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/pythresh/models/meta_model_GNBC.pkl 
-------------------------------------------------------------------------------- /pythresh/models/meta_model_GNBM.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/pythresh/models/meta_model_GNBM.pkl -------------------------------------------------------------------------------- /pythresh/models/meta_model_LIN.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KulikDM/pythresh/HEAD/pythresh/models/meta_model_LIN.pkl -------------------------------------------------------------------------------- /pythresh/__init__.py: -------------------------------------------------------------------------------- 1 | from . import thresholds 2 | 3 | # TODO: add version information here 4 | 5 | __all__ = ['thresholds'] 6 | -------------------------------------------------------------------------------- /docs/command.txt: -------------------------------------------------------------------------------- 1 | set SPHINX_APIDOC_OPTIONS=members,undoc-members,show-inheritance,inherited-members 2 | sphinx-apidoc -o rst /home/denmark/Documents/pythresh/ 3 | -------------------------------------------------------------------------------- /docs/rebuild.bat: -------------------------------------------------------------------------------- 1 | REM rebuild docs shortcut 2 | REM only works for Windows 3 | cd.. 4 | xcopy examples\*.png docs\figs /Y 5 | cd docs 6 | call make clean 7 | call make html 8 | -------------------------------------------------------------------------------- /pythresh/version.py: -------------------------------------------------------------------------------- 1 | """``pythresh`` is a python toolbox for outlier detection thresholding.""" 2 | # Based on NiLearn package 3 | # License: simplified BSD 4 | 5 | __version__ = '1.0.2' # pragma: no cover 6 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | docutils 3 | joblib 4 | numpy 5 | pandas 6 | pyclustering 7 | pyod 8 | ruptures 9 | scikit-learn 10 | scikit-lego 11 | scipy 12 | sphinx-datatables 13 | sphinx-rtd-theme 14 | sphinxcontrib-bibtex 15 | torch 16 | tqdm 17 | xgboost 18 | -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | #ruptures 2 | git+https://github.com/deepcharles/ruptures.git 3 | #pyclustering 4 | https://github.com/KulikDM/pyclustering/archive/Warning-Fix.zip 5 | joblib>=0.14.1 6 | numpy 7 | pandas 8 | pyod 9 | scikit-learn>=0.20.0 10 | scikit-lego 11 | scipy>=1.3.1 12 | setuptools>=65.5.1 13 | torch 14 | tqdm 15 | xgboost>=2.0.0,<2.1.0 16 | -------------------------------------------------------------------------------- /notebooks/data/README.md: -------------------------------------------------------------------------------- 1 | All datasets stored in this folder are downloaded from 2 | **Outlier Detection DataSets (ODDS)**: http://odds.cs.stonybrook.edu/#table1 3 | 4 | If you use any data here, please see ODDS' citation policy: 5 | 6 | Shebuti Rayana (2016). ODDS Library \[http://odds.cs.stonybrook.edu\]. Stony Brook, NY: Stony Brook University, Department of Computer Science. 
7 | -------------------------------------------------------------------------------- /docs/pythresh.rst: -------------------------------------------------------------------------------- 1 | ############### 2 | API Reference 3 | ############### 4 | 5 | .. toctree:: 6 | 7 | pythresh.thresholds 8 | pythresh.utils 9 | 10 | ***************** 11 | Module contents 12 | ***************** 13 | 14 | .. automodule:: pythresh 15 | :members: 16 | :exclude-members: __version__ 17 | :undoc-members: 18 | :show-inheritance: 19 | :inherited-members: 20 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | prune .github 2 | prune docs 3 | prune imgs 4 | prune examples 5 | prune notebooks 6 | prune pythresh/test 7 | exclude .readthedocs.yaml 8 | exclude .gitignore 9 | exclude .codeclimate.yml 10 | exclude .pre-commit-config.yaml 11 | exclude CHANGES.txt 12 | exclude requirements-test.txt 13 | include README.rst 14 | include requirements.txt 15 | include pythresh/utils/* 16 | include pythresh/models/meta_model_GNB.pkl 17 | include pythresh/models/meta_model_GNBC.pkl 18 | include pythresh/models/meta_model_GNBM.pkl 19 | include pythresh/models/meta_model_LIN.pkl 20 | include pythresh/models/rank_model_XGB.json 21 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = pythresh 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | --- 8 | 9 | **Is your feature request related to a problem? Please describe.** 10 | A clear and concise description of what the problem is. Ex. I'm always frustrated when \[...\] 11 | 12 | **Describe the solution you'd like** 13 | A clear and concise description of what you want to happen. 14 | 15 | **Describe alternatives you've considered** 16 | A clear and concise description of any alternative solutions or features you've considered. 17 | 18 | **Additional context** 19 | Add any other context or screenshots about the feature request here. 20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | --- 8 | 9 | **Describe the bug** 10 | A clear and concise description of what the bug is. 
11 | 12 | **To Reproduce** 13 | Steps to reproduce the behavior: 14 | 15 | **Expected behavior** 16 | A clear and concise description of what you expected to happen. 17 | 18 | **Screenshots** 19 | If applicable, add screenshots to help explain your problem. 20 | 21 | **Desktop (please complete the following information):** 22 | 23 | - OS: \[e.g. Windows\] 24 | - Version \[e.g. 0.2.6\] 25 | 26 | **Additional context** 27 | Add any other context about the problem here. 28 | -------------------------------------------------------------------------------- /docs/pythresh.utils.rst: -------------------------------------------------------------------------------- 1 | ################### 2 | Utility Functions 3 | ################### 4 | 5 | **************************** 6 | pythresh.utils.rank module 7 | **************************** 8 | 9 | .. automodule:: pythresh.utils.rank 10 | :members: 11 | :exclude-members: _cdf_metric, _clust_metric, _consensus_metric, _equi_rank, _equi_sort 12 | :undoc-members: 13 | :show-inheritance: 14 | :inherited-members: 15 | 16 | **************************** 17 | pythresh.utils.conf module 18 | **************************** 19 | 20 | .. automodule:: pythresh.utils.conf 21 | :members: 22 | :exclude-members: _valid_thresh, _invalid_thresh 23 | :undoc-members: 24 | :show-inheritance: 25 | :inherited-members: 26 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.10" 13 | # You can also specify other tool versions: 14 | # nodejs: "19" 15 | # rust: "1.64" 16 | # golang: "1.19" 17 | 18 | # Build documentation in the docs/ directory with Sphinx 19 | sphinx: 20 | configuration: docs/conf.py 21 | 22 | # If using Sphinx, optionally build your docs in additional formats such as PDF 23 | formats: 24 | - pdf 25 | 26 | # Optionally declare the Python requirements required to build your docs 27 | python: 28 | install: 29 | - requirements: docs/requirements.txt 30 | -------------------------------------------------------------------------------- /docs/tables/TimeComplexity.csv: -------------------------------------------------------------------------------- 1 | Method,Complexity,Big-O Notation 2 | AUCP,Quadratic,~1e-8*n^2 3 | BOOT,Quadratic,~1e-8*n^2 4 | CHAU,Linear,~1e-8*n 5 | CLF,Quadratic,~1e-8*n^2 6 | CLUST,Quadratic,~1e-8*n^2 7 | CPD,Quadratic,~1e-8*n^2 8 | DECOMP,Linear,~1e-4*n 9 | DSN,Linear,~1e-4*n 10 | EB,Linearithmic,~1e-6*n*log(n) 11 | FGD,Linearithmic,~1e-5*n*log(n) 12 | FILTER,Quadratic,~1e-11*n^2 13 | FWFM,Linearithmic,~1e-5*n*log(n) 14 | GAMGMM,Quadratic,~1e-6*n^2 15 | GESD,Quadratic,~1e-9*n^2 16 | HIST,Linear,~1e-8*n 17 | IQR,Linear,~1e-8*n 18 | KARCH,Linearithmic,~1e-5*n*log(n) 19 | MAD,Linear,~1e-8*n 20 | MCST,Quadratic,~1e-7*n^2 21 | META,Cubic,~1e-12*n^3 22 | MIXMOD,Linear,~1e-3*n 23 | MOLL,Linearithmic,~1e-7*n*log(n) 24 | MTT,Quadratic,~1e-10*n^2 25 | OCSVM,Linear,~1e-7*n 26 | QMCD,Quadratic,~1e-9*n^2 27 | REGR,Quadratic,~1e-8*n^2 28 | VAE,Linear,~1e-3*n 29 | WIND,Linear,~1e-4*n 30 | YJ,Linear,~1e-4*n 31 | ZSCORE,Linear,~1e-8*n 32 | --------------------------------------------------------------------------------
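The Big-O Notation column in TimeComplexity.csv above gives approximate empirical runtimes as a function of the number of scores n. A minimal sketch of how these formulas can be read, assuming the fitted constants are in seconds (this snippet is illustrative and not a file in the repository):

```python
import math

# Approximate empirical runtime formulas from docs/tables/TimeComplexity.csv
# (assumption: the fitted constants are in seconds for n input scores)
runtime = {
    'ZSCORE': lambda n: 1e-8 * n,            # linear
    'EB': lambda n: 1e-6 * n * math.log(n),  # linearithmic
    'AUCP': lambda n: 1e-8 * n ** 2,         # quadratic
    'META': lambda n: 1e-12 * n ** 3,        # cubic
}

for name, est in runtime.items():
    # estimated runtime for 100,000 outlier likelihood scores
    print(f'{name}: ~{est(100_000):.3f} s')
```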
/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=pythresh 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /.codeclimate.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | checks: 3 | argument-count: 4 | enabled: true 5 | config: 6 | threshold: 4 7 | complex-logic: 8 | enabled: true 9 | config: 10 | threshold: 4 11 | file-lines: 12 | enabled: true 13 | config: 14 | threshold: 250 15 | method-complexity: 16 | enabled: true 17 | config: 18 | threshold: 5 19 | method-count: 20 | enabled: true 21 | config: 22 | threshold: 20 23 | method-lines: 24 | enabled: true 25 | config: 26 | threshold: 25 27 | nested-control-flow: 28 | enabled: true 29 | config: 30 | threshold: 4 31 | return-statements: 32 | enabled: true 33 | config: 34 | threshold: 4 35 | similar-code: 36 | enabled: false 37 | identical-code: 38 | enabled: false 39 | plugins: 40 | bandit: 41 | enabled: true 42 | radon: 43 | enabled: true 44 | sonar-python: 45 | enabled: true 46 | config: 47 | minimum_severity: major 48 | tests_patterns: 49 | - pythresh/test/** 50 | exclude_patterns: 51 | - "examples/" 52 | - "**/test/" 53 | -------------------------------------------------------------------------------- /docs/tables/RankingCorr.csv: -------------------------------------------------------------------------------- 1 | Label,Mean,Median,Standard Deviation 2 | CAL,0.1872,0.2027,0.4303 3 | DAV,-0.1257,-0.0474,0.4523 4 | SIL,0.1262,0.0589,0.4744 5 | BH,0.0039,0.0391,0.4836 6 | BR,0.0260,0.0300,0.4750 7 | CAL_sc,0.0158,0.0111,0.5083 8 | DAV_sc,-0.0483,-0.0944,0.5230 9 | DR,-0.0157,-0.0111,0.5084 10 | Dunn,0.0270,0.0207,0.5353 11 | Hubert,0.0641,0.1632,0.4674 12 | Iind,0.0648,0.1527,0.4763 13 | MR,0.0789,0.1710,0.4973 14 | PB,0.0367,0.0715,0.5089 15 | RL,-0.0314,-0.0569,0.5081 16 | RT,-0.0034,0.0524,0.4884 17 | SIL_sc,0.0473,0.0801,0.4885 18 | SDBW,-0.0641,-0.0627,0.4736 19 | SS,-0.0034,0.0524,0.4884 20 | WG,-0.0244,-0.0210,0.5221 21 | XBS,-0.0535,0.0032,0.5280 22 | AMA,0.0543,0.0088,0.5046 23 | BHT,0.0239,0.0163,0.5050 24 | BREG,0.1022,0.1546,0.5006 25 | COR,0.0173,0.0364,0.5114 26 | ENG,0.1120,0.1054,0.5030 27 | JS,0.0566,0.1282,0.5045 28 | MAH,0.0749,0.0300,0.5075 29 | LK,0.0749,0.1048,0.5070 30 | WS,0.1133,0.1766,0.5001 31 | EM,0.0261,0.0752,0.4332 32 | MV,0.0094,-0.0164,0.4361 33 | Contam,-0.2003,-0.1498,0.5297 34 | GNB,-0.1931,-0.2902,0.4768 35 | HITS,-0.0449,-0.1235,0.5998 36 | Mode,-0.1505,-0.0840,0.5377 37 | Thresh,-0.1696,-0.1940,0.6031 38 | 
-------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### Submissions Basics: 2 | 3 | - [ ] Have you followed the guidelines in our Contributing document? 4 | - [ ] Have you checked to ensure there aren't other open [Pull Requests](../../../../pulls) for the same update/change? 5 | - [ ] Have you checked all [Issues](../../../../issues) to tie the PR to a specific one? 6 | 7 | 8 | 9 | ### Type of Change: 10 | 11 | - [ ] Bug fix 12 | - [ ] New feature 13 | - [ ] Documentation update 14 | 15 | ### All Submissions Cores: 16 | 17 | - [ ] Have you added an explanation of what your changes do and why you'd like us to include them? 18 | - [ ] Have you written new unit tests for your changes? 19 | - [ ] Do all new and existing unit tests pass locally? 20 | 21 | ### New Thresholder Submissions: 22 | 23 | - [ ] Have you created a \<name\>.py in ~/pythresh/thresholds/? 24 | - [ ] Have you created a \<name\>\_example.py in ~/examples/? 25 | - [ ] Have you created a test\_\<name\>.py in ~/pythresh/test/? 26 | - [ ] Have you linted your code locally prior to submission? 27 | - [ ] Have you added a reference for the new thresholder in your explanation? 28 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | ############## 2 | Installation 3 | ############## 4 | 5 | It is recommended to use **pip** or **conda** installation. Please make 6 | sure **the latest version** is installed, as PyThresh is updated 7 | frequently: 8 | 9 | .. code:: bash 10 | 11 | pip install pythresh # normal install 12 | pip install --upgrade pythresh # or update if needed 13 | 14 | .. code:: bash 15 | 16 | conda install -c conda-forge pythresh 17 | 18 | Alternatively, you can get the version with the latest updates by 19 | cloning the repo and running the setup.py file: 20 | 21 | .. code:: bash 22 | 23 | git clone https://github.com/KulikDM/pythresh.git 24 | cd pythresh 25 | pip install . 26 | 27 | Or with **pip**: 28 | 29 | .. code:: bash 30 | 31 | pip install https://github.com/KulikDM/pythresh/archive/main.zip 32 | 33 | **Required Dependencies**: 34 | 35 | - numpy>=1.13 36 | - pyod 37 | - scipy>=1.3.1 38 | - scikit-learn>=0.20.0 39 | 40 | **Optional Dependencies**: 41 | 42 | - pyclustering (used in the CLUST thresholder) 43 | - ruptures (used in the CPD thresholder) 44 | - scikit-lego (used in the META thresholder) 45 | - joblib>=0.14.1 (used in the META thresholder and RANK) 46 | - pandas (used in the META thresholder) 47 | - torch (used in the VAE thresholder) 48 | - tqdm (used in the VAE thresholder) 49 | - xgboost>=2.0.0 (used in RANK) 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2022, Daniel Kulik 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer.
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v6.0.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | - id: double-quote-string-fixer 9 | - id: requirements-txt-fixer 10 | - id: name-tests-test 11 | always_run: true 12 | args: [--pytest-test-first] 13 | - id: mixed-line-ending 14 | args: [--fix=lf] 15 | 16 | - repo: https://github.com/DanielNoord/pydocstringformatter 17 | rev: v0.7.5 18 | hooks: 19 | - id: pydocstringformatter 20 | args: [--style=pep257, --style=numpydoc] 21 | name: Format docstrings 22 | 23 | - repo: https://github.com/asottile/pyupgrade 24 | rev: v3.21.2 25 | hooks: 26 | - id: pyupgrade 27 | args: [--py38-plus] 28 | name: Upgrade code 29 | 30 | - repo: https://github.com/hhatto/autopep8 31 | rev: v2.3.2 32 | hooks: 33 | - id: autopep8 34 | args: [--in-place] 35 | name: Format code style 36 | 37 | - repo: https://github.com/PyCQA/isort 38 | rev: 7.0.0 39 | hooks: 40 | - id: isort 41 | args: [-m=3] 42 | name: Sort imports 43 | 44 | - repo: https://github.com/charliermarsh/ruff-pre-commit 45 | rev: v0.14.7 46 | hooks: 47 | - id: ruff 48 | args: [--exit-non-zero-on-fix, --fix, --line-length=180] 49 | exclude: "\.ipynb$" 50 | name: Lint code 51 | -------------------------------------------------------------------------------- /docs/api_cc.rst: -------------------------------------------------------------------------------- 1 | ################ 2 | API CheatSheet 3 | ################ 4 | 5 | The following APIs are applicable to all thresholder models for ease of 6 | use.
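For example, a minimal sketch of the fit/predict flow, condensed from the repository's own example scripts (the KNN detector and generated sample data are taken directly from those scripts):

.. code:: python

   from pyod.models.knn import KNN
   from pyod.utils.data import generate_data
   from pythresh.thresholds.zscore import ZSCORE

   X_train, X_test, y_train, y_test = generate_data(
       n_train=200, n_test=100, n_features=2,
       contamination=0.1, random_state=42)

   clf = KNN()
   clf.fit(X_train)
   scores = clf.decision_scores_  # raw outlier likelihood scores

   thres = ZSCORE()
   thres.fit(scores)
   labels = thres.labels_         # binary labels (0: inliers, 1: outliers)
   # or equivalently
   labels = thres.predict(scores)
   print(thres.thresh_)           # threshold on the normalized scores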
7 | 8 | - :func:`pythresh.thresholds.base.BaseThresholder.eval`: evaluate a 9 | single outlier or multiple outlier detection likelihood score set 10 | (Legacy method) 11 | 12 | - :func:`pythresh.thresholds.base.BaseThresholder.fit`: fit a 13 | thresholder for a single outlier or multiple outlier detection 14 | likelihood score set 15 | 16 | - :func:`pythresh.thresholds.base.BaseThresholder.predict`: predict the 17 | binary labels using the fitted thresholder on a single outlier or 18 | multiple outlier detection likelihood score set 19 | 20 | Key Attributes of a fitted model: 21 | 22 | - :attr:`pythresh.thresholds.base.BaseThresholder.thresh_`: threshold 23 | value from scores normalized between 0 and 1 24 | 25 | - :attr:`pythresh.thresholds.base.BaseThresholder.labels_`: A binary 26 | array of labels for the fitted thresholder on the fitted dataset 27 | 28 | - :attr:`pythresh.thresholds.base.BaseThresholder.confidence_interval_`: 29 | Return the lower and upper confidence interval of the contamination 30 | level. Only applies to the COMB thresholder 31 | 32 | - :attr:`pythresh.thresholds.base.BaseThresholder.dscores_`: 1D array of 33 | the TruncatedSVD decomposed decision scores if multiple outlier 34 | detector score sets are passed 35 | 36 | - :attr:`pythresh.thresholds.mixmod.MIXMOD.mixture_`: fitted mixture 37 | model class of the selected model used for thresholding. Only applies 38 | to MIXMOD. Attributes include: components, weights, params. Functions 39 | include: fit, loglikelihood, pdf, and posterior. 40 | 41 | See base class definition below: 42 | 43 | ********************************* 44 | pythresh.thresholds.base module 45 | ********************************* 46 | 47 | .. automodule:: pythresh.thresholds.base 48 | :members: 49 | :exclude-members: _data_setup, _set_norm, _set_attributes 50 | :undoc-members: 51 | :show-inheritance: 52 | :inherited-members: 53 | -------------------------------------------------------------------------------- /pythresh/test/test_conf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | from pyod.models.iforest import IForest 8 | from pyod.utils.data import generate_data 9 | 10 | from pythresh.thresholds.filter import FILTER 11 | from pythresh.thresholds.ocsvm import OCSVM 12 | from pythresh.utils.conf import CONF 13 | 14 | # temporary solution for relative imports in case pythresh is not installed 15 | # if pythresh is installed, no need to use the following line 16 | 17 | path = up(up(up(__file__))) 18 | sys.path.append(path) 19 | 20 | 21 | class TestCONF(unittest.TestCase): 22 | def setUp(self): 23 | self.n_train = 200 24 | self.n_test = 100 25 | self.contamination = 0.1 26 | self.X_train, self.X_test, self.y_train, self.y_test = generate_data( 27 | n_train=self.n_train, n_test=self.n_test, 28 | contamination=self.contamination, random_state=42) 29 | 30 | clf = IForest() 31 | clf.fit(self.X_train) 32 | 33 | self.scores = clf.decision_scores_ 34 | 35 | self.thres = [FILTER(), OCSVM()] 36 | 37 | self.alphas = [0.05, 0.1, 0.2] 38 | 39 | self.splits = [0.2, 0.5, 0.8] 40 | 41 | self.n_tests = [10, 100, 1000] 42 | 43 | def test_prediction_labels(self): 44 | 45 | params = product(self.thres, 46 | self.alphas, 47 | self.splits, 48 | self.n_tests) 49 | 50 | for thres, alpha, split, n_test in params: 51 | 52 | confidence = CONF(thres, alpha=alpha, 53 | split=split,
n_test=n_test) 54 | uncertains = confidence.eval(self.scores) 55 | 56 | assert (isinstance(uncertains, list)) 57 | assert (len(uncertains) <= len(self.scores)) 58 | 59 | if len(uncertains) > 0: 60 | 61 | assert (min(uncertains) > 0) 62 | assert (max(uncertains) < len(self.scores)) 63 | assert (len(set(uncertains)) == len(uncertains)) 64 | -------------------------------------------------------------------------------- /docs/tables/Benchmark2.csv: -------------------------------------------------------------------------------- 1 | Label,Method 2 | AUCP,AUCP() 3 | BOOT,BOOT() 4 | CHAU,CHAU() 5 | CLF1,CLF(method='simple') 6 | CLF2,CLF(method='complex') 7 | CLUST1,CLUST(method='agg') 8 | CLUST2,CLUST(method='birch') 9 | CLUST3,CLUST(method='bang') 10 | CLUST4,CLUST(method='bgm') 11 | CLUST5,CLUST(method='bsas') 12 | CLUST6,CLUST(method='dbscan') 13 | CLUST7,CLUST(method='ema') 14 | CLUST8,CLUST(method='kmeans') 15 | CLUST9,CLUST(method='mbsas') 16 | CLUST10,CLUST(method='mshift') 17 | CLUST11,CLUST(method='optics') 18 | CLUST12,CLUST(method='somsc') 19 | CLUST13,CLUST(method='spec') 20 | CLUST14,CLUST(method='xmeans') 21 | CPD1,CPD(method='Dynp') 22 | CPD2,CPD(method='KernelCPD') 23 | CPD3,CPD(method='Binseg') 24 | CPD4,CPD(method='BottomUp') 25 | DECOMP1,DECOMP(method='NMF') 26 | DECOMP2,DECOMP(method='PCA') 27 | DECOMP3,DECOMP(method='GRP') 28 | DECOMP4,DECOMP(method='SRP') 29 | DSN1,DSN(metric='JS') 30 | DSN2,DSN(metric='WS') 31 | DSN3,DSN(metric='ENG') 32 | DSN4,DSN(metric='BHT') 33 | DSN5,DSN(metric='HLL') 34 | DSN6,DSN(metric='HI') 35 | DSN7,DSN(metric='LK') 36 | DSN8,DSN(metric='MAH') 37 | DSN9,DSN(metric='TMT') 38 | DSN10,DSN(metric='RES') 39 | DSN11,DSN(metric='KS') 40 | DSN12,DSN(metric='INT') 41 | DSN13,DSN(metric='MMD') 42 | EB,EB() 43 | FGD,FGD() 44 | FILTER1,FILTER(method='gaussian') 45 | FILTER2,FILTER(method='savgol') 46 | FILTER3,FILTER(method='hilbert') 47 | FILTER4,FILTER(method='wiener') 48 | FILTER5,FILTER(method='medfilt') 49 | FILTER6,FILTER(method='decimate') 50 | FILTER7,FILTER(method='detrend') 51 | FILTER8,FILTER(method='resample') 52 | FWFM,FWFM() 53 | GESD,GESD() 54 | HIST1,HIST(method='otsu') 55 | HIST2,HIST(method='yen') 56 | HIST3,HIST(method='isodata') 57 | HIST4,HIST(method='li') 58 | HIST5,HIST(method='triangle') 59 | IQR,IQR() 60 | KARCH,KARCH() 61 | MAD,MAD() 62 | MCST,MCST() 63 | META1,META(method='LIN') 64 | META2,META(method='GNB') 65 | META3,META(method='GNBC') 66 | META4,META(method='GNBM') 67 | MOLL,MOLL() 68 | OCSVM1,OCSVM(method='poly') 69 | OCSVM2,OCSVM(method='sgd') 70 | QMCD1,QMCD(method='CD') 71 | QMCD2,QMCD(method='WD') 72 | QMCD3,QMCD(method='MD') 73 | QMCD4,QMCD(method='L2-star') 74 | REGR1,REGR(method='siegel') 75 | REGR2,REGR(method='theil') 76 | VAE,VAE() 77 | WIND,WIND() 78 | YJ,YJ() 79 | ZSCORE,ZSCORE() 80 | -------------------------------------------------------------------------------- /examples/eb_example.py: -------------------------------------------------------------------------------- 1 | """Example of using elliptical boundaries for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.eb import EB 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 |
os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = EB() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /pythresh/test/test_fastkde.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.dsn import DSN 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | # Test implementation of the fastkde interpolation method 23 | 24 | 25 | class TestFastKDE(unittest.TestCase): 26 | def setUp(self): 27 | self.n_train = 10000 28 | self.n_test = 100 29 | self.contamination = 0.1 30 | self.X_train, self.X_test, self.y_train, self.y_test = generate_data( 31 | n_train=self.n_train, n_test=self.n_test, 32 | contamination=self.contamination, random_state=42) 33 | 34 | clf = KNN() 35 | clf.fit(self.X_train) 36 | 37 | scores = clf.decision_scores_ 38 | 39 | clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 40 | 41 | multiple_scores = [ 42 | clf.fit(self.X_train).decision_scores_ for clf in clfs] 43 | multiple_scores = np.vstack(multiple_scores).T 44 | 45 | self.all_scores = [scores, multiple_scores] 46 | 47 | self.metrics = ['JS', 'MAH'] 48 | 49 | def test_prediction_labels(self): 50 | 51 | params = product(self.all_scores, self.metrics) 52 | 53 | for scores, metric in params: 54 | 55 | self.thres = DSN(metric=metric) 56 | pred_labels = self.thres.eval(scores) 57 | assert (self.thres.thresh_ is not None) 58 | assert 
(self.thres.dscores_ is not None) 59 | 60 | assert (self.thres.dscores_.min() == 0) 61 | assert (self.thres.dscores_.max() == 1) 62 | 63 | assert_equal(pred_labels.shape, self.y_train.shape) 64 | 65 | if (not np.all(pred_labels == 0)) & (not np.all(pred_labels == 1)): 66 | 67 | assert (pred_labels.min() == 0) 68 | assert (pred_labels.max() == 1) 69 | -------------------------------------------------------------------------------- /examples/boot_example.py: -------------------------------------------------------------------------------- 1 | """Example of using a bootstrapped method for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.boot import BOOT 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = BOOT() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/clf_example.py: -------------------------------------------------------------------------------- 1 | """Example of using a trained classifier for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.clf import CLF 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of
outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = CLF() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/cpd_example.py: -------------------------------------------------------------------------------- 1 | """Example of using change point detection for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.cpd import CPD 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = CPD() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 |
print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/fgd_example.py: -------------------------------------------------------------------------------- 1 | """Example of using fixed gradient descent for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.fgd import FGD 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = FGD() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/meta_example.py: -------------------------------------------------------------------------------- 1 | """Example of using a meta-model for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.meta import META 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 |
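# Note (editorial comment, not in the original script): META applies a
# pre-trained meta-model to the scores. The models bundled in
# pythresh/models/ are LIN, GNB, GNBC and GNBM, selected via e.g.
# META(method='GNB').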
26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = META() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/zscore_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the zscore for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.zscore import ZSCORE 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = ZSCORE() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 
66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/chau_example.py: -------------------------------------------------------------------------------- 1 | """Example of using Chauvenet's criterion for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.chau import CHAU 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = CHAU() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/clust_example.py: -------------------------------------------------------------------------------- 1 | """Example of using clustering based methods for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.clust import CLUST 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 
29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = CLUST() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/dsn_example.py: -------------------------------------------------------------------------------- 1 | """Example of using distance shift from normal for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.dsn import DSN 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = DSN() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, 
show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/gamgmm_example.py: -------------------------------------------------------------------------------- 1 | """Example of using gammaGMM for outlier thresholding.""" 2 | # Author: L Perini 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.gamgmm import GAMGMM 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = GAMGMM() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/hist_example.py: -------------------------------------------------------------------------------- 1 | """Example of using histogram based methods for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.hist import HIST 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | 
random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = HIST() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/iqr_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the inter-quartile region for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.iqr import IQR 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = IQR() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | 
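69 | # For reference, a rough commented-out sketch of the classic Tukey fence that 70 | # IQR-style thresholding is built on (illustrative only, not the exact 71 | # internals of the IQR class; assumes numpy is imported as np): 72 | # q1, q3 = np.percentile(y_train_scores, [25, 75]) 73 | # upper_fence = q3 + 1.5 * (q3 - q1) # Q3 + 1.5 * IQR 74 | # y_train_pred = (y_train_scores > upper_fence).astype(int) 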
-------------------------------------------------------------------------------- /examples/mad_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the median absolute deviation for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.mad import MAD 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = MAD() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/moll_example.py: -------------------------------------------------------------------------------- 1 | """Example of using Friedrichs' mollifier for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.moll import MOLL 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | 
clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = MOLL() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/regr_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the regression intercept for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.regr import REGR 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = REGR() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- 
/examples/yj_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the Yeo-Johnson transformation for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.yj import YJ 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = YJ() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/fwfm_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the full width at full minimum for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.fwfm import FWFM 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = FWFM()
39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/karch_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the Karcher mean for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.karch import KARCH 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = KARCH() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/mcst_example.py: 
-------------------------------------------------------------------------------- 1 | """Example of using Monte Carlo statistical tests for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.mcst import MCST 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = MCST() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/mixmod_example.py: -------------------------------------------------------------------------------- 1 | """Example of using mixture models for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.mixmod import MIXMOD 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = MIXMOD() 39 | 40 | # get the
prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/mtt_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the modified Thompson Tau test for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.mtt import MTT 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = MTT() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/wind_example.py: -------------------------------------------------------------------------------- 1 | 
"""Example of using the topological winding number for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.wind import WIND 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = WIND() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/filter_example.py: -------------------------------------------------------------------------------- 1 | """Example of using distance shift from normal for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.filter import FILTER 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = FILTER() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = 
clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/qmcd_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the quasi-Monte Carlo discrepancy for outlier thresholding.""" 2 | # Author: D Kulik 3 | # License: BSD 2 clause 4 | 5 | 6 | import os 7 | import sys 8 | 9 | from pyod.models.knn import KNN 10 | from pyod.utils.data import evaluate_print, generate_data 11 | from pyod.utils.example import visualize 12 | 13 | from pythresh.thresholds.qmcd import QMCD 14 | 15 | # temporary solution for relative imports in case pyod is not installed 16 | # if pyod is installed, no need to use the following line 17 | sys.path.append( 18 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 19 | 20 | 21 | if __name__ == '__main__': 22 | contamination = 0.1 # percentage of outliers 23 | n_train = 200 # number of training points 24 | n_test = 100 # number of testing points 25 | 26 | # Generate sample data 27 | X_train, X_test, y_train, y_test =\ 28 | generate_data(n_train=n_train, 29 | n_test=n_test, 30 | n_features=2, 31 | contamination=contamination, 32 | random_state=42) 33 | 34 | # train KNN detector 35 | clf_name = 'KNN' 36 | clf = KNN() 37 | clf.fit(X_train) 38 | thres = QMCD() 39 | 40 | # get the prediction labels and outlier scores of the training data 41 | y_train_scores = clf.decision_scores_ # raw outlier scores 42 | 43 | # (Legacy method) 44 | # y_train_pred = thres.eval(y_train_scores) 45 | 46 | # binary labels (0: inliers, 1: outliers) 47 | thres.fit(y_train_scores) 48 | y_train_pred = thres.labels_ 49 | # or 50 | y_train_pred = thres.predict(y_train_scores) 51 | 52 | # get the prediction on the test data 53 | y_test_scores = clf.decision_function(X_test) # outlier scores 54 | 55 | # (Legacy method) 56 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 57 | y_test_pred = thres.predict(y_test_scores) 58 | 59 | # evaluate and print the results 60 | print('\nOn Training Data:') 61 | evaluate_print(clf_name, y_train, y_train_scores) 62 | print('\nOn Test Data:') 63 | evaluate_print(clf_name, y_test, y_test_scores) 64 | 65 | # visualize the results 66 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 67 | y_test_pred, show_figure=True, save_figure=False) 68 | -------------------------------------------------------------------------------- /examples/ocsvm_example.py: -------------------------------------------------------------------------------- 1 | """Example of using a one-class SVM. 
2 | 3 | for outlier thresholding 4 | """ 5 | # Author: D Kulik 6 | # License: BSD 2 clause 7 | 8 | 9 | import os 10 | import sys 11 | 12 | from pyod.models.knn import KNN 13 | from pyod.utils.data import evaluate_print, generate_data 14 | from pyod.utils.example import visualize 15 | 16 | from pythresh.thresholds.ocsvm import OCSVM 17 | 18 | # temporary solution for relative imports in case pyod is not installed 19 | # if pyod is installed, no need to use the following line 20 | sys.path.append( 21 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 22 | 23 | 24 | if __name__ == '__main__': 25 | contamination = 0.1 # percentage of outliers 26 | n_train = 200 # number of training points 27 | n_test = 100 # number of testing points 28 | 29 | # Generate sample data 30 | X_train, X_test, y_train, y_test =\ 31 | generate_data(n_train=n_train, 32 | n_test=n_test, 33 | n_features=2, 34 | contamination=contamination, 35 | random_state=42) 36 | 37 | # train KNN detector 38 | clf_name = 'KNN' 39 | clf = KNN() 40 | clf.fit(X_train) 41 | thres = OCSVM() 42 | 43 | # get the prediction labels and outlier scores of the training data 44 | y_train_scores = clf.decision_scores_ # raw outlier scores 45 | 46 | # (Legacy method) 47 | # y_train_pred = thres.eval(y_train_scores) 48 | 49 | # binary labels (0: inliers, 1: outliers) 50 | thres.fit(y_train_scores) 51 | y_train_pred = thres.labels_ 52 | # or 53 | y_train_pred = thres.predict(y_train_scores) 54 | 55 | # get the prediction on the test data 56 | y_test_scores = clf.decision_function(X_test) # outlier scores 57 | 58 | # (Legacy method) 59 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 60 | y_test_pred = thres.predict(y_test_scores) 61 | 62 | # evaluate and print the results 63 | print('\nOn Training Data:') 64 | evaluate_print(clf_name, y_train, y_train_scores) 65 | print('\nOn Test Data:') 66 | evaluate_print(clf_name, y_test, y_test_scores) 67 | 68 | # visualize the results 69 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 70 | y_test_pred, show_figure=True, save_figure=False) 71 | -------------------------------------------------------------------------------- /examples/aucp_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the area under the curve percentage. 
2 | 3 | for outlier thresholding 4 | """ 5 | # Author: D Kulik 6 | # License: BSD 2 clause 7 | 8 | 9 | import os 10 | import sys 11 | 12 | from pyod.models.knn import KNN 13 | from pyod.utils.data import evaluate_print, generate_data 14 | from pyod.utils.example import visualize 15 | 16 | from pythresh.thresholds.aucp import AUCP 17 | 18 | # temporary solution for relative imports in case pyod is not installed 19 | # if pyod is installed, no need to use the following line 20 | sys.path.append( 21 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 22 | 23 | 24 | if __name__ == '__main__': 25 | contamination = 0.1 # percentage of outliers 26 | n_train = 200 # number of training points 27 | n_test = 100 # number of testing points 28 | 29 | # Generate sample data 30 | X_train, X_test, y_train, y_test =\ 31 | generate_data(n_train=n_train, 32 | n_test=n_test, 33 | n_features=2, 34 | contamination=contamination, 35 | random_state=42) 36 | 37 | # train KNN detector 38 | clf_name = 'KNN' 39 | clf = KNN() 40 | clf.fit(X_train) 41 | thres = AUCP() 42 | 43 | # get the prediction labels and outlier scores of the training data 44 | y_train_scores = clf.decision_scores_ # raw outlier scores 45 | 46 | # (Legacy method) 47 | # y_train_pred = thres.eval(y_train_scores) 48 | 49 | # binary labels (0: inliers, 1: outliers) 50 | thres.fit(y_train_scores) 51 | y_train_pred = thres.labels_ 52 | # or 53 | y_train_pred = thres.predict(y_train_scores) 54 | 55 | # get the prediction on the test data 56 | y_test_scores = clf.decision_function(X_test) # outlier scores 57 | 58 | # (Legacy method) 59 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 60 | y_test_pred = thres.predict(y_test_scores) 61 | 62 | # evaluate and print the results 63 | print('\nOn Training Data:') 64 | evaluate_print(clf_name, y_train, y_train_scores) 65 | print('\nOn Test Data:') 66 | evaluate_print(clf_name, y_test, y_test_scores) 67 | 68 | # visualize the results 69 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 70 | y_test_pred, show_figure=True, save_figure=False) 71 | -------------------------------------------------------------------------------- /examples/vae_example.py: -------------------------------------------------------------------------------- 1 | """Example of using a variational autoencoder. 
2 | 3 | for outlier thresholding 4 | """ 5 | # Author: D Kulik 6 | # License: BSD 2 clause 7 | 8 | 9 | import os 10 | import sys 11 | 12 | from pyod.models.knn import KNN 13 | from pyod.utils.data import evaluate_print, generate_data 14 | from pyod.utils.example import visualize 15 | 16 | from pythresh.thresholds.vae import VAE 17 | 18 | # temporary solution for relative imports in case pyod is not installed 19 | # if pyod is installed, no need to use the following line 20 | sys.path.append( 21 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 22 | 23 | 24 | if __name__ == '__main__': 25 | contamination = 0.1 # percentage of outliers 26 | n_train = 200 # number of training points 27 | n_test = 100 # number of testing points 28 | 29 | # Generate sample data 30 | X_train, X_test, y_train, y_test =\ 31 | generate_data(n_train=n_train, 32 | n_test=n_test, 33 | n_features=2, 34 | contamination=contamination, 35 | random_state=42) 36 | 37 | # train KNN detector 38 | clf_name = 'KNN' 39 | clf = KNN() 40 | clf.fit(X_train) 41 | thres = VAE() 42 | 43 | # get the prediction labels and outlier scores of the training data 44 | y_train_scores = clf.decision_scores_ # raw outlier scores 45 | 46 | # (Legacy method) 47 | # y_train_pred = thres.eval(y_train_scores) 48 | 49 | # binary labels (0: inliers, 1: outliers) 50 | thres.fit(y_train_scores) 51 | y_train_pred = thres.labels_ 52 | # or 53 | y_train_pred = thres.predict(y_train_scores) 54 | 55 | # get the prediction on the test data 56 | y_test_scores = clf.decision_function(X_test) # outlier scores 57 | 58 | # (Legacy method) 59 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 60 | y_test_pred = thres.predict(y_test_scores) 61 | 62 | # evaluate and print the results 63 | print('\nOn Training Data:') 64 | evaluate_print(clf_name, y_train, y_train_scores) 65 | print('\nOn Test Data:') 66 | evaluate_print(clf_name, y_test, y_test_scores) 67 | 68 | # visualize the results 69 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 70 | y_test_pred, show_figure=True, save_figure=False) 71 | -------------------------------------------------------------------------------- /examples/decomp_example.py: -------------------------------------------------------------------------------- 1 | """Example of using decomposition methods. 
2 | 3 | for outlier thresholding 4 | """ 5 | # Author: D Kulik 6 | # License: BSD 2 clause 7 | 8 | 9 | import os 10 | import sys 11 | 12 | from pyod.models.knn import KNN 13 | from pyod.utils.data import evaluate_print, generate_data 14 | from pyod.utils.example import visualize 15 | 16 | from pythresh.thresholds.decomp import DECOMP 17 | 18 | # temporary solution for relative imports in case pyod is not installed 19 | # if pyod is installed, no need to use the following line 20 | sys.path.append( 21 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 22 | 23 | 24 | if __name__ == '__main__': 25 | contamination = 0.1 # percentage of outliers 26 | n_train = 200 # number of training points 27 | n_test = 100 # number of testing points 28 | 29 | # Generate sample data 30 | X_train, X_test, y_train, y_test =\ 31 | generate_data(n_train=n_train, 32 | n_test=n_test, 33 | n_features=2, 34 | contamination=contamination, 35 | random_state=42) 36 | 37 | # train KNN detector 38 | clf_name = 'KNN' 39 | clf = KNN() 40 | clf.fit(X_train) 41 | thres = DECOMP() 42 | 43 | # get the prediction labels and outlier scores of the training data 44 | y_train_scores = clf.decision_scores_ # raw outlier scores 45 | 46 | # (Legacy method) 47 | # y_train_pred = thres.eval(y_train_scores) 48 | 49 | # binary labels (0: inliers, 1: outliers) 50 | thres.fit(y_train_scores) 51 | y_train_pred = thres.labels_ 52 | # or 53 | y_train_pred = thres.predict(y_train_scores) 54 | 55 | # get the prediction on the test data 56 | y_test_scores = clf.decision_function(X_test) # outlier scores 57 | 58 | # (Legacy method) 59 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 60 | y_test_pred = thres.predict(y_test_scores) 61 | 62 | # evaluate and print the results 63 | print('\nOn Training Data:') 64 | evaluate_print(clf_name, y_train, y_train_scores) 65 | print('\nOn Test Data:') 66 | evaluate_print(clf_name, y_test, y_test_scores) 67 | 68 | # visualize the results 69 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 70 | y_test_pred, show_figure=True, save_figure=False) 71 | -------------------------------------------------------------------------------- /examples/gesd_example.py: -------------------------------------------------------------------------------- 1 | """Example of using the generalized extreme studentized deviate. 
2 | 3 | for outlier thresholding 4 | """ 5 | # Author: D Kulik 6 | # License: BSD 2 clause 7 | 8 | 9 | import os 10 | import sys 11 | 12 | from pyod.models.knn import KNN 13 | from pyod.utils.data import evaluate_print, generate_data 14 | from pyod.utils.example import visualize 15 | 16 | from pythresh.thresholds.gesd import GESD 17 | 18 | # temporary solution for relative imports in case pyod is not installed 19 | # if pyod is installed, no need to use the following line 20 | sys.path.append( 21 | os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))) 22 | 23 | 24 | if __name__ == '__main__': 25 | contamination = 0.1 # percentage of outliers 26 | n_train = 200 # number of training points 27 | n_test = 100 # number of testing points 28 | 29 | # Generate sample data 30 | X_train, X_test, y_train, y_test =\ 31 | generate_data(n_train=n_train, 32 | n_test=n_test, 33 | n_features=2, 34 | contamination=contamination, 35 | random_state=42) 36 | 37 | # train KNN detector 38 | clf_name = 'KNN' 39 | clf = KNN() 40 | clf.fit(X_train) 41 | thres = GESD() 42 | 43 | # get the prediction labels and outlier scores of the training data 44 | y_train_scores = clf.decision_scores_ # raw outlier scores 45 | 46 | # (Legacy method) 47 | # y_train_pred = thres.eval(y_train_scores) 48 | 49 | # binary labels (0: inliers, 1: outliers) 50 | thres.fit(y_train_scores) 51 | y_train_pred = thres.labels_ 52 | # or 53 | y_train_pred = thres.predict(y_train_scores) 54 | 55 | # get the prediction on the test data 56 | y_test_scores = clf.decision_function(X_test) # outlier scores 57 | 58 | # (Legacy method) 59 | # y_test_pred = thres.eval(y_test_scores) # outlier labels (0 or 1) 60 | y_test_pred = thres.predict(y_test_scores) 61 | 62 | # evaluate and print the results 63 | print('\nOn Training Data:') 64 | evaluate_print(clf_name, y_train, y_train_scores) 65 | print('\nOn Test Data:') 66 | evaluate_print(clf_name, y_test, y_test_scores) 67 | 68 | # visualize the results 69 | visualize(clf_name, X_train, X_test, y_train, y_test, y_train_pred, 70 | y_test_pred, show_figure=True, save_figure=False) 71 | -------------------------------------------------------------------------------- /pythresh/utils/rank_utility.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.decomposition import TruncatedSVD 3 | from sklearn.naive_bayes import GaussianNB 4 | 5 | 6 | def BREG_metric(x1, x2): 7 | """Calculate the Exponential Euclidean Bregman distance.""" 8 | 9 | gradient_x = np.exp(x1) - 1 10 | gradient_y = np.exp(x2) - 1 11 | 12 | distance = np.sum((x1 - x2) * (gradient_x - gradient_y)) 13 | 14 | return distance 15 | 16 | 17 | def mclain_rao_index(data, labels): 18 | """Calculate the McClain-Rao index.""" 19 | 20 | unique_labels = np.unique(labels) 21 | centroids = [] 22 | 23 | # Calculate the centroids of each cluster 24 | for label in unique_labels: 25 | cluster_data = data[labels == label] 26 | centroid = np.mean(cluster_data) 27 | centroids.append(centroid) 28 | 29 | num_clusters = len(centroids) 30 | mri = 0.0 31 | 32 | # Calculate the MRI 33 | for i in range(num_clusters): 34 | for j in range(i + 1, num_clusters): 35 | distance = (centroids[i] - centroids[j]) ** 2 36 | mri += distance 37 | 38 | # Normalize the MRI by the number of cluster pairs 39 | num_cluster_pairs = num_clusters * (num_clusters - 1) / 2 40 | mri /= num_cluster_pairs 41 | 42 | return mri 43 | 44 | 45 | def GNB_score(data, labels): 46 | """Calculate the Gaussian Naive-Bayes trained
consensus score.""" 47 | 48 | # Setup data for training 49 | X = np.tile(data, (len(labels), 1)) 50 | y = np.hstack(labels) 51 | 52 | # Fit model and predict 53 | model = GaussianNB() 54 | model.fit(X, y) 55 | 56 | pred = model.predict(data) 57 | 58 | # Find the deviation of each model from fitted GNB model 59 | dev = np.sum(np.abs(np.vstack(labels) - pred), axis=1) 60 | 61 | return dev.squeeze() 62 | 63 | 64 | def Contam_score(data, labels, contam): 65 | """Calculate the mean contamination deviation based on TruncatedSVD decomposed scores.""" 66 | 67 | # Fit model and transform data 68 | decomp = TruncatedSVD(n_components=1, random_state=1234) 69 | dat = decomp.fit_transform(np.vstack(data).T).squeeze() 70 | 71 | # Find the deviation of the contamination of each model from the decomposed model 72 | thr = np.zeros(len(labels[0])) 73 | thr[dat > np.percentile(dat, (1-np.mean(contam))*100)] = 1 74 | 75 | dev = np.sum(np.abs(np.vstack(labels) - thr), axis=1) 76 | 77 | return dev.squeeze() 78 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ${{ matrix.os }} 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | os: [ubuntu-latest, windows-latest, macos-latest] 21 | python-version: ["3.10", "3.11", "3.12", "3.13"] 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | with: 26 | fetch-depth: 0 27 | - name: Set up Python ${{ matrix.python-version }} 28 | uses: actions/setup-python@v6 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip wheel setuptools setuptools-scm 34 | python -m pip install flake8 pytest-cov mypy cython 35 | python -m pip install -r requirements-test.txt --use-pep517 36 | - name: Lint with flake8 37 | run: | 38 | # stop the build if there are Python syntax errors or undefined names 39 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 40 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 41 | flake8 . 
58 |   finalize:
59 |     needs: build
60 |     runs-on: ubuntu-latest
61 |     steps:
62 |     - name: Finalize Codecov uploads
63 |       uses: codecov/codecov-action@v5
64 |       with:
65 |         finalize: true
66 |       env:
67 |         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
68 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import re
2 | import sys
3 | # read the contents of README file
4 | from os import path
5 | 
6 | from setuptools import find_packages, setup
7 | 
8 | # get __version__ from version.py
9 | try:
10 |     verpath = path.join('pythresh', 'version.py')
11 |     with open(verpath) as version_file:
12 |         __version__ = str(re.findall(
13 |             r'\b\d+(?:\.\d+)+', version_file.read())[0])
14 | except Exception as error:
15 |     __version__ = '0.0.1'
16 |     sys.stderr.write("Warning: Could not open '%s' due to %s\n" %
17 |                      (verpath, error))
18 | 
19 | 
20 | this_directory = path.abspath(path.dirname(__file__))
21 | 
22 | 
23 | # read the contents of README.rst
24 | def readme():
25 |     with open(path.join(this_directory, 'README.rst'), encoding='utf-8') as f:
26 |         return f.read()
27 | 
28 | 
29 | # read the contents of requirements.txt
30 | with open(path.join(this_directory, 'requirements.txt'),
31 |           encoding='utf-8') as f:
32 |     requirements = f.read().splitlines()
33 | 
34 | setup(
35 |     name='pythresh',
36 |     version=__version__,
37 |     description='A Python Toolbox for Outlier Detection Thresholding',
38 |     long_description=readme(),
39 |     long_description_content_type='text/x-rst',
40 |     author='D Kulik',
41 |     url='https://github.com/KulikDM/pythresh',
42 |     download_url='https://github.com/KulikDM/pythresh/archive/master.zip',
43 |     keywords=['outlier detection', 'anomaly detection', 'thresholding', 'cutoff',
44 |               'contamination level', 'data science', 'machine learning'],
45 |     project_urls={
46 |         'Documentation': 'https://pythresh.readthedocs.io/en/latest/'},
47 |     packages=find_packages(exclude=['test']),
48 |     include_package_data=True,
49 |     install_requires=requirements,
50 |     setup_requires=['setuptools>=38.6.0'],
51 |     classifiers=[
52 |         'Development Status :: 5 - Production/Stable',
53 |         'Intended Audience :: Education',
54 |         'Intended Audience :: Financial and Insurance Industry',
55 |         'Intended Audience :: Science/Research',
56 |         'Intended Audience :: Developers',
57 |         'Intended Audience :: Information Technology',
58 |         'License :: OSI Approved :: BSD License',
59 |         'Programming Language :: Python :: 3.7',
60 |         'Programming Language :: Python :: 3.8',
61 |         'Programming Language :: Python :: 3.9',
62 |         'Programming Language :: Python :: 3.10',
63 |         'Programming Language :: Python :: 3.11',
64 |         'Programming Language :: Python :: 3.12',
65 |         'Programming Language :: Python :: 3.13',
66 |     ],
67 | )
68 | 
--------------------------------------------------------------------------------
/pythresh/test/test_rank.py:
-------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | from pyod.models.iforest import IForest 8 | from pyod.models.knn import KNN 9 | from pyod.models.pca import PCA 10 | from pyod.utils.data import generate_data 11 | 12 | from pythresh.thresholds.filter import FILTER 13 | from pythresh.thresholds.iqr import IQR 14 | from pythresh.thresholds.ocsvm import OCSVM 15 | from pythresh.utils.rank import RANK 16 | 17 | # temporary solution for relative imports in case pythresh is not installed 18 | # if pythresh is installed, no need to use the following line 19 | 20 | path = up(up(up(__file__))) 21 | sys.path.append(path) 22 | 23 | 24 | class TestRANK(unittest.TestCase): 25 | def setUp(self): 26 | self.n_train = 200 27 | self.n_test = 100 28 | self.contamination = 0.1 29 | self.X_train, self.X_test, self.y_train, self.y_test = generate_data( 30 | n_train=self.n_train, n_test=self.n_test, 31 | contamination=self.contamination, random_state=42) 32 | 33 | self.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 34 | 35 | self.thres = [FILTER(), self.contamination, 36 | [FILTER(), IQR(), OCSVM()]] 37 | 38 | self.method = ['model', 'native'] 39 | 40 | self.weights = [[0.5, 0.25, 0.25], 41 | [0.25, 0.5, 0.25], 42 | [0.25, 0.25, 0.5], 43 | None] 44 | 45 | def test_prediction_labels(self): 46 | 47 | params = product(self.thres, 48 | self.method, 49 | self.weights) 50 | 51 | for thres, method, weights in params: 52 | 53 | ranker = RANK(self.clfs, thres, method=method, weights=weights) 54 | rankings = ranker.eval(self.X_train) 55 | 56 | cdf_rank = ranker.cdf_rank_ 57 | clust_rank = ranker.clust_rank_ 58 | consensus_rank = ranker.consensus_rank_ 59 | 60 | assert (cdf_rank is not None) 61 | assert (clust_rank is not None) 62 | assert (consensus_rank is not None) 63 | assert (rankings is not None) 64 | 65 | n_clfs = len(self.clfs) 66 | n_thres = len(thres) if isinstance(thres, list) else 1 67 | len_models = n_clfs * n_thres 68 | 69 | assert (len(cdf_rank) == len_models) 70 | assert (len(clust_rank) == len_models) 71 | assert (len(consensus_rank) == len_models) 72 | assert (len(rankings) == len_models) 73 | 74 | assert (len(set(rankings)) == len_models) 75 | -------------------------------------------------------------------------------- /pythresh/thresholds/zscore.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as stats 3 | 4 | from .base import BaseThresholder 5 | 6 | 7 | class ZSCORE(BaseThresholder): 8 | r"""ZSCORE class for ZSCORE thresholder. 9 | 10 | Use the zscore to evaluate a non-parametric means to threshold 11 | scores generated by the decision_scores where outliers are set 12 | to any value beyond a zscore of one. 13 | See :cite:`bagdonavicius2020zscore` for details. 14 | 15 | Parameters 16 | ---------- 17 | 18 | factor : int, optional (default=1) 19 | The factor to multiply the zscore by to set the threshold. 20 | The default is 1. 21 | 22 | random_state : int, optional (default=1234) 23 | Random seed for the random number generators of the thresholders. Can also 24 | be set to None. 
25 | 
26 |     Attributes
27 |     ----------
28 | 
29 |     thresh_ : threshold value that separates inliers from outliers
30 | 
31 |     dscores_ : 1D array of decomposed decision scores
32 | 
33 |     Notes
34 |     -----
35 | 
36 |     The z-score can be calculated as follows:
37 | 
38 |     .. math::
39 | 
40 |        Z = \frac{x-\bar{x}}{\sigma} \mathrm{,}
41 | 
42 |     where :math:`\bar{x}` and :math:`\sigma` are the mean and the
43 |     standard deviation of the decision scores respectively. The threshold
44 |     is set such that any value beyond a z-score of the factor (1 by
45 |     default) is considered an outlier.
46 | 
47 |     """
48 | 
49 |     def __init__(self, factor=1, random_state=1234):
50 | 
51 |         super().__init__()
52 |         self.factor = factor
53 |         self.random_state = random_state
54 |         np.random.seed(random_state)
55 | 
56 |     def eval(self, decision):
57 |         """Outlier/inlier evaluation process for decision scores.
58 | 
59 |         Parameters
60 |         ----------
61 |         decision : np.array or list of shape (n_samples)
62 |                    or np.array of shape (n_samples, n_detectors)
63 |                    which are the decision scores from an
64 |                    outlier detection.
65 | 
66 |         Returns
67 |         -------
68 |         outlier_labels : numpy array of shape (n_samples,)
69 |             For each observation, tells whether or not
70 |             it should be considered as an outlier according to the
71 |             fitted model. 0 stands for inliers and 1 for outliers.
72 |         """
73 | 
74 |         decision = self._data_setup(decision)
75 | 
76 |         # Get the zscore of the decision scores
77 |         zscore = stats.zscore(decision)
78 | 
79 |         # Set the limit to where the zscore is greater than the factor
80 |         labels = np.zeros(len(decision), dtype=int)
81 |         mask = np.where(zscore >= self.factor)
82 |         labels[mask] = 1
83 | 
84 |         # Guard against the edge case where no score reaches the factor
85 |         eps = np.finfo(decision.dtype).eps
86 |         self.thresh_ = (np.min(decision[labels == 1]) if labels.any()
87 |                         else 1.0 + eps)
88 | 
89 |         return labels
90 | 
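91 | 
92 | # A minimal usage sketch (illustrative only; the synthetic scores are an
93 | # assumption for demonstration, not library test data). Run with:
94 | #   python -m pythresh.thresholds.zscore
95 | if __name__ == '__main__':  # pragma: no cover
96 |     np.random.seed(1234)
97 |     scores = np.hstack([np.random.normal(0.0, 1.0, 95),
98 |                         np.random.normal(6.0, 1.0, 5)])
99 |     print(ZSCORE(factor=2).eval(scores))  # should flag the five shifted scores
100 | 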
--------------------------------------------------------------------------------
/pythresh/thresholds/mad.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.stats as stats
3 | 
4 | from .base import BaseThresholder
5 | from .thresh_utility import cut
6 | 
7 | 
8 | class MAD(BaseThresholder):
9 |     r"""MAD class for Median Absolute Deviation thresholder.
10 | 
11 |     Use the median absolute deviation to evaluate a non-parametric
12 |     means to threshold scores generated by the decision_scores
13 |     where outliers are set to any value beyond the mean plus the
14 |     median absolute deviation over the standard deviation.
15 |     See :cite:`archana2015mad` for details.
16 | 
17 |     Parameters
18 |     ----------
19 | 
20 |     factor : int, optional (default=1)
21 |         The factor to multiply the MAD by to set the threshold.
22 |         The default is 1.
23 | 
24 |     random_state : int, optional (default=1234)
25 |         Random seed for the random number generators of the thresholders. Can also
26 |         be set to None.
27 | 
28 |     Attributes
29 |     ----------
30 | 
31 |     thresh_ : threshold value that separates inliers from outliers
32 | 
33 |     dscores_ : 1D array of decomposed decision scores
34 | 
35 |     Notes
36 |     -----
37 | 
38 |     The median absolute deviation is defined as:
39 | 
40 |     .. math::
41 | 
42 |        MAD = med\lvert x - med(x)\rvert \mathrm{.}
43 | 
44 |     And the threshold is set such that:
45 | 
46 |     .. math::
47 | 
48 |        \mathrm{lim} = \bar{x} + \frac{MAD}{\sigma} \mathrm{,}
49 | 
50 |     where :math:`\bar{x}` and :math:`\sigma` are the mean and
51 |     standard deviation of the scores respectively.
52 | 
53 |     """
54 | 
55 |     def __init__(self, factor=1, random_state=1234):
56 | 
57 |         super().__init__()
58 |         self.factor = factor
59 |         self.random_state = random_state
60 |         np.random.seed(random_state)
61 | 
62 |     def eval(self, decision):
63 |         """Outlier/inlier evaluation process for decision scores.
64 | 
65 |         Parameters
66 |         ----------
67 |         decision : np.array or list of shape (n_samples)
68 |                    or np.array of shape (n_samples, n_detectors)
69 |                    which are the decision scores from an
70 |                    outlier detection.
71 | 
72 |         Returns
73 |         -------
74 |         outlier_labels : numpy array of shape (n_samples,)
75 |             For each observation, tells whether or not
76 |             it should be considered as an outlier according to the
77 |             fitted model. 0 stands for inliers and 1 for outliers.
78 |         """
79 | 
80 |         decision = self._data_setup(decision)
81 | 
82 |         # Set limit
83 |         mean = np.mean(decision)
84 |         mad = stats.median_abs_deviation(decision, scale=np.std(decision))
85 |         limit = mean + self.factor * mad
86 | 
87 |         self.thresh_ = limit
88 | 
89 |         return cut(decision, limit)
90 | 
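91 | 
92 | # A minimal usage sketch (illustrative only; the synthetic scores are an
93 | # assumption for demonstration, not library test data). Run with:
94 | #   python -m pythresh.thresholds.mad
95 | if __name__ == '__main__':  # pragma: no cover
96 |     np.random.seed(1234)
97 |     scores = np.hstack([np.random.normal(0.0, 1.0, 95),
98 |                         np.random.normal(8.0, 0.5, 5)])
99 |     mad = MAD()
100 |     print(mad.eval(scores), mad.thresh_)
101 | 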
72 | """ 73 | 74 | decision = self._data_setup(decision) 75 | 76 | arg_map = {'old': 'interpolation', 'new': 'method'} 77 | arg_name = (arg_map['new'] if 'method' in 78 | inspect.signature(np.percentile).parameters 79 | else arg_map['old']) 80 | 81 | # First quartile (Q1) 82 | P1 = np.percentile(decision, 25, **{arg_name: 'midpoint'}) 83 | 84 | # Third quartile (Q3) 85 | P3 = np.percentile(decision, 75, **{arg_name: 'midpoint'}) 86 | 87 | # Calculate IQR and generate limit 88 | iqr = abs(P3-P1) 89 | limit = P3 + 1.5*iqr 90 | 91 | self.thresh_ = limit 92 | 93 | return cut(decision, limit) 94 | -------------------------------------------------------------------------------- /pythresh/thresholds/fwfm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.signal import find_peaks, peak_widths 3 | 4 | from .base import BaseThresholder 5 | from .thresh_utility import cut, gen_kde, normalize 6 | 7 | 8 | class FWFM(BaseThresholder): 9 | """FWFM class for Full Width at Full Minimum thresholder. 10 | 11 | Use the full width at full minimum (aka base width) to evaluate 12 | a non-parametric means to threshold scores generated by the 13 | decision_scores where outliers are set to any value beyond the base 14 | width. See :cite:`joneidi2013fwfm` for details. 15 | 16 | Parameters 17 | ---------- 18 | 19 | random_state : int, optional (default=1234) 20 | Random seed for the random number generators of the thresholders. Can also 21 | be set to None. 22 | 23 | Attributes 24 | ---------- 25 | 26 | thresh_ : threshold value that separates inliers from outliers 27 | 28 | dscores_ : 1D array of decomposed decision scores 29 | 30 | Notes 31 | ----- 32 | 33 | The outlier detection scores are assumed to be a mixture of Gaussian 34 | distributions. The probability density function of this Gaussian mixture 35 | is approximated using kernel density estimation. The highest peak within the 36 | PDF is used to find the base width of the mixture and the threshold is set 37 | to the base width divided by the number of scores. 38 | """ 39 | 40 | def __init__(self, random_state=1234): 41 | 42 | super().__init__() 43 | self.random_state = random_state 44 | np.random.seed(random_state) 45 | 46 | def eval(self, decision): 47 | """Outlier/inlier evaluation process for decision scores. 48 | 49 | Parameters 50 | ---------- 51 | decision : np.array or list of shape (n_samples) 52 | or np.array of shape (n_samples, n_detectors) 53 | which are the decision scores from a 54 | outlier detection. 55 | 56 | Returns 57 | ------- 58 | outlier_labels : numpy array of shape (n_samples,) 59 | For each observation, tells whether or not 60 | it should be considered as an outlier according to the 61 | fitted model. 0 stands for inliers and 1 for outliers. 
62 | """ 63 | 64 | decision = self._data_setup(decision) 65 | 66 | # Generate KDE 67 | val, _ = gen_kde(decision, -1, 1, len(decision)*3) 68 | val = normalize(val) 69 | 70 | # Find the greatest peak of the KDE 71 | peaks, _ = find_peaks(val, prominence=0.75) 72 | 73 | # Find the base width of the peak 74 | base_width = peak_widths(val, peaks, rel_height=0.99)[0] 75 | 76 | # Normalize and set limit 77 | eps = np.finfo(decision.dtype).eps 78 | limit = base_width[0]/len(val) if len(base_width) > 0 else 1.0 + eps 79 | 80 | self.thresh_ = limit 81 | 82 | return cut(decision, limit) 83 | -------------------------------------------------------------------------------- /pythresh/thresholds/fgd.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .base import BaseThresholder 4 | from .thresh_utility import cut, gen_kde 5 | 6 | 7 | class FGD(BaseThresholder): 8 | """FGD class for Fixed Gradient Descent thresholder. 9 | 10 | Use the fixed gradient descent to evaluate a non-parametric means 11 | to threshold scores generated by the decision_scores where outliers 12 | are set to any value beyond where the first derivative of the kde 13 | with respect to the decision scores passes the mean of the first 14 | and second inflection points. See :cite:`qi2021fgd` for details. 15 | 16 | Parameters 17 | ---------- 18 | 19 | random_state : int, optional (default=1234) 20 | Random seed for the random number generators of the thresholders. Can also 21 | be set to None. 22 | 23 | Attributes 24 | ---------- 25 | 26 | thresh_ : threshold value that separates inliers from outliers 27 | 28 | dscores_ : 1D array of decomposed decision scores 29 | 30 | Notes 31 | ----- 32 | 33 | A probability distribution of the decision scores is generated using 34 | kernel density estimation. The first derivative of the pdf is 35 | calculated, and the threshold is set as the middle point between the 36 | first and second inflection points starting from the left side of the 37 | data range. 38 | """ 39 | 40 | def __init__(self, random_state=1234): 41 | 42 | super().__init__() 43 | self.random_state = random_state 44 | np.random.seed(random_state) 45 | 46 | def eval(self, decision): 47 | """Outlier/inlier evaluation process for decision scores. 48 | 49 | Parameters 50 | ---------- 51 | decision : np.array or list of shape (n_samples) 52 | or np.array of shape (n_samples, n_detectors) 53 | which are the decision scores from a 54 | outlier detection. 55 | 56 | Returns 57 | ------- 58 | outlier_labels : numpy array of shape (n_samples,) 59 | For each observation, tells whether or not 60 | it should be considered as an outlier according to the 61 | fitted model. 0 stands for inliers and 1 for outliers. 
62 | """ 63 | 64 | decision = self._data_setup(decision) 65 | 66 | # Generate KDE 67 | val, dat_range = gen_kde(decision, 0, 1, len(decision)*3) 68 | 69 | # Calculate the first derivative of the KDE with respect 70 | # to the data range 71 | deriv = np.gradient(val, dat_range[1]-dat_range[0]) 72 | 73 | count = 0 74 | ind = [] 75 | 76 | # Find the first two inflection points 77 | for i in range(len(deriv)-1): 78 | 79 | if (deriv[i] > 0) & (deriv[i+1] <= 0): 80 | count += 1 81 | ind.append(i) 82 | if count == 2: 83 | break 84 | 85 | eps = np.finfo(decision.dtype).eps 86 | 87 | limit = ((dat_range[ind[0]]+dat_range[ind[1]])/2 if 88 | len(ind) > 1 else 1.0 + eps) 89 | 90 | self.thresh_ = limit 91 | 92 | return cut(decision, limit) 93 | -------------------------------------------------------------------------------- /pythresh/thresholds/thresh_utility.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as stats 3 | from scipy.interpolate import interp1d 4 | from scipy.special import ndtr 5 | from sklearn.decomposition import TruncatedSVD 6 | from sklearn.utils import check_array 7 | 8 | 9 | def get_min_max(data): 10 | 11 | min_val = np.min(data, axis=0) 12 | max_val = np.max(data, axis=0) 13 | 14 | return min_val, max_val 15 | 16 | 17 | def normalize(data, min_val=None, max_val=None): 18 | 19 | if min_val is None or max_val is None: 20 | min_val, max_val = get_min_max(data) 21 | 22 | normed = (data - min_val) / (max_val - min_val) 23 | 24 | return normed 25 | 26 | 27 | def cut(decision, limit): 28 | 29 | labels = np.zeros(len(decision), dtype=int) 30 | 31 | labels[decision >= limit] = 1 32 | 33 | return labels 34 | 35 | 36 | def gen_interp(x, y): 37 | 38 | interpolator = interp1d(x, y, kind='cubic', 39 | fill_value='extrapolate') 40 | 41 | return interpolator 42 | 43 | 44 | def gen_kde(data, lower, upper, size): 45 | 46 | insize = min(size, 5000) 47 | 48 | # Create a KDE of the data 49 | kde = stats.gaussian_kde(data) 50 | dat_range = np.linspace(lower, upper, insize) 51 | dat_eval = np.linspace(lower, upper, size) 52 | 53 | # Use interpolation for fast KDE upsampling 54 | if size > insize: 55 | interpolator = gen_interp(dat_range, kde(dat_range)) 56 | return interpolator(dat_eval), dat_eval 57 | 58 | return kde(dat_eval), dat_eval 59 | 60 | 61 | def gen_cdf(data, lower, upper, size): 62 | 63 | insize = min(size, 5000) 64 | 65 | # Create a KDE & CDF of the data 66 | kde = stats.gaussian_kde(data) 67 | dat_range = np.linspace(lower, upper, insize) 68 | dat_eval = np.linspace(lower, upper, size) 69 | 70 | cdf = np.array(tuple(ndtr(np.ravel(item - kde.dataset) / kde.factor).mean() 71 | for item in dat_range)) 72 | 73 | # Use interpolation for fast CDF upsampling 74 | if size > insize: 75 | interpolator = gen_interp(dat_range, cdf) 76 | return interpolator(dat_eval), dat_eval 77 | 78 | return cdf, dat_eval 79 | 80 | 81 | def check_scores(decision, decomp=None, min_val=None, max_val=None, random_state=1234): 82 | 83 | # Check decision scores dimensionality and pre-process 84 | if (np.asarray(decision).ndim == 2) & (np.atleast_2d(decision).shape[1] > 1): 85 | 86 | decision = check_array(decision, ensure_2d=True) 87 | decision = normalize(decision, min_val, max_val) 88 | decision, decomp = decompose(decision, decomp, random_state) 89 | 90 | else: 91 | decision = check_array(decision, ensure_2d=False) 92 | 93 | return decision.squeeze(), decomp 94 | 95 | 96 | def decompose(data, decomp=None, random_state=1234): 97 | 98 | # 
Decompose decision scores to 1D array for thresholding
99 |     if decomp is None:
100 |         decomp = TruncatedSVD(n_components=1, random_state=random_state)
101 |         data = decomp.fit_transform(data)
102 |     else:
103 |         data = decomp.transform(data)
104 | 
105 |     return data, decomp
106 | 
--------------------------------------------------------------------------------
/pythresh/thresholds/cpd.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import ruptures as rpt
3 | 
4 | from .base import BaseThresholder
5 | from .thresh_utility import cut, gen_cdf, gen_kde
6 | 
7 | 
8 | class CPD(BaseThresholder):
9 |     r"""CPD class for Change Point Detection thresholder.
10 | 
11 |     Use change point detection to find a non-parametric means
12 |     to threshold scores generated by the decision_scores where outliers
13 |     are set to any value beyond the detected change point.
14 |     See :cite:`fearnhead2016cpd` for details.
15 | 
16 |     Parameters
17 |     ----------
18 | 
19 |     method : {'Dynp', 'KernelCPD', 'Binseg', 'BottomUp'}, optional (default='Dynp')
20 |         Method for change point detection
21 | 
22 |         - 'Dynp': Dynamic programming (optimal minimum sum of errors per partition)
23 |         - 'KernelCPD': RBF kernel function (optimal minimum sum of errors per partition)
24 |         - 'Binseg': Binary segmentation
25 |         - 'BottomUp': Bottom-up segmentation
26 | 
27 |     transform : {'cdf', 'kde'}, optional (default='cdf')
28 |         Data transformation method prior to fit
29 | 
30 |         - 'cdf': Use the cumulative distribution function
31 |         - 'kde': Use the kernel density estimation
32 | 
33 |     random_state : int, optional (default=1234)
34 |         Random seed for the random number generators of the thresholders. Can also
35 |         be set to None.
36 | 
37 |     Attributes
38 |     ----------
39 | 
40 |     thresh_ : threshold value that separates inliers from outliers
41 | 
42 |     dscores_ : 1D array of decomposed decision scores
43 | 
44 |     """
45 | 
46 |     def __init__(self, method='Dynp', transform='cdf', random_state=1234):
47 | 
48 |         super().__init__()
49 |         self.method = method
50 |         self.transform = transform
51 |         self.method_func = {'Dynp': rpt.Dynp(), 'KernelCPD': rpt.KernelCPD(kernel='rbf'),
52 |                             'Binseg': rpt.Binseg(), 'BottomUp': rpt.BottomUp()}
53 |         self.random_state = random_state
54 |         np.random.seed(random_state)
55 | 
56 |     def eval(self, decision):
57 |         """Outlier/inlier evaluation process for decision scores.
58 | 
59 |         Parameters
60 |         ----------
61 |         decision : np.array or list of shape (n_samples)
62 |                    or np.array of shape (n_samples, n_detectors)
63 |                    which are the decision scores from an
64 |                    outlier detection.
65 | 
66 |         Returns
67 |         -------
68 |         outlier_labels : numpy array of shape (n_samples,)
69 |             For each observation, tells whether or not
70 |             it should be considered as an outlier according to the
71 |             fitted model. 0 stands for inliers and 1 for outliers.
72 |         """
73 | 
74 |         decision = self._data_setup(decision)
75 | 
76 |         # Transform data prior to fit
77 |         if self.transform == 'cdf':
78 |             val_data, data_range = gen_cdf(decision, 0, 1, len(decision)*3)
79 |         else:
80 |             val_data, data_range = gen_kde(decision, 0, 1, len(decision)*3)
81 | 
82 |         # Change point detection
83 |         det = self.method_func[self.method].fit(val_data)
84 |         change = det.predict(n_bkps=1)
85 | 
86 |         # Set limit at change point
87 |         limit = data_range[change[0]]
88 |         self.thresh_ = limit
89 | 
90 |         return cut(decision, limit)
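91 | 
92 | 
93 | # A minimal usage sketch (illustrative only; the synthetic scores are an
94 | # assumption for demonstration, not library test data, and ruptures must
95 | # be installed). Run with:
96 | #   python -m pythresh.thresholds.cpd
97 | if __name__ == '__main__':  # pragma: no cover
98 |     np.random.seed(1234)
99 |     scores = np.hstack([np.random.normal(0.0, 1.0, 95),
100 |                         np.random.normal(8.0, 0.5, 5)])
101 |     print(CPD(method='Binseg').eval(scores))
102 | 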
72 | """ 73 | 74 | decision = self._data_setup(decision) 75 | 76 | # Transform data prior to fit 77 | if self.transform == 'cdf': 78 | val_data, data_range = gen_cdf(decision, 0, 1, len(decision)*3) 79 | else: 80 | val_data, data_range = gen_kde(decision, 0, 1, len(decision)*3) 81 | 82 | # Change point detection 83 | det = self.method_func[self.method].fit(val_data) 84 | change = det.predict(n_bkps=1) 85 | 86 | # Set limit at change point 87 | limit = data_range[change[0]] 88 | self.thresh_ = limit 89 | 90 | return cut(decision, limit) 91 | -------------------------------------------------------------------------------- /pythresh/thresholds/mtt.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as stats 3 | 4 | from .base import BaseThresholder 5 | from .thresh_utility import cut 6 | 7 | # https://github.com/vvaezian/modified_thompson_tau_test/blob/main/src/Modified_Thompson_Tau_Test/modified_thompson_tau_test.py 8 | 9 | 10 | class MTT(BaseThresholder): 11 | r"""MTT class for Modified Thompson Tau test thresholder. 12 | 13 | Use the modified Thompson Tau test to evaluate a non-parametric means 14 | to threshold scores generated by the decision_scores where outliers 15 | are set to any value beyond the smallest outlier detected by the test. 16 | See :cite:`rengasamy2020mtt` for details. 17 | 18 | Parameters 19 | ---------- 20 | 21 | alpha : float, optional (default=0.01) 22 | Confidence level corresponding to the t-Student distribution map to sample 23 | 24 | random_state : int, optional (default=1234) 25 | Random seed for the random number generators of the thresholders. Can also 26 | be set to None. 27 | 28 | Attributes 29 | ---------- 30 | 31 | thresh_ : threshold value that separates inliers from outliers 32 | 33 | dscores_ : 1D array of decomposed decision scores 34 | 35 | Notes 36 | ----- 37 | 38 | The Modified Thompson Tau test is a modified univariate t-test that eliminates outliers 39 | that are more than a number of standard deviations away from the mean. This method is 40 | done iteratively with the Tau critical value being recalculated after each outlier removal 41 | until the dataset no longer has data points that fall outside of the criterion. The Tau 42 | critical value can be obtained by, 43 | 44 | .. math:: 45 | 46 | \tau = \frac{t \cdot (n-1)}{\sqrt{n}\sqrt{n-2+t^2}} \mathrm{,} 47 | 48 | where :math:`n` is the number of data points and :math:`t` is the student t-value 49 | 50 | """ 51 | 52 | def __init__(self, alpha=0.01, random_state=1234): 53 | 54 | super().__init__() 55 | self.alpha = alpha if alpha <= 0.5 else 1 - alpha 56 | self.random_state = random_state 57 | np.random.seed(random_state) 58 | 59 | def eval(self, decision): 60 | """Outlier/inlier evaluation process for decision scores. 61 | 62 | Parameters 63 | ---------- 64 | decision : np.array or list of shape (n_samples) 65 | or np.array of shape (n_samples, n_detectors) 66 | which are the decision scores from a 67 | outlier detection. 68 | 69 | Returns 70 | ------- 71 | outlier_labels : numpy array of shape (n_samples,) 72 | For each observation, tells whether or not 73 | it should be considered as an outlier according to the 74 | fitted model. 0 stands for inliers and 1 for outliers. 
75 | """ 76 | 77 | decision = self._data_setup(decision) 78 | 79 | arr = np.sort(decision.copy()) 80 | 81 | eps = np.finfo(decision.dtype).eps 82 | limit = 1.0 + eps 83 | 84 | while True: 85 | 86 | # Calculate the rejection threshold 87 | n = len(arr) 88 | t = stats.t.ppf(1-self.alpha, df=n-2) 89 | thres = (t * (n - 1))/(np.sqrt(n) * np.sqrt(n - 2 + t**2)) 90 | delta = np.abs(arr[-1] - arr.mean())/arr.std() 91 | 92 | if delta > thres: 93 | limit = arr[-1] 94 | arr = np.delete(arr, n-1) 95 | 96 | else: 97 | break 98 | 99 | self.thresh_ = limit 100 | 101 | return cut(decision, limit) 102 | -------------------------------------------------------------------------------- /pythresh/test/test_eb.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.eb import EB 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestEB(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = EB() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /pythresh/test/test_yj.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.yj import YJ 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestYJ(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = YJ() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /pythresh/test/test_fgd.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.fgd import FGD 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestFGD(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = FGD() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /pythresh/test/test_iqr.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.iqr import IQR 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestIQR(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = IQR() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /pythresh/test/test_aucp.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.aucp import AUCP 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestAUCP(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = AUCP() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /pythresh/test/test_boot.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.boot import BOOT 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestBOOT(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = BOOT() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /pythresh/test/test_fwfm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.fwfm import FWFM 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestFWFM(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = FWFM() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /pythresh/test/test_mcst.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.mcst import MCST 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestMCST(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = MCST() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /pythresh/test/test_moll.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.moll import MOLL 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestMOLL(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = MOLL() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /pythresh/test/test_wind.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from os.path import dirname as up 4 | 5 | # noinspection PyProtectedMember 6 | import joblib 7 | import numpy as np 8 | from numpy.testing import assert_equal 9 | from pyod.models.iforest import IForest 10 | from pyod.models.knn import KNN 11 | from pyod.models.pca import PCA 12 | from pyod.utils.data import generate_data 13 | 14 | from pythresh.thresholds.wind import WIND 15 | 16 | # temporary solution for relative imports in case pythresh is not installed 17 | # if pythresh is installed, no need to use the following line 18 | 19 | path = up(up(up(__file__))) 20 | sys.path.append(path) 21 | 22 | 23 | class TestWIND(unittest.TestCase): 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | cls.n_train = 200 28 | cls.n_test = 100 29 | cls.contamination = 0.1 30 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 31 | n_train=cls.n_train, n_test=cls.n_test, 32 | contamination=cls.contamination, random_state=42) 33 | 34 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 35 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 36 | cls.multiple_scores = np.vstack([ 37 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 38 | ]).T 39 | cls.all_scores = [cls.single_score, cls.multiple_scores] 40 | 41 | def setUp(self): 42 | self.thres = WIND() 43 | 44 | def check_labels(self, labels, scores_shape): 45 | self.assertEqual(labels.shape, scores_shape[:1]) 46 | self.assertIn(labels.min(), [0, 1]) 47 | self.assertIn(labels.max(), [0, 1]) 48 | 49 | def check_fitted_attributes(self, thres): 50 | self.assertTrue(thres.__sklearn_is_fitted__()) 51 | self.assertIsNotNone(thres.labels_) 52 | self.assertIsNotNone(thres.thresh_) 53 | 54 | def test_eval(self): 55 | for scores in self.all_scores: 56 | pred_labels = self.thres.eval(scores) 57 | 58 | self.assertIsNotNone(self.thres.thresh_) 59 | self.assertIsNotNone(self.thres.dscores_) 60 | self.assertGreaterEqual(self.thres.dscores_.min(), 0) 61 | self.assertLessEqual(self.thres.dscores_.max(), 1) 62 | self.check_labels(pred_labels, scores.shape) 63 | 64 | def test_fit(self): 65 | for scores in self.all_scores: 66 | self.thres.fit(scores) 67 | self.check_fitted_attributes(self.thres) 68 | self.check_labels(self.thres.labels_, scores.shape) 69 | 70 | def test_predict(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | pred_labels = self.thres.predict(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(pred_labels, scores.shape) 76 | assert_equal(self.thres.labels_, pred_labels) 77 | 78 | def test_test_data(self): 79 | for scores, test_scores in zip(self.all_scores, [ 80 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 81 | 
np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 82 | for clf in self.clfs]).T 83 | ]): 84 | self.thres.fit(scores) 85 | pred_labels = self.thres.predict(test_scores) 86 | self.check_fitted_attributes(self.thres) 87 | self.check_labels(pred_labels, test_scores.shape) 88 | 89 | def test_save_and_load(self): 90 | for scores in self.all_scores: 91 | self.thres.fit(scores) 92 | joblib.dump(self.thres, 'model.pkl') 93 | loaded_thres = joblib.load('model.pkl') 94 | 95 | assert_equal(self.thres.predict(scores), 96 | loaded_thres.predict(scores)) 97 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /pythresh/thresholds/boot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as stats 3 | 4 | from .base import BaseThresholder 5 | from .thresh_utility import cut 6 | 7 | 8 | class BOOT(BaseThresholder): 9 | r"""BOOT class for Bootstrapping thresholder. 10 | 11 | Use a bootstrapping-based method as a non-parametric means 12 | to threshold the decision scores, where outliers 13 | are set to any value beyond the mean of the confidence interval bounds. 14 | See :cite:`martin2006boot` for details. 15 | 16 | Parameters 17 | ---------- 18 | random_state : int, optional (default=1234) 19 | Random seed for bootstrapping a confidence interval. Can also be set to None. 20 | 21 | Attributes 22 | ---------- 23 | 24 | thresh_ : threshold value that separates inliers from outliers 25 | 26 | dscores_ : 1D array of decomposed decision scores 27 | 28 | Notes 29 | ----- 30 | 31 | The two-sided bias-corrected and accelerated (BCa) bootstrap confidence interval 32 | is calculated with a confidence level of 0.95. The statistic used for 33 | the confidence interval is the standard deviation of the decision 34 | scores, with the statistic treating corresponding elements of the 35 | samples in the decision scores as paired. 36 | 37 | The returned upper and lower confidence intervals are used to threshold 38 | the decision scores. Outliers are set to any value above the mean of the 39 | upper and lower confidence intervals. 40 | 41 | Examples 42 | -------- 43 | Randomness can significantly affect the thresholder's output performance. 44 | Therefore, to alleviate the effects of randomness on the 45 | thresholder, a combined model can be used with different random_state values. 46 | E.g. 47 | 48 | .. 
code:: python 49 | 50 | # train the KNN detector 51 | from pyod.models.knn import KNN 52 | from pythresh.thresholds.comb import COMB 53 | from pythresh.thresholds.boot import BOOT 54 | 55 | clf = KNN() 56 | clf.fit(X_train) 57 | 58 | # get outlier scores 59 | decision_scores = clf.decision_scores_ # raw outlier scores 60 | 61 | # get outlier labels with combined model 62 | thres = COMB(thresholders=[BOOT(random_state=1234), 63 | BOOT(random_state=42), BOOT(random_state=9685), 64 | BOOT(random_state=111222)]) 65 | labels = thres.eval(decision_scores) 66 | 67 | """ 68 | 69 | def __init__(self, random_state=1234): 70 | 71 | super().__init__() 72 | self.random_state = random_state 73 | np.random.seed(random_state) 74 | 75 | def eval(self, decision): 76 | """Outlier/inlier evaluation process for decision scores. 77 | 78 | Parameters 79 | ---------- 80 | decision : np.array or list of shape (n_samples) 81 | or np.array of shape (n_samples, n_detectors) 82 | which are the decision scores from an 83 | outlier detector. 84 | 85 | Returns 86 | ------- 87 | outlier_labels : numpy array of shape (n_samples,) 88 | For each observation, tells whether or not 89 | it should be considered as an outlier according to the 90 | fitted model. 0 stands for inliers and 1 for outliers. 91 | """ 92 | 93 | decision = self._data_setup(decision) 94 | 95 | limit1, limit2 = stats.bootstrap( 96 | decision.reshape(1, -1), 97 | np.std, 98 | paired=True, 99 | random_state=self.random_state 100 | ).confidence_interval 101 | 102 | self.thresh_ = (limit1+limit2)/2 103 | 104 | return cut(decision, (limit1+limit2)/2) 105 | 
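For clarity, the interval-to-threshold logic described in BOOT's Notes can also be sketched outside the class. The snippet below is a minimal, self-contained illustration rather than part of the package; the synthetic scores stand in for a detector's decision scores:

.. code:: python

    import numpy as np
    import scipy.stats as stats

    rng = np.random.default_rng(1234)
    scores = rng.normal(size=200)  # stand-in for decision scores

    # two-sided BCa interval (default confidence level 0.95) of the
    # standard deviation of the scores, mirroring BOOT.eval above
    low, high = stats.bootstrap(scores.reshape(1, -1), np.std,
                                paired=True,
                                random_state=1234).confidence_interval

    # any score above the mean of the interval bounds is an outlier
    thresh = (low + high) / 2
    labels = (scores > thresh).astype(int)
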
-------------------------------------------------------------------------------- /pythresh/thresholds/regr.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as stats 3 | 4 | from .base import BaseThresholder 5 | from .thresh_utility import cut, normalize 6 | 7 | 8 | class REGR(BaseThresholder): 9 | """REGR class for Regression based thresholder. 10 | 11 | Use robust regression as a non-parametric means 12 | to threshold the decision scores, where outliers 13 | are set to any value beyond the y-intercept of the linear fit. 14 | See :cite:`aggarwal2017clf` for details. 15 | 16 | Parameters 17 | ---------- 18 | 19 | method : {'siegel', 'theil'}, optional (default='siegel') 20 | Regression-based method to calculate the y-intercept 21 | 22 | - 'siegel': implements a method for robust linear regression using repeated medians 23 | - 'theil': implements a method for robust linear regression using paired values 24 | 25 | random_state : int, optional (default=1234) 26 | Random seed for the normal distribution. Can also be set to None. 27 | 28 | Attributes 29 | ---------- 30 | 31 | thresh_ : threshold value that separates inliers from outliers 32 | 33 | Examples 34 | -------- 35 | Randomness can significantly affect the thresholder's output performance. 36 | Therefore, to alleviate the effects of randomness on the 37 | thresholder, a combined model can be used with different random_state values. 38 | E.g. 39 | 40 | .. code:: python 41 | 42 | # train the KNN detector 43 | from pyod.models.knn import KNN 44 | from pythresh.thresholds.comb import COMB 45 | from pythresh.thresholds.regr import REGR 46 | 47 | clf = KNN() 48 | clf.fit(X_train) 49 | 50 | # get outlier scores 51 | decision_scores = clf.decision_scores_ # raw outlier scores 52 | 53 | # get outlier labels with combined model 54 | thres = COMB(thresholders=[REGR(random_state=1234), 55 | REGR(random_state=42), REGR(random_state=9685), 56 | REGR(random_state=111222)]) 57 | labels = thres.eval(decision_scores) 58 | """ 59 | 60 | def __init__(self, method='siegel', random_state=1234): 61 | 62 | super().__init__() 63 | self.method = method 64 | self.random_state = random_state 65 | np.random.seed(random_state) 66 | 67 | def eval(self, decision): 68 | """Outlier/inlier evaluation process for decision scores. 69 | 70 | Parameters 71 | ---------- 72 | decision : np.array or list of shape (n_samples) 73 | or np.array of shape (n_samples, n_detectors) 74 | which are the decision scores from an 75 | outlier detector. 76 | 77 | Returns 78 | ------- 79 | outlier_labels : numpy array of shape (n_samples,) 80 | For each observation, tells whether or not 81 | it should be considered as an outlier according to the 82 | fitted model. 0 stands for inliers and 1 for outliers. 83 | """ 84 | decision = self._data_setup(decision) 85 | 86 | # Create a normal distribution and normalize 87 | norm = np.random.default_rng(self.random_state).normal( 88 | loc=0.0, scale=1.0, size=decision.shape) 89 | norm = normalize(norm) 90 | 91 | # Set limit to the y-intercept 92 | try: 93 | if self.method == 'siegel': 94 | res = stats.siegelslopes(norm, decision) 95 | elif self.method == 'theil': 96 | res = stats.theilslopes(norm, decision) 97 | except MemoryError: 98 | res = [0.0, 1.0] 99 | 100 | limit = res[1] 101 | 102 | self.thresh_ = limit 103 | 104 | return cut(decision, limit) 105 | 
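The y-intercept cutoff in REGR is easier to see in isolation: a normalized normal sample is regressed onto the (normalized) decision scores, and the intercept of the robust fit becomes the threshold. The following is a minimal sketch, with synthetic scores and min-max normalization standing in for `thresh_utility.normalize`:

.. code:: python

    import numpy as np
    import scipy.stats as stats

    rng = np.random.default_rng(1234)
    scores = rng.random(200)  # stand-in for normalized decision scores

    # normalized normal sample used as the regression target
    norm = rng.normal(loc=0.0, scale=1.0, size=scores.shape)
    norm = (norm - norm.min()) / (norm.max() - norm.min())

    # robust linear fit via repeated medians (the 'siegel' method)
    slope, intercept = stats.siegelslopes(norm, scores)

    # any score beyond the y-intercept is an outlier
    labels = (scores > intercept).astype(int)
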
-------------------------------------------------------------------------------- /pythresh/thresholds/yj.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as stats 3 | 4 | from .base import BaseThresholder 5 | from .thresh_utility import cut, gen_kde 6 | 7 | 8 | class YJ(BaseThresholder): 9 | r"""YJ class for Yeo-Johnson transformation thresholder. 10 | 11 | Use the Yeo-Johnson transformation as 12 | a non-parametric means to threshold the 13 | decision scores, where outliers are set to any value beyond the 14 | max value in the YJ-transformed data. 15 | See :cite:`raymaekers2021yj` for details. 16 | 17 | Parameters 18 | ---------- 19 | 20 | random_state : int, optional (default=1234) 21 | Random seed for the random number generators of the thresholder. Can also 22 | be set to None. 23 | 24 | Attributes 25 | ---------- 26 | 27 | thresh_ : threshold value that separates inliers from outliers 28 | 29 | dscores_ : 1D array of decomposed decision scores 30 | 31 | Notes 32 | ----- 33 | 34 | The Yeo-Johnson transformation is a power transform which is a 35 | set of power functions that apply a monotonic transformation to 36 | the dataset. For the decision scores this makes their distribution 37 | more normal-like. The transformation is given by: 38 | 39 | .. math:: 40 | 41 | \psi_{(y, \lambda)} = \begin{cases} 42 | \left((y+1)^\lambda-1\right)/\lambda & \text{if } \lambda \neq 0 \text{, } y \geq 0 \\ 43 | \text{log}(y+1) & \text{if } \lambda = 0 \text{, } y \geq 0 \\ 44 | -\left((-y+1)^{(2-\lambda)}-1\right)/{(2-\lambda)} & \text{if } \lambda \neq 2 \text{, } y < 0 \\ 45 | -\text{log}(-y+1) & \text{if } \lambda = 2 \text{, } y < 0 46 | \end{cases} \mathrm{,} 47 | 48 | 49 | where :math:`\lambda` is a power parameter that is chosen via maximum 50 | likelihood estimation. Therefore, any values from the original decision 51 | scores that are beyond the maximum value after this transformation are 52 | considered outliers. However, the closer the original decision scores are 53 | to a normal distribution, the smaller the probability that this 54 | thresholder will be able to identify outliers. 55 | 56 | """ 57 | 58 | def __init__(self, random_state=1234): 59 | 60 | super().__init__() 61 | self.random_state = random_state 62 | np.random.seed(random_state) 63 | 64 | def eval(self, decision): 65 | """Outlier/inlier evaluation process for decision scores. 66 | 67 | Parameters 68 | ---------- 69 | decision : np.array or list of shape (n_samples) 70 | or np.array of shape (n_samples, n_detectors) 71 | which are the decision scores from an 72 | outlier detector. 73 | 74 | Returns 75 | ------- 76 | outlier_labels : numpy array of shape (n_samples,) 77 | For each observation, tells whether or not 78 | it should be considered as an outlier according to the 79 | fitted model. 0 stands for inliers and 1 for outliers. 80 | """ 81 | 82 | decision = self._data_setup(decision) 83 | 84 | # Generate KDE 85 | val, _ = gen_kde(decision, 0, 1, len(decision)*3) 86 | 87 | # Use Yeo-Johnson transformation to reshape the distribution; 88 | # iterate to get an average transformation 89 | mean_s = np.zeros(len(val)) 90 | for _ in range(50): 91 | scores = stats.yeojohnson(val)[0] 92 | mean_s += scores 93 | mean_s = mean_s/50 94 | 95 | # Set limit to the max value from the transformation 96 | limit = np.max(mean_s) 97 | 98 | self.thresh_ = limit 99 | 100 | return cut(decision, limit) 101 | 
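The piecewise transform in YJ's math block maps directly to code. The helper below is an illustrative sketch, not part of the package: it reimplements :math:`\psi` and checks it against `scipy.stats.yeojohnson`, which also estimates :math:`\lambda` by maximum likelihood when no lambda is given:

.. code:: python

    import numpy as np
    import scipy.stats as stats

    def psi(y, lmbda):
        # piecewise Yeo-Johnson transform from the docstring above
        y = np.asarray(y, dtype=float)
        out = np.empty_like(y)
        pos = y >= 0
        if lmbda != 0:
            out[pos] = ((y[pos] + 1) ** lmbda - 1) / lmbda
        else:
            out[pos] = np.log1p(y[pos])
        if lmbda != 2:
            out[~pos] = -((1 - y[~pos]) ** (2 - lmbda) - 1) / (2 - lmbda)
        else:
            out[~pos] = -np.log1p(-y[~pos])
        return out

    scores = np.array([-1.5, -0.2, 0.0, 0.3, 2.7])
    transformed, lmbda = stats.yeojohnson(scores)
    assert np.allclose(psi(scores, lmbda), transformed)
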
-------------------------------------------------------------------------------- /pythresh/test/test_chau.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | import joblib 8 | import numpy as np 9 | from numpy.testing import assert_equal 10 | from pyod.models.iforest import IForest 11 | from pyod.models.knn import KNN 12 | from pyod.models.pca import PCA 13 | from pyod.utils.data import generate_data 14 | 15 | from pythresh.thresholds.chau import CHAU 16 | 17 | # temporary solution for relative imports in case pythresh is not installed 18 | # if pythresh is installed, no need to use the following line 19 | 20 | path = up(up(up(__file__))) 21 | sys.path.append(path) 22 | 23 | 24 | class TestCHAU(unittest.TestCase): 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.n_train = 200 29 | cls.n_test = 100 30 | cls.contamination = 0.1 31 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 32 | n_train=cls.n_train, n_test=cls.n_test, 33 | contamination=cls.contamination, random_state=42) 34 | 35 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 36 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 37 | cls.multiple_scores = np.vstack([ 38 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 39 | ]).T 40 | cls.all_scores = [cls.single_score, cls.multiple_scores] 41 | 42 | cls.methods = ['mean', 'median', 'gmean'] 43 | 44 | cls.params = list(product(cls.all_scores, cls.methods)) 45 | 46 | def setUp(self): 47 | self.thres = CHAU() 48 | 49 | def check_labels(self, labels, scores_shape): 50 | self.assertEqual(labels.shape, scores_shape[:1]) 51 | self.assertIn(labels.min(), [0, 1]) 52 | self.assertIn(labels.max(), [0, 1]) 53 | 54 | def check_fitted_attributes(self, thres): 55 | self.assertTrue(thres.__sklearn_is_fitted__()) 56 | self.assertIsNotNone(thres.labels_) 57 | self.assertIsNotNone(thres.thresh_) 58 | 59 | def test_eval(self): 60 | for scores, method in self.params: 61 | thres = CHAU(method=method) 62 | pred_labels = thres.eval(scores) 63 | 64 | self.assertIsNotNone(thres.thresh_) 65 | self.assertIsNotNone(thres.dscores_) 66 | self.assertGreaterEqual(thres.dscores_.min(), 0) 67 | self.assertLessEqual(thres.dscores_.max(), 1) 68 | self.check_labels(pred_labels, scores.shape) 69 | 70 | def test_fit(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | self.check_fitted_attributes(self.thres) 74 | self.check_labels(self.thres.labels_, scores.shape) 75 | 76 | def test_predict(self): 77 | for scores in self.all_scores: 78 | self.thres.fit(scores) 79 | pred_labels = self.thres.predict(scores) 80 | self.check_fitted_attributes(self.thres) 81 | self.check_labels(pred_labels, scores.shape) 82 | assert_equal(self.thres.labels_, pred_labels) 83 | 84 | def test_test_data(self): 85 | for scores, test_scores in zip(self.all_scores, [ 86 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 87 | np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 88 | for clf in self.clfs]).T 89 | ]): 90 | self.thres.fit(scores) 91 | pred_labels = self.thres.predict(test_scores) 92 | self.check_fitted_attributes(self.thres) 93 | self.check_labels(pred_labels, test_scores.shape) 94 | 95 | def test_save_and_load(self): 96 | for scores in self.all_scores: 97 | self.thres.fit(scores) 98 | joblib.dump(self.thres, 'model.pkl') 99 | loaded_thres = joblib.load('model.pkl') 100 | 101 | assert_equal(self.thres.predict(scores), 102 | loaded_thres.predict(scores)) 103 | -------------------------------------------------------------------------------- /pythresh/test/test_mad.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | import joblib 8 | import numpy as np 9 | from numpy.testing import assert_equal 10 | from pyod.models.iforest import IForest 11 | from pyod.models.knn import KNN 12 | from pyod.models.pca import PCA 13 | from pyod.utils.data import generate_data 14 | 15 | from pythresh.thresholds.mad import MAD 16 | 17 | # temporary solution for relative imports in case pythresh is not installed 18 | # if pythresh is installed, no need to use the following line 19 | 20 | path = up(up(up(__file__))) 21 | sys.path.append(path) 22 | 23 | 24 | class TestMAD(unittest.TestCase): 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.n_train = 200 29 | cls.n_test = 100 30 | cls.contamination = 0.1 31 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 32 | n_train=cls.n_train, n_test=cls.n_test, 33 | contamination=cls.contamination, random_state=42) 34 | 35 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 36 | cls.single_score = 
cls.clfs[0].fit(cls.X_train).decision_scores_ 37 | cls.multiple_scores = np.vstack([ 38 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 39 | ]).T 40 | cls.all_scores = [cls.single_score, cls.multiple_scores] 41 | 42 | cls.factors = [0.5, 1, 2] 43 | 44 | cls.params = list(product(cls.all_scores, cls.factors)) 45 | 46 | def setUp(self): 47 | self.thres = MAD() 48 | 49 | def check_labels(self, labels, scores_shape): 50 | self.assertEqual(labels.shape, scores_shape[:1]) 51 | self.assertIn(labels.min(), [0, 1]) 52 | self.assertIn(labels.max(), [0, 1]) 53 | 54 | def check_fitted_attributes(self, thres): 55 | self.assertTrue(thres.__sklearn_is_fitted__()) 56 | self.assertIsNotNone(thres.labels_) 57 | self.assertIsNotNone(thres.thresh_) 58 | 59 | def test_eval(self): 60 | for scores, factor in self.params: 61 | thres = MAD(factor=factor) 62 | pred_labels = thres.eval(scores) 63 | 64 | self.assertIsNotNone(thres.thresh_) 65 | self.assertIsNotNone(thres.dscores_) 66 | self.assertGreaterEqual(thres.dscores_.min(), 0) 67 | self.assertLessEqual(thres.dscores_.max(), 1) 68 | self.check_labels(pred_labels, scores.shape) 69 | 70 | def test_fit(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | self.check_fitted_attributes(self.thres) 74 | self.check_labels(self.thres.labels_, scores.shape) 75 | 76 | def test_predict(self): 77 | for scores in self.all_scores: 78 | self.thres.fit(scores) 79 | pred_labels = self.thres.predict(scores) 80 | self.check_fitted_attributes(self.thres) 81 | self.check_labels(pred_labels, scores.shape) 82 | assert_equal(self.thres.labels_, pred_labels) 83 | 84 | def test_test_data(self): 85 | for scores, test_scores in zip(self.all_scores, [ 86 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 87 | np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 88 | for clf in self.clfs]).T 89 | ]): 90 | self.thres.fit(scores) 91 | pred_labels = self.thres.predict(test_scores) 92 | self.check_fitted_attributes(self.thres) 93 | self.check_labels(pred_labels, test_scores.shape) 94 | 95 | def test_save_and_load(self): 96 | for scores in self.all_scores: 97 | self.thres.fit(scores) 98 | joblib.dump(self.thres, 'model.pkl') 99 | loaded_thres = joblib.load('model.pkl') 100 | 101 | assert_equal(self.thres.predict(scores), 102 | loaded_thres.predict(scores)) 103 | -------------------------------------------------------------------------------- /pythresh/test/test_regr.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | import joblib 8 | import numpy as np 9 | from numpy.testing import assert_equal 10 | from pyod.models.iforest import IForest 11 | from pyod.models.knn import KNN 12 | from pyod.models.pca import PCA 13 | from pyod.utils.data import generate_data 14 | 15 | from pythresh.thresholds.regr import REGR 16 | 17 | # temporary solution for relative imports in case pythresh is not installed 18 | # if pythresh is installed, no need to use the following line 19 | 20 | path = up(up(up(__file__))) 21 | sys.path.append(path) 22 | 23 | 24 | class TestREGR(unittest.TestCase): 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.n_train = 200 29 | cls.n_test = 100 30 | cls.contamination = 0.1 31 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 32 | n_train=cls.n_train, n_test=cls.n_test, 33 | contamination=cls.contamination, 
random_state=42) 34 | 35 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 36 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 37 | cls.multiple_scores = np.vstack([ 38 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 39 | ]).T 40 | cls.all_scores = [cls.single_score, cls.multiple_scores] 41 | 42 | cls.methods = ['siegel', 'theil'] 43 | 44 | cls.params = list(product(cls.all_scores, cls.methods)) 45 | 46 | def setUp(self): 47 | self.thres = REGR() 48 | 49 | def check_labels(self, labels, scores_shape): 50 | self.assertEqual(labels.shape, scores_shape[:1]) 51 | self.assertIn(labels.min(), [0, 1]) 52 | self.assertIn(labels.max(), [0, 1]) 53 | 54 | def check_fitted_attributes(self, thres): 55 | self.assertTrue(thres.__sklearn_is_fitted__()) 56 | self.assertIsNotNone(thres.labels_) 57 | self.assertIsNotNone(thres.thresh_) 58 | 59 | def test_eval(self): 60 | for scores, method in self.params: 61 | thres = REGR(method=method) 62 | pred_labels = thres.eval(scores) 63 | 64 | self.assertIsNotNone(thres.thresh_) 65 | self.assertIsNotNone(thres.dscores_) 66 | self.assertGreaterEqual(thres.dscores_.min(), 0) 67 | self.assertLessEqual(thres.dscores_.max(), 1) 68 | self.check_labels(pred_labels, scores.shape) 69 | 70 | def test_fit(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | self.check_fitted_attributes(self.thres) 74 | self.check_labels(self.thres.labels_, scores.shape) 75 | 76 | def test_predict(self): 77 | for scores in self.all_scores: 78 | self.thres.fit(scores) 79 | pred_labels = self.thres.predict(scores) 80 | self.check_fitted_attributes(self.thres) 81 | self.check_labels(pred_labels, scores.shape) 82 | assert_equal(self.thres.labels_, pred_labels) 83 | 84 | def test_test_data(self): 85 | for scores, test_scores in zip(self.all_scores, [ 86 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 87 | np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 88 | for clf in self.clfs]).T 89 | ]): 90 | self.thres.fit(scores) 91 | pred_labels = self.thres.predict(test_scores) 92 | self.check_fitted_attributes(self.thres) 93 | self.check_labels(pred_labels, test_scores.shape) 94 | 95 | def test_save_and_load(self): 96 | for scores in self.all_scores: 97 | self.thres.fit(scores) 98 | joblib.dump(self.thres, 'model.pkl') 99 | loaded_thres = joblib.load('model.pkl') 100 | 101 | assert_equal(self.thres.predict(scores), 102 | loaded_thres.predict(scores)) 103 | -------------------------------------------------------------------------------- /pythresh/test/test_mtt.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | import joblib 8 | import numpy as np 9 | from numpy.testing import assert_equal 10 | from pyod.models.iforest import IForest 11 | from pyod.models.knn import KNN 12 | from pyod.models.pca import PCA 13 | from pyod.utils.data import generate_data 14 | 15 | from pythresh.thresholds.mtt import MTT 16 | 17 | # temporary solution for relative imports in case pythresh is not installed 18 | # if pythresh is installed, no need to use the following line 19 | 20 | path = up(up(up(__file__))) 21 | sys.path.append(path) 22 | 23 | 24 | class TestMTT(unittest.TestCase): 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.n_train = 200 29 | cls.n_test = 100 30 | cls.contamination = 0.1 31 | cls.X_train, cls.X_test, 
cls.y_train, cls.y_test = generate_data( 32 | n_train=cls.n_train, n_test=cls.n_test, 33 | contamination=cls.contamination, random_state=42) 34 | 35 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 36 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 37 | cls.multiple_scores = np.vstack([ 38 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 39 | ]).T 40 | cls.all_scores = [cls.single_score, cls.multiple_scores] 41 | 42 | cls.alphas = [0.1, 0.05, 0.025, 0.01, 0.005] 43 | 44 | cls.params = list(product(cls.all_scores, cls.alphas)) 45 | 46 | def setUp(self): 47 | self.thres = MTT() 48 | 49 | def check_labels(self, labels, scores_shape): 50 | self.assertEqual(labels.shape, scores_shape[:1]) 51 | self.assertIn(labels.min(), [0, 1]) 52 | self.assertIn(labels.max(), [0, 1]) 53 | 54 | def check_fitted_attributes(self, thres): 55 | self.assertTrue(thres.__sklearn_is_fitted__()) 56 | self.assertIsNotNone(thres.labels_) 57 | self.assertIsNotNone(thres.thresh_) 58 | 59 | def test_eval(self): 60 | for scores, alpha in self.params: 61 | thres = MTT(alpha=alpha) 62 | pred_labels = thres.eval(scores) 63 | 64 | self.assertIsNotNone(thres.thresh_) 65 | self.assertIsNotNone(thres.dscores_) 66 | self.assertGreaterEqual(thres.dscores_.min(), 0) 67 | self.assertLessEqual(thres.dscores_.max(), 1) 68 | self.check_labels(pred_labels, scores.shape) 69 | 70 | def test_fit(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | self.check_fitted_attributes(self.thres) 74 | self.check_labels(self.thres.labels_, scores.shape) 75 | 76 | def test_predict(self): 77 | for scores in self.all_scores: 78 | self.thres.fit(scores) 79 | pred_labels = self.thres.predict(scores) 80 | self.check_fitted_attributes(self.thres) 81 | self.check_labels(pred_labels, scores.shape) 82 | assert_equal(self.thres.labels_, pred_labels) 83 | 84 | def test_test_data(self): 85 | for scores, test_scores in zip(self.all_scores, [ 86 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 87 | np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 88 | for clf in self.clfs]).T 89 | ]): 90 | self.thres.fit(scores) 91 | pred_labels = self.thres.predict(test_scores) 92 | self.check_fitted_attributes(self.thres) 93 | self.check_labels(pred_labels, test_scores.shape) 94 | 95 | def test_save_and_load(self): 96 | for scores in self.all_scores: 97 | self.thres.fit(scores) 98 | joblib.dump(self.thres, 'model.pkl') 99 | loaded_thres = joblib.load('model.pkl') 100 | 101 | assert_equal(self.thres.predict(scores), 102 | loaded_thres.predict(scores)) 103 | -------------------------------------------------------------------------------- /pythresh/test/test_zscore.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | import joblib 8 | import numpy as np 9 | from numpy.testing import assert_equal 10 | from pyod.models.iforest import IForest 11 | from pyod.models.knn import KNN 12 | from pyod.models.pca import PCA 13 | from pyod.utils.data import generate_data 14 | 15 | from pythresh.thresholds.zscore import ZSCORE 16 | 17 | # temporary solution for relative imports in case pythresh is not installed 18 | # if pythresh is installed, no need to use the following line 19 | 20 | path = up(up(up(__file__))) 21 | sys.path.append(path) 22 | 23 | 24 | class TestZSCORE(unittest.TestCase): 25 | 26 | 
@classmethod 27 | def setUpClass(cls): 28 | cls.n_train = 200 29 | cls.n_test = 100 30 | cls.contamination = 0.1 31 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 32 | n_train=cls.n_train, n_test=cls.n_test, 33 | contamination=cls.contamination, random_state=42) 34 | 35 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 36 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 37 | cls.multiple_scores = np.vstack([ 38 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 39 | ]).T 40 | cls.all_scores = [cls.single_score, cls.multiple_scores] 41 | 42 | cls.factors = [0.5, 1, 2] 43 | 44 | cls.params = list(product(cls.all_scores, cls.factors)) 45 | 46 | def setUp(self): 47 | self.thres = ZSCORE() 48 | 49 | def check_labels(self, labels, scores_shape): 50 | self.assertEqual(labels.shape, scores_shape[:1]) 51 | self.assertIn(labels.min(), [0, 1]) 52 | self.assertIn(labels.max(), [0, 1]) 53 | 54 | def check_fitted_attributes(self, thres): 55 | self.assertTrue(thres.__sklearn_is_fitted__()) 56 | self.assertIsNotNone(thres.labels_) 57 | self.assertIsNotNone(thres.thresh_) 58 | 59 | def test_eval(self): 60 | for scores, factor in self.params: 61 | thres = ZSCORE(factor=factor) 62 | pred_labels = thres.eval(scores) 63 | 64 | self.assertIsNotNone(thres.thresh_) 65 | self.assertIsNotNone(thres.dscores_) 66 | self.assertGreaterEqual(thres.dscores_.min(), 0) 67 | self.assertLessEqual(thres.dscores_.max(), 1) 68 | self.check_labels(pred_labels, scores.shape) 69 | 70 | def test_fit(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | self.check_fitted_attributes(self.thres) 74 | self.check_labels(self.thres.labels_, scores.shape) 75 | 76 | def test_predict(self): 77 | for scores in self.all_scores: 78 | self.thres.fit(scores) 79 | pred_labels = self.thres.predict(scores) 80 | self.check_fitted_attributes(self.thres) 81 | self.check_labels(pred_labels, scores.shape) 82 | assert_equal(self.thres.labels_, pred_labels) 83 | 84 | def test_test_data(self): 85 | for scores, test_scores in zip(self.all_scores, [ 86 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 87 | np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 88 | for clf in self.clfs]).T 89 | ]): 90 | self.thres.fit(scores) 91 | pred_labels = self.thres.predict(test_scores) 92 | self.check_fitted_attributes(self.thres) 93 | self.check_labels(pred_labels, test_scores.shape) 94 | 95 | def test_save_and_load(self): 96 | for scores in self.all_scores: 97 | self.thres.fit(scores) 98 | joblib.dump(self.thres, 'model.pkl') 99 | loaded_thres = joblib.load('model.pkl') 100 | 101 | assert_equal(self.thres.predict(scores), 102 | loaded_thres.predict(scores)) 103 | -------------------------------------------------------------------------------- /pythresh/test/test_decomp.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | import joblib 8 | import numpy as np 9 | from numpy.testing import assert_equal 10 | from pyod.models.iforest import IForest 11 | from pyod.models.knn import KNN 12 | from pyod.models.pca import PCA 13 | from pyod.utils.data import generate_data 14 | 15 | from pythresh.thresholds.decomp import DECOMP 16 | 17 | # temporary solution for relative imports in case pythresh is not installed 18 | # if pythresh is installed, no need to use the following line 19 | 20 
| path = up(up(up(__file__))) 21 | sys.path.append(path) 22 | 23 | 24 | class TestDECOMP(unittest.TestCase): 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.n_train = 200 29 | cls.n_test = 100 30 | cls.contamination = 0.1 31 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 32 | n_train=cls.n_train, n_test=cls.n_test, 33 | contamination=cls.contamination, random_state=42) 34 | 35 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 36 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 37 | cls.multiple_scores = np.vstack([ 38 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 39 | ]).T 40 | cls.all_scores = [cls.single_score, cls.multiple_scores] 41 | 42 | cls.methods = ['NMF', 'PCA', 'GRP', 'SRP'] 43 | 44 | cls.params = list(product(cls.all_scores, cls.methods)) 45 | 46 | def setUp(self): 47 | self.thres = DECOMP() 48 | 49 | def check_labels(self, labels, scores_shape): 50 | self.assertEqual(labels.shape, scores_shape[:1]) 51 | self.assertIn(labels.min(), [0, 1]) 52 | self.assertIn(labels.max(), [0, 1]) 53 | 54 | def check_fitted_attributes(self, thres): 55 | self.assertTrue(thres.__sklearn_is_fitted__()) 56 | self.assertIsNotNone(thres.labels_) 57 | self.assertIsNotNone(thres.thresh_) 58 | 59 | def test_eval(self): 60 | for scores, method in self.params: 61 | thres = DECOMP(method=method) 62 | pred_labels = thres.eval(scores) 63 | 64 | self.assertIsNotNone(thres.thresh_) 65 | self.assertIsNotNone(thres.dscores_) 66 | self.assertGreaterEqual(thres.dscores_.min(), 0) 67 | self.assertLessEqual(thres.dscores_.max(), 1) 68 | self.check_labels(pred_labels, scores.shape) 69 | 70 | def test_fit(self): 71 | for scores in self.all_scores: 72 | self.thres.fit(scores) 73 | self.check_fitted_attributes(self.thres) 74 | self.check_labels(self.thres.labels_, scores.shape) 75 | 76 | def test_predict(self): 77 | for scores in self.all_scores: 78 | self.thres.fit(scores) 79 | pred_labels = self.thres.predict(scores) 80 | self.check_fitted_attributes(self.thres) 81 | self.check_labels(pred_labels, scores.shape) 82 | assert_equal(self.thres.labels_, pred_labels) 83 | 84 | def test_test_data(self): 85 | for scores, test_scores in zip(self.all_scores, [ 86 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 87 | np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 88 | for clf in self.clfs]).T 89 | ]): 90 | self.thres.fit(scores) 91 | pred_labels = self.thres.predict(test_scores) 92 | self.check_fitted_attributes(self.thres) 93 | self.check_labels(pred_labels, test_scores.shape) 94 | 95 | def test_save_and_load(self): 96 | for scores in self.all_scores: 97 | self.thres.fit(scores) 98 | joblib.dump(self.thres, 'model.pkl') 99 | loaded_thres = joblib.load('model.pkl') 100 | 101 | assert_equal(self.thres.predict(scores), 102 | loaded_thres.predict(scores)) 103 | -------------------------------------------------------------------------------- /pythresh/test/test_karch.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from itertools import product 4 | from os.path import dirname as up 5 | 6 | # noinspection PyProtectedMember 7 | import joblib 8 | import numpy as np 9 | from numpy.testing import assert_equal 10 | from pyod.models.iforest import IForest 11 | from pyod.models.knn import KNN 12 | from pyod.models.pca import PCA 13 | from pyod.utils.data import generate_data 14 | 15 | from pythresh.thresholds.karch import KARCH 16 | 17 | # temporary solution for 
relative imports in case pythresh is not installed 18 | # if pythresh is installed, no need to use the following line 19 | 20 | path = up(up(up(__file__))) 21 | sys.path.append(path) 22 | 23 | 24 | class TestKARCH(unittest.TestCase): 25 | 26 | @classmethod 27 | def setUpClass(cls): 28 | cls.n_train = 200 29 | cls.n_test = 100 30 | cls.contamination = 0.1 31 | cls.X_train, cls.X_test, cls.y_train, cls.y_test = generate_data( 32 | n_train=cls.n_train, n_test=cls.n_test, 33 | contamination=cls.contamination, random_state=42) 34 | 35 | cls.clfs = [KNN(), PCA(random_state=1234), IForest(random_state=1234)] 36 | cls.single_score = cls.clfs[0].fit(cls.X_train).decision_scores_ 37 | cls.multiple_scores = np.vstack([ 38 | clf.fit(cls.X_train).decision_scores_ for clf in cls.clfs 39 | ]).T 40 | cls.all_scores = [cls.single_score, cls.multiple_scores] 41 | 42 | cls.methods = ['simple', 'complex'] 43 | cls.ndims = range(1, 10) 44 | 45 | cls.params = list(product(cls.all_scores, cls.methods, cls.ndims)) 46 | 47 | def setUp(self): 48 | self.thres = KARCH() 49 | 50 | def check_labels(self, labels, scores_shape): 51 | self.assertEqual(labels.shape, scores_shape[:1]) 52 | self.assertIn(labels.min(), [0, 1]) 53 | self.assertIn(labels.max(), [0, 1]) 54 | 55 | def check_fitted_attributes(self, thres): 56 | self.assertTrue(thres.__sklearn_is_fitted__()) 57 | self.assertIsNotNone(thres.labels_) 58 | self.assertIsNotNone(thres.thresh_) 59 | 60 | def test_eval(self): 61 | for scores, method, ndim in self.params: 62 | thres = KARCH(method=method, ndim=ndim) 63 | pred_labels = thres.eval(scores) 64 | 65 | self.assertIsNotNone(thres.thresh_) 66 | self.assertIsNotNone(thres.dscores_) 67 | self.assertGreaterEqual(thres.dscores_.min(), 0) 68 | self.assertLessEqual(thres.dscores_.max(), 1) 69 | self.check_labels(pred_labels, scores.shape) 70 | 71 | def test_fit(self): 72 | for scores in self.all_scores: 73 | self.thres.fit(scores) 74 | self.check_fitted_attributes(self.thres) 75 | self.check_labels(self.thres.labels_, scores.shape) 76 | 77 | def test_predict(self): 78 | for scores in self.all_scores: 79 | self.thres.fit(scores) 80 | pred_labels = self.thres.predict(scores) 81 | self.check_fitted_attributes(self.thres) 82 | self.check_labels(pred_labels, scores.shape) 83 | assert_equal(self.thres.labels_, pred_labels) 84 | 85 | def test_test_data(self): 86 | for scores, test_scores in zip(self.all_scores, [ 87 | self.clfs[0].fit(self.X_train).decision_function(self.X_test), 88 | np.vstack([clf.fit(self.X_train).decision_function(self.X_test) 89 | for clf in self.clfs]).T 90 | ]): 91 | self.thres.fit(scores) 92 | pred_labels = self.thres.predict(test_scores) 93 | self.check_fitted_attributes(self.thres) 94 | self.check_labels(pred_labels, test_scores.shape) 95 | 96 | def test_save_and_load(self): 97 | for scores in self.all_scores: 98 | self.thres.fit(scores) 99 | joblib.dump(self.thres, 'model.pkl') 100 | loaded_thres = joblib.load('model.pkl') 101 | 102 | assert_equal(self.thres.predict(scores), 103 | loaded_thres.predict(scores)) 104 | --------------------------------------------------------------------------------