├── .github
    ├── FUNDING.yml
    └── workflows
    │   ├── release-please.yml
    │   └── tests.yml
├── .gitignore
├── LICENSE
├── PyNomaly
    ├── __init__.py
    └── loop.py
├── changelog.md
├── dist
    ├── PyNomaly-0.1.0.tar.gz
    ├── PyNomaly-0.1.1.tar.gz
    ├── PyNomaly-0.1.2.tar.gz
    ├── PyNomaly-0.1.3.tar.gz
    ├── PyNomaly-0.1.4.tar.gz
    ├── PyNomaly-0.1.5.tar.gz
    ├── PyNomaly-0.1.6.tar.gz
    ├── PyNomaly-0.1.7.tar.gz
    ├── PyNomaly-0.1.8.tar.gz
    ├── PyNomaly-0.2.0.tar.gz
    ├── PyNomaly-0.2.1.tar.gz
    ├── PyNomaly-0.2.2.tar.gz
    ├── PyNomaly-0.2.4.tar.gz
    ├── PyNomaly-0.2.5.tar.gz
    ├── PyNomaly-0.2.6.tar.gz
    ├── PyNomaly-0.2.7.tar.gz
    ├── PyNomaly-0.3.0-py3-none-any.whl
    ├── PyNomaly-0.3.0.tar.gz
    ├── PyNomaly-0.3.1-py3-none-any.whl
    ├── PyNomaly-0.3.1.tar.gz
    ├── PyNomaly-0.3.2-py3-none-any.whl
    ├── PyNomaly-0.3.2.tar.gz
    ├── PyNomaly-0.3.3-py3-none-any.whl
    └── PyNomaly-0.3.3.tar.gz
├── examples
    ├── iris.py
    ├── iris_dist_grid.py
    ├── multiple_gaussian_2d.py
    ├── numba_speed_diff.py
    ├── numpy_array.py
    └── stream.py
├── images
    ├── animation
    │   ├── 1.png
    │   ├── 10.png
    │   ├── 11.png
    │   ├── 12.png
    │   ├── 13.png
    │   ├── 14.png
    │   ├── 15.png
    │   ├── 16.png
    │   ├── 17.png
    │   ├── 18.png
    │   ├── 19.png
    │   ├── 2.png
    │   ├── 20.png
    │   ├── 21.png
    │   ├── 22.png
    │   ├── 23.png
    │   ├── 24.png
    │   ├── 25.png
    │   ├── 26.png
    │   ├── 27.png
    │   ├── 28.png
    │   ├── 29.png
    │   ├── 3.png
    │   ├── 30.png
    │   ├── 31.png
    │   ├── 32.png
    │   ├── 33.png
    │   ├── 34.png
    │   ├── 35.png
    │   ├── 36.png
    │   ├── 37.png
    │   ├── 38.png
    │   ├── 39.png
    │   ├── 4.png
    │   ├── 40.png
    │   ├── 41.png
    │   ├── 42.png
    │   ├── 43.png
    │   ├── 44.png
    │   ├── 45.png
    │   ├── 46.png
    │   ├── 47.png
    │   ├── 48.png
    │   ├── 49.png
    │   ├── 5.png
    │   ├── 50.png
    │   ├── 51.png
    │   ├── 52.png
    │   ├── 53.png
    │   ├── 54.png
    │   ├── 55.png
    │   ├── 56.png
    │   ├── 57.png
    │   ├── 58.png
    │   ├── 59.png
    │   ├── 6.png
    │   ├── 60.png
    │   ├── 7.png
    │   ├── 8.png
    │   └── 9.png
    ├── cluster_assignments.png
    ├── logo.jpg
    ├── logo100.jpg
    ├── logo150.jpg
    ├── logo200.jpg
    ├── logo300.jpg
    ├── scores.png
    ├── scores_by_distance_metric.png
    ├── scores_clust.png
    ├── scores_stream.gif
    └── scores_stream.png
├── paper
    ├── codemeta.json
    ├── paper.bib
    └── paper.md
├── readme.md
├── requirements.txt
├── requirements_ci.txt
├── requirements_examples.txt
├── setup.py
└── tests
    ├── .coverage
    ├── __init__.py
    └── test_loop.py


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: [vc1492a]
2 | 


--------------------------------------------------------------------------------
/.github/workflows/release-please.yml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     branches:
 4 |       - main # release branch (default)
 5 | 
 6 | permissions:
 7 |   contents: write
 8 |   pull-requests: write
 9 | 
10 | name: release-please
11 | 
12 | jobs:
13 |   release-please:
14 |     runs-on: ubuntu-latest
15 |     steps:
16 |       - uses: googleapis/release-please-action@v4
17 |         with:
18 |           # this assumes that you have created a personal access token
19 |           # (PAT) and configured it as a GitHub action secret named
20 |           # `RELEASE_PLEASE_PERSONAL_ACCESS_TOKEN` (this secret name is not important).
21 |           token: ${{ secrets.RELEASE_PLEASE_PERSONAL_ACCESS_TOKEN }}
22 |           # this is a built-in strategy in release-please, see "Action Inputs"
23 |           # for more options
24 |           release-type: simple


--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 3 | 
 4 | name: tests
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: [ "main", "dev" ]
 9 |   pull_request:
10 |     branches: [ "main", "dev" ]
11 | 
12 | jobs:
13 |   # there is no python3.6 available on newer ubuntu instances
14 |   # so we have this one to run on ubuntu-20.04
15 |   test-python36:
16 |     runs-on: ubuntu-20.04
17 |     strategy:
18 |       fail-fast: false
19 |       matrix:
20 |         python-version: ["3.6"]
21 | 
22 |     steps:
23 |     - uses: actions/checkout@v4
24 |     - name: Set up Python ${{ matrix.python-version }}
25 |       uses: actions/setup-python@v3
26 |       with:
27 |         python-version: ${{ matrix.python-version }}
28 |     - name: Install dependencies
29 |       run: |
30 |         python -m pip install --upgrade pip
31 |         python -m pip install flake8 pytest
32 |         pip install -r requirements.txt
33 |         pip install -r requirements_ci.txt
34 |     - name: Lint with flake8
35 |       run: |
36 |         # stop the build if there are Python syntax errors or undefined names
37 |         flake8 . --count --exit-zero --select=E9,F63,F7,F82 --show-source --statistics
38 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
39 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
40 |     - name: Test with pytest
41 |       run: |
42 |         pytest --cov=PyNomaly
43 |   
44 |   test:
45 | 
46 |     runs-on: ubuntu-latest
47 |     strategy:
48 |       fail-fast: false
49 |       matrix:
50 |         python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
51 | 
52 |     steps:
53 |     - uses: actions/checkout@v4
54 |     - name: Set up Python ${{ matrix.python-version }}
55 |       uses: actions/setup-python@v3
56 |       with:
57 |         python-version: ${{ matrix.python-version }}
58 |     - name: Install dependencies
59 |       run: |
60 |         python -m pip install --upgrade pip
61 |         python -m pip install flake8 pytest
62 |         pip install -r requirements.txt
63 |         pip install -r requirements_ci.txt
64 |     - name: Lint with flake8
65 |       run: |
66 |         # stop the build if there are Python syntax errors or undefined names
67 |         flake8 . --count --exit-zero --select=E9,F63,F7,F82 --show-source --statistics
68 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
69 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
70 |     - name: Test with pytest
71 |       run: |
72 |         pytest --cov=PyNomaly
73 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | *.DS_STORE
  2 | .idea/
  3 | __pycache__/
  4 | *.csv
  5 | nasaValve
  6 | rel_research
  7 | PyNomaly/loop_dev.py
  8 | /PyNomaly.egg-info/
  9 | *.pyc
 10 | *.coverage.*
 11 | .coveragerc
 12 | .pypirc
 13 | 
 14 | # Byte-compiled / optimized / DLL files
 15 | __pycache__/
 16 | *.py[cod]
 17 | *$py.class
 18 | 
 19 | # C extensions
 20 | *.so
 21 | 
 22 | # Distribution / packaging
 23 | .Python
 24 | build/
 25 | develop-eggs/
 26 | dist/
 27 | downloads/
 28 | eggs/
 29 | .eggs/
 30 | lib/
 31 | lib64/
 32 | parts/
 33 | sdist/
 34 | var/
 35 | wheels/
 36 | share/python-wheels/
 37 | *.egg-info/
 38 | .installed.cfg
 39 | *.egg
 40 | MANIFEST
 41 | 
 42 | # PyInstaller
 43 | #  Usually these files are written by a python script from a template
 44 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 45 | *.manifest
 46 | *.spec
 47 | 
 48 | # Installer logs
 49 | pip-log.txt
 50 | pip-delete-this-directory.txt
 51 | 
 52 | # Unit test / coverage reports
 53 | htmlcov/
 54 | .tox/
 55 | .nox/
 56 | .coverage
 57 | .coverage.*
 58 | .cache
 59 | nosetests.xml
 60 | coverage.xml
 61 | *.cover
 62 | *.py,cover
 63 | .hypothesis/
 64 | .pytest_cache/
 65 | cover/
 66 | 
 67 | # Translations
 68 | *.mo
 69 | *.pot
 70 | 
 71 | # Django stuff:
 72 | *.log
 73 | local_settings.py
 74 | db.sqlite3
 75 | db.sqlite3-journal
 76 | 
 77 | # Flask stuff:
 78 | instance/
 79 | .webassets-cache
 80 | 
 81 | # Scrapy stuff:
 82 | .scrapy
 83 | 
 84 | # Sphinx documentation
 85 | docs/_build/
 86 | 
 87 | # PyBuilder
 88 | .pybuilder/
 89 | target/
 90 | 
 91 | # Jupyter Notebook
 92 | .ipynb_checkpoints
 93 | 
 94 | # IPython
 95 | profile_default/
 96 | ipython_config.py
 97 | 
 98 | # pyenv
 99 | #   For a library or package, you might want to ignore these files since the code is
100 | #   intended to run in multiple environments; otherwise, check them in:
101 | # .python-version
102 | 
103 | # pipenv
104 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
105 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
106 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
107 | #   install all needed dependencies.
108 | #Pipfile.lock
109 | 
110 | # poetry
111 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
112 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
113 | #   commonly ignored for libraries.
114 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
115 | #poetry.lock
116 | 
117 | # pdm
118 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
119 | #pdm.lock
120 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
121 | #   in version control.
122 | #   https://pdm.fming.dev/#use-with-ide
123 | .pdm.toml
124 | 
125 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
126 | __pypackages__/
127 | 
128 | # Celery stuff
129 | celerybeat-schedule
130 | celerybeat.pid
131 | 
132 | # SageMath parsed files
133 | *.sage.py
134 | 
135 | # Environments
136 | .env
137 | .venv
138 | env/
139 | venv/
140 | ENV/
141 | env.bak/
142 | venv.bak/
143 | 
144 | # Spyder project settings
145 | .spyderproject
146 | .spyproject
147 | 
148 | # Rope project settings
149 | .ropeproject
150 | 
151 | # mkdocs documentation
152 | /site
153 | 
154 | # mypy
155 | .mypy_cache/
156 | .dmypy.json
157 | dmypy.json
158 | 
159 | # Pyre type checker
160 | .pyre/
161 | 
162 | # pytype static type analyzer
163 | .pytype/
164 | 
165 | # Cython debug symbols
166 | cython_debug/
167 | 
168 | # PyCharm
169 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
170 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
171 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
172 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
173 | #.idea/
174 | 
175 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright 2017 Valentino Constantinou.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.


--------------------------------------------------------------------------------
/PyNomaly/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/PyNomaly/__init__.py


--------------------------------------------------------------------------------
/PyNomaly/loop.py:
--------------------------------------------------------------------------------
  1 | from math import erf, sqrt
  2 | import numpy as np
  3 | from python_utils.terminal import get_terminal_size
  4 | import sys
  5 | from typing import Tuple, Union
  6 | import warnings
  7 | 
  8 | try:
  9 |     import numba
 10 | except ImportError:
 11 |     pass
 12 | 
 13 | __author__ = "Valentino Constantinou"
 14 | __version__ = "0.3.4"
 15 | __license__ = "Apache License, Version 2.0"
 16 | 
 17 | 
 18 | class Utils:
 19 |     @staticmethod
 20 |     def emit_progress_bar(progress: str, index: int, total: int) -> str:
 21 |         """
 22 |         A progress bar that is continuously updated in Python's standard
 23 |         out.
 24 |         :param progress: a string printed to stdout that is updated and later
 25 |         returned.
 26 |         :param index: the current index of the iteration within the tracked
 27 |         process.
 28 |         :param total: the total length of the tracked process.
 29 |         :return: progress string.
 30 |         """
 31 | 
 32 |         w, h = get_terminal_size()
 33 |         sys.stdout.write("\r")
 34 |         if total < w:
 35 |             block_size = int(w / total)
 36 |         else:
 37 |             block_size = int(total / w)
 38 |         if index % block_size == 0:
 39 |             progress += "="
 40 |         percent = index / total
 41 |         sys.stdout.write("[ %s ] %.2f%%" % (progress, percent * 100))
 42 |         sys.stdout.flush()
 43 |         return progress
 44 | 
 45 | 
 46 | class LocalOutlierProbability(object):
 47 |     """
 48 |     :param data: a Pandas DataFrame or Numpy array of float data
 49 |     :param extent: an integer value [1, 2, 3] that controls the statistical 
 50 |     extent, e.g. lambda times the standard deviation from the mean (optional, 
 51 |     default 3)
 52 |     :param n_neighbors: the total number of neighbors to consider w.r.t. each 
 53 |     sample (optional, default 10)
 54 |     :param cluster_labels: a numpy array of cluster assignments w.r.t. each 
 55 |     sample (optional, default None)
 56 |     :return:
 57 |     """ """
 58 | 
 59 |     Based on the work of Kriegel, Kröger, Schubert, and Zimek (2009) in LoOP: 
 60 |     Local Outlier Probabilities.
 61 |     ----------
 62 | 
 63 |     References
 64 |     ----------
 65 |     .. [1] Breunig M., Kriegel H.-P., Ng R., Sander, J. LOF: Identifying 
 66 |            Density-based Local Outliers. ACM SIGMOD
 67 |            International Conference on Management of Data (2000).
 68 |     .. [2] Kriegel H.-P., Kröger P., Schubert E., Zimek A. LoOP: Local Outlier 
 69 |            Probabilities. 18th ACM conference on 
 70 |            Information and knowledge management, CIKM (2009).
 71 |     .. [3] Goldstein M., Uchida S. A Comparative Evaluation of Unsupervised 
 72 |            Anomaly Detection Algorithms for Multivariate Data. PLoS ONE 11(4):
 73 |            e0152173 (2016).
 74 |     .. [4] Hamlet C., Straub J., Russell M., Kerlin S. An incremental and 
 75 |            approximate local outlier probability algorithm for intrusion 
 76 |            detection and its evaluation. Journal of Cyber Security Technology 
 77 |            (2016). 
 78 |     """
 79 | 
 80 |     class Validate:
 81 | 
 82 |         """
 83 |         The Validate class aids in ensuring PyNomaly receives the right set
 84 |         of user inputs for proper execution of the Local Outlier Probability
 85 |         (LoOP) approach. Depending on the desired behavior, either an
 86 |         exception is raised to the user or PyNomaly continues executing
 87 |         albeit with some form of user warning.
 88 |         """
 89 | 
 90 |         """
 91 |         Private methods.
 92 |         """
 93 | 
 94 |         @staticmethod
 95 |         def _data(obj: Union["pd.DataFrame", np.ndarray]) -> np.ndarray:
 96 |             """
 97 |             Validates the input data to ensure it is either a Pandas DataFrame
 98 |             or Numpy array.
 99 |             :param obj: user-provided input data.
100 |             :return: a vector of values to be used in calculating the local
101 |             outlier probability.
102 |             """
103 |             if obj.__class__.__name__ == "DataFrame":
104 |                 points_vector = obj.values
105 |                 return points_vector
106 |             elif obj.__class__.__name__ == "ndarray":
107 |                 points_vector = obj
108 |                 return points_vector
109 |             else:
110 |                 warnings.warn(
111 |                     "Provided data or distance matrix must be in ndarray "
112 |                     "or DataFrame.",
113 |                     UserWarning,
114 |                 )
115 |                 if isinstance(obj, list):
116 |                     points_vector = np.array(obj)
117 |                     return points_vector
118 |                 points_vector = np.array([obj])
119 |                 return points_vector
120 | 
121 |         def _inputs(self, obj: "LocalOutlierProbability"):
122 |             """
123 |             Validates the inputs provided during initialization to ensure
124 |             that the needed objects are provided.
125 |             :param obj: a PyNomaly object.
126 |             :return: a boolean indicating whether validation has failed or
127 |             the data, distance matrix, and neighbor matrix.
128 |             """
129 |             if all(v is None for v in [obj.data, obj.distance_matrix]):
130 |                 warnings.warn(
131 |                     "Data or a distance matrix must be provided.", UserWarning
132 |                 )
133 |                 return False
134 |             elif all(v is not None for v in [obj.data, obj.distance_matrix]):
135 |                 warnings.warn(
136 |                     "Only one of the following may be provided: data or a "
137 |                     "distance matrix (not both).",
138 |                     UserWarning,
139 |                 )
140 |                 return False
141 |             if obj.data is not None:
142 |                 points_vector = self._data(obj.data)
143 |                 return points_vector, obj.distance_matrix, obj.neighbor_matrix
144 |             if all(
145 |                 matrix is not None
146 |                 for matrix in [obj.neighbor_matrix, obj.distance_matrix]
147 |             ):
148 |                 dist_vector = self._data(obj.distance_matrix)
149 |                 neigh_vector = self._data(obj.neighbor_matrix)
150 |             else:
151 |                 warnings.warn(
152 |                     "A neighbor index matrix and distance matrix must both be "
153 |                     "provided when not using raw input data.",
154 |                     UserWarning,
155 |                 )
156 |                 return False
157 |             if obj.distance_matrix.shape != obj.neighbor_matrix.shape:
158 |                 warnings.warn(
159 |                     "The shape of the distance and neighbor "
160 |                     "index matrices must match.",
161 |                     UserWarning,
162 |                 )
163 |                 return False
164 |             elif (obj.distance_matrix.shape[1] != obj.n_neighbors) or (
165 |                 obj.neighbor_matrix.shape[1] != obj.n_neighbors
166 |             ):
167 |                 warnings.warn(
168 |                     "The shape of the distance or "
169 |                     "neighbor index matrix does not "
170 |                     "match the number of neighbors "
171 |                     "specified.",
172 |                     UserWarning,
173 |                 )
174 |                 return False
175 |             return obj.data, dist_vector, neigh_vector
176 | 
177 |         @staticmethod
178 |         def _cluster_size(obj) -> bool:
179 |             """
180 |             Validates the cluster labels to ensure that the smallest cluster
181 |             size (number of observations in the cluster) is larger than the
182 |             specified number of neighbors.
183 |             :param obj: a PyNomaly object.
184 |             :return: a boolean indicating whether validation has passed.
185 |             """
186 |             c_labels = obj._cluster_labels()
187 |             for cluster_id in set(c_labels):
188 |                 c_size = np.where(c_labels == cluster_id)[0].shape[0]
189 |                 if c_size <= obj.n_neighbors:
190 |                     warnings.warn(
191 |                         "Number of neighbors specified larger than smallest "
192 |                         "cluster. Specify a number of neighbors smaller than "
193 |                         "the smallest cluster size (observations in smallest "
194 |                         "cluster minus one).",
195 |                         UserWarning,
196 |                     )
197 |                     return False
198 |             return True
199 | 
200 |         @staticmethod
201 |         def _n_neighbors(obj) -> bool:
202 |             """
203 |             Validates the specified number of neighbors to ensure that it is
204 |             greater than 0 and that the specified value is less than the total
205 |             number of observations.
206 |             :param obj: a PyNomaly object.
207 |             :return: a boolean indicating whether validation has passed.
208 |             """
209 |             if not obj.n_neighbors > 0:
210 |                 obj.n_neighbors = 10
211 |                 warnings.warn(
212 |                     "n_neighbors must be greater than 0."
213 |                     " Fit with " + str(obj.n_neighbors) + " instead.",
214 |                     UserWarning,
215 |                 )
216 |                 return False
217 |             elif obj.n_neighbors >= obj._n_observations():
218 |                 obj.n_neighbors = obj._n_observations() - 1
219 |                 warnings.warn(
220 |                     "n_neighbors must be less than the number of observations."
221 |                     " Fit with " + str(obj.n_neighbors) + " instead.",
222 |                     UserWarning,
223 |                 )
224 |             return True
225 | 
226 |         @staticmethod
227 |         def _extent(obj) -> bool:
228 |             """
229 |             Validates the specified extent parameter to ensure it is either 1,
230 |             2, or 3.
231 |             :param obj: a PyNomaly object.
232 |             :return: a boolean indicating whether validation has passed.
233 |             """
234 |             if obj.extent not in [1, 2, 3]:
235 |                 warnings.warn(
236 |                     "extent parameter (lambda) must be 1, 2, or 3.", UserWarning
237 |                 )
238 |                 return False
239 |             return True
240 | 
241 |         @staticmethod
242 |         def _missing_values(obj) -> bool:
243 |             """
244 |             Validates the provided data to ensure that it contains no
245 |             missing values.
246 |             :param obj: a PyNomaly object.
247 |             :return: a boolean indicating whether validation has passed.
248 |             """
249 |             if np.any(np.isnan(obj.data)):
250 |                 warnings.warn(
251 |                     "Method does not support missing values in input data.", UserWarning
252 |                 )
253 |                 return False
254 |             return True
255 | 
256 |         @staticmethod
257 |         def _fit(obj) -> bool:
258 |             """
259 |             Validates that the model was fit prior to calling the stream()
260 |             method.
261 |             :param obj: a PyNomaly object.
262 |             :return: a boolean indicating whether validation has passed.
263 |             """
264 |             if obj.is_fit is False:
265 |                 warnings.warn(
266 |                     "Must fit on historical data by calling fit() prior to "
267 |                     "calling stream(x).",
268 |                     UserWarning,
269 |                 )
270 |                 return False
271 |             return True
272 | 
273 |         @staticmethod
274 |         def _no_cluster_labels(obj) -> bool:
275 |             """
276 |             Checks to see if cluster labels are attempting to be used in
277 |             stream() and, if so, calls fit() once again but without cluster
278 |             labels. As PyNomaly does not accept clustering algorithms as input,
279 |             the stream approach does not support clustering.
280 |             :param obj: a PyNomaly object.
281 |             :return: a boolean indicating whether validation has passed.
282 |             """
283 |             if len(set(obj._cluster_labels())) > 1:
284 |                 warnings.warn(
285 |                     "Stream approach does not support clustered data. "
286 |                     "Automatically refit using single cluster of points.",
287 |                     UserWarning,
288 |                 )
289 |                 return False
290 |             return True
291 | 
292 |     """
293 |     Decorators.
294 |     """
295 | 
296 |     def accepts(*types):
297 |         """
298 |         A decorator that facilitates a form of type checking for the inputs
299 |         which can be used in Python 3.4-3.7 in lieu of Python 3.5+'s type
300 |         hints.
301 |         :param types: the input types of the objects being passed as arguments
302 |         in __init__.
303 |         :return: a decorator.
304 |         """
305 | 
306 |         def decorator(f):
307 |             assert len(types) == f.__code__.co_argcount
308 | 
309 |             def new_f(*args, **kwds):
310 |                 for a, t in zip(args, types):
311 |                     if type(a).__name__ == "DataFrame":
312 |                         a = np.array(a)
313 |                     if isinstance(a, t) is False:
314 |                         warnings.warn(
315 |                             "Argument %r is not of type %s" % (a, t), UserWarning
316 |                         )
317 |                 opt_types = {
318 |                     "distance_matrix": {"type": types[2]},
319 |                     "neighbor_matrix": {"type": types[3]},
320 |                     "extent": {"type": types[4]},
321 |                     "n_neighbors": {"type": types[5]},
322 |                     "cluster_labels": {"type": types[6]},
323 |                     "use_numba": {"type": types[7]},
324 |                     "progress_bar": {"type": types[8]},
325 |                 }
326 |                 for x in kwds:
327 |                     opt_types[x]["value"] = kwds[x]
328 |                 for k in opt_types:
329 |                     try:
330 |                         if (
331 |                             isinstance(opt_types[k]["value"], opt_types[k]["type"])
332 |                             is False
333 |                         ):
334 |                             warnings.warn(
335 |                                 "Argument %r is not of type %s."
336 |                                 % (k, opt_types[k]["type"]),
337 |                                 UserWarning,
338 |                             )
339 |                     except KeyError:
340 |                         pass
341 |                 return f(*args, **kwds)
342 | 
343 |             new_f.__name__ = f.__name__
344 |             return new_f
345 | 
346 |         return decorator
347 | 
348 |     @accepts(
349 |         object,
350 |         np.ndarray,
351 |         np.ndarray,
352 |         np.ndarray,
353 |         (int, np.integer),
354 |         (int, np.integer),
355 |         list,
356 |         bool,
357 |         bool,
358 |     )
359 |     def __init__(
360 |         self,
361 |         data=None,
362 |         distance_matrix=None,
363 |         neighbor_matrix=None,
364 |         extent=3,
365 |         n_neighbors=10,
366 |         cluster_labels=None,
367 |         use_numba=False,
368 |         progress_bar=False,
369 |     ) -> None:
370 |         self.data = data
371 |         self.distance_matrix = distance_matrix
372 |         self.neighbor_matrix = neighbor_matrix
373 |         self.extent = extent
374 |         self.n_neighbors = n_neighbors
375 |         self.cluster_labels = cluster_labels
376 |         self.use_numba = use_numba
377 |         self.points_vector = None
378 |         self.prob_distances = None
379 |         self.prob_distances_ev = None
380 |         self.norm_prob_local_outlier_factor = None
381 |         self.local_outlier_probabilities = None
382 |         self._objects = {}
383 |         self.progress_bar = progress_bar
384 |         self.is_fit = False
385 | 
386 |         if self.use_numba is True and "numba" not in sys.modules:
387 |             self.use_numba = False
388 |             warnings.warn(
389 |                 "Numba is not available, falling back to pure python mode.", UserWarning
390 |             )
391 | 
392 |         self.Validate()._inputs(self)
393 |         self.Validate._extent(self)
394 | 
395 |     """
396 |     Private methods.
397 |     """
398 | 
399 |     @staticmethod
400 |     def _standard_distance(cardinality: float, sum_squared_distance: float) -> float:
401 |         """
402 |         Calculates the standard distance of an observation.
403 |         :param cardinality: the cardinality of the input observation.
404 |         :param sum_squared_distance: the sum squared distance between all
405 |         neighbors of the input observation.
406 |         :return: the standard distance.
407 |         #"""
408 |         division_result = sum_squared_distance / cardinality
409 |         st_dist = sqrt(division_result)
410 |         return st_dist
411 | 
412 |     @staticmethod
413 |     def _prob_distance(extent: int, standard_distance: float) -> float:
414 |         """
415 |         Calculates the probabilistic distance of an observation.
416 |         :param extent: the extent value specified during initialization.
417 |         :param standard_distance: the standard distance of the input
418 |         observation.
419 |         :return: the probabilistic distance.
420 |         """
421 |         return extent * standard_distance
422 | 
423 |     @staticmethod
424 |     def _prob_outlier_factor(
425 |         probabilistic_distance: np.ndarray, ev_prob_dist: np.ndarray
426 |     ) -> np.ndarray:
427 |         """
428 |         Calculates the probabilistic outlier factor of an observation.
429 |         :param probabilistic_distance: the probabilistic distance of the
430 |         input observation.
431 |         :param ev_prob_dist:
432 |         :return: the probabilistic outlier factor.
433 |         """
434 |         if np.all(probabilistic_distance == ev_prob_dist):
435 |             return np.zeros(probabilistic_distance.shape)
436 |         else:
437 |             ev_prob_dist[ev_prob_dist == 0.0] = 1.0e-8
438 |             result = np.divide(probabilistic_distance, ev_prob_dist) - 1.0
439 |             return result
440 | 
441 |     @staticmethod
442 |     def _norm_prob_outlier_factor(
443 |         extent: float, ev_probabilistic_outlier_factor: list
444 |     ) -> list:
445 |         """
446 |         Calculates the normalized probabilistic outlier factor of an
447 |         observation.
448 |         :param extent: the extent value specified during initialization.
449 |         :param ev_probabilistic_outlier_factor: the expected probabilistic
450 |         outlier factor of the input observation.
451 |         :return: the normalized probabilistic outlier factor.
452 |         """
453 |         npofs = []
454 |         for i in ev_probabilistic_outlier_factor:
455 |             npofs.append(extent * sqrt(i))
456 |         return npofs
457 | 
458 |     @staticmethod
459 |     def _local_outlier_probability(
460 |         plof_val: np.ndarray, nplof_val: np.ndarray
461 |     ) -> np.ndarray:
462 |         """
463 |         Calculates the local outlier probability of an observation.
464 |         :param plof_val: the probabilistic outlier factor of the input
465 |         observation.
466 |         :param nplof_val: the normalized probabilistic outlier factor of the
467 |         input observation.
468 |         :return: the local outlier probability.
469 |         """
470 |         erf_vec = np.vectorize(erf)
471 |         if np.all(plof_val == nplof_val):
472 |             return np.zeros(plof_val.shape)
473 |         else:
474 |             return np.maximum(0, erf_vec(plof_val / (nplof_val * np.sqrt(2.0))))
475 | 
476 |     def _n_observations(self) -> int:
477 |         """
478 |         Calculates the number of observations in the data.
479 |         :return: the number of observations in the input data.
480 |         """
481 |         if self.data is not None:
482 |             return len(self.data)
483 |         return len(self.distance_matrix)
484 | 
485 |     def _store(self) -> np.ndarray:
486 |         """
487 |         Initializes the storage matrix that includes the input value,
488 |         cluster labels, local outlier probability, etc. for the input data.
489 |         :return: an empty numpy array of shape [n_observations, 3].
490 |         """
491 |         return np.empty([self._n_observations(), 3], dtype=object)
492 | 
493 |     def _cluster_labels(self) -> np.ndarray:
494 |         """
495 |         Returns a numpy array of cluster labels that corresponds to the
496 |         input labels or that is an array of all 0 values to indicate all
497 |         points belong to the same cluster.
498 |         :return: a numpy array of cluster labels.
499 |         """
500 |         if self.cluster_labels is None:
501 |             if self.data is not None:
502 |                 return np.array([0] * len(self.data))
503 |             return np.array([0] * len(self.distance_matrix))
504 |         return np.array(self.cluster_labels)
505 | 
506 |     @staticmethod
507 |     def _euclidean(vector1: np.ndarray, vector2: np.ndarray) -> np.ndarray:
508 |         """
509 |         Calculates the euclidean distance between two observations in the
510 |         input data.
511 |         :param vector1: a numpy array corresponding to observation 1.
512 |         :param vector2: a numpy array corresponding to observation 2.
513 |         :return: the euclidean distance between the two observations.
514 |         """
515 |         diff = vector1 - vector2
516 |         return np.dot(diff, diff) ** 0.5
517 | 
518 |     def _assign_distances(self, data_store: np.ndarray) -> np.ndarray:
519 |         """
520 |         Takes a distance matrix, produced by _distances or provided through
521 |         user input, and assigns distances for each observation to the storage
522 |         matrix, data_store.
523 |         :param data_store: the storage matrix that collects information on
524 |         each observation.
525 |         :return: the updated storage matrix that collects information on
526 |         each observation.
527 |         """
528 |         for vec, cluster_id in zip(
529 |             range(self.distance_matrix.shape[0]), self._cluster_labels()
530 |         ):
531 |             data_store[vec][0] = cluster_id
532 |             data_store[vec][1] = self.distance_matrix[vec]
533 |             data_store[vec][2] = self.neighbor_matrix[vec]
534 |         return data_store
535 | 
536 |     @staticmethod
537 |     def _compute_distance_and_neighbor_matrix(
538 |         clust_points_vector: np.ndarray,
539 |         indices: np.ndarray,
540 |         distances: np.ndarray,
541 |         indexes: np.ndarray,
542 |     ) -> Tuple[np.ndarray, np.ndarray, int]:
543 |         """
544 |         This helper method provides the heavy lifting for the _distances
545 |         method and is only intended for use therein. The code has been
546 |         written so that it can make full use of Numba's jit capabilities if
547 |         desired.
548 |         """
549 |         for i in range(clust_points_vector.shape[0]):
550 |             for j in range(i + 1, clust_points_vector.shape[0]):
551 |                 # Global index of the points
552 |                 global_i = indices[0][i]
553 |                 global_j = indices[0][j]
554 | 
555 |                 # Compute Euclidean distance
556 |                 diff = clust_points_vector[i] - clust_points_vector[j]
557 |                 d = np.dot(diff, diff) ** 0.5
558 | 
559 |                 # Update distance and neighbor index for global_i
560 |                 idx_max = distances[global_i].argmax()
561 |                 if d < distances[global_i][idx_max]:
562 |                     distances[global_i][idx_max] = d
563 |                     indexes[global_i][idx_max] = global_j
564 | 
565 |                 # Update distance and neighbor index for global_j
566 |                 idx_max = distances[global_j].argmax()
567 |                 if d < distances[global_j][idx_max]:
568 |                     distances[global_j][idx_max] = d
569 |                     indexes[global_j][idx_max] = global_i
570 | 
571 |             yield distances, indexes, i
572 | 
573 |     def _distances(self, progress_bar: bool = False) -> None:
574 |         """
575 |         Provides the distances between each observation and it's closest
576 |         neighbors. When input data is provided, calculates the euclidean
577 |         distance between every observation. Otherwise, the user-provided
578 |         distance matrix is used.
579 |         :return: the updated storage matrix that collects information on
580 |         each observation.
581 |         """
582 |         distances = np.full(
583 |             [self._n_observations(), self.n_neighbors], 9e10, dtype=float
584 |         )
585 |         indexes = np.full([self._n_observations(), self.n_neighbors], 9e10, dtype=float)
586 |         self.points_vector = self.Validate._data(self.data)
587 |         compute = (
588 |             numba.jit(self._compute_distance_and_neighbor_matrix, cache=True)
589 |             if self.use_numba
590 |             else self._compute_distance_and_neighbor_matrix
591 |         )
592 |         progress = "="
593 |         for cluster_id in set(self._cluster_labels()):
594 |             indices = np.where(self._cluster_labels() == cluster_id)
595 |             clust_points_vector = np.array(
596 |                 self.points_vector.take(indices, axis=0)[0], dtype=np.float64
597 |             )
598 |             # a generator that yields an updated distance matrix on each loop
599 |             for c in compute(clust_points_vector, indices, distances, indexes):
600 |                 distances, indexes, i = c
601 |                 # update the progress bar
602 |                 if progress_bar is True:
603 |                     progress = Utils.emit_progress_bar(
604 |                         progress, i + 1, clust_points_vector.shape[0]
605 |                     )
606 | 
607 |         self.distance_matrix = distances
608 |         self.neighbor_matrix = indexes
609 | 
610 |     def _ssd(self, data_store: np.ndarray) -> np.ndarray:
611 |         """
612 |         Calculates the sum squared distance between neighbors for each
613 |         observation in the input data.
614 |         :param data_store: the storage matrix that collects information on
615 |         each observation.
616 |         :return: the updated storage matrix that collects information on
617 |         each observation.
618 |         """
619 |         self.cluster_labels_u = np.unique(data_store[:, 0])
620 |         ssd_array = np.empty([self._n_observations(), 1])
621 |         for cluster_id in self.cluster_labels_u:
622 |             indices = np.where(data_store[:, 0] == cluster_id)
623 |             cluster_distances = np.take(data_store[:, 1], indices).tolist()
624 |             ssd = np.power(cluster_distances[0], 2).sum(axis=1)
625 |             for i, j in zip(indices[0], ssd):
626 |                 ssd_array[i] = j
627 |         data_store = np.hstack((data_store, ssd_array))
628 |         return data_store
629 | 
630 |     def _standard_distances(self, data_store: np.ndarray) -> np.ndarray:
631 |         """
632 |         Calculated the standard distance for each observation in the input
633 |         data. First calculates the cardinality and then calculates the standard
634 |         distance with respect to each observation.
635 |         :param data_store:
636 |         :param data_store: the storage matrix that collects information on
637 |         each observation.
638 |         :return: the updated storage matrix that collects information on
639 |         each observation.
640 |         """
641 |         cardinality = [self.n_neighbors] * self._n_observations()
642 |         vals = data_store[:, 3].tolist()
643 |         std_distances = []
644 |         for c, v in zip(cardinality, vals):
645 |             std_distances.append(self._standard_distance(c, v))
646 |         return np.hstack((data_store, np.array([std_distances]).T))
647 | 
648 |     def _prob_distances(self, data_store: np.ndarray) -> np.ndarray:
649 |         """
650 |         Calculates the probabilistic distance for each observation in the
651 |         input data.
652 |         :param data_store: the storage matrix that collects information on
653 |         each observation.
654 |         :return: the updated storage matrix that collects information on
655 |         each observation.
656 |         """
657 |         prob_distances = []
658 |         for i in range(data_store[:, 4].shape[0]):
659 |             prob_distances.append(self._prob_distance(self.extent, data_store[:, 4][i]))
660 |         return np.hstack((data_store, np.array([prob_distances]).T))
661 | 
662 |     def _prob_distances_ev(self, data_store) -> np.ndarray:
663 |         """
664 |         Calculates the expected value of the probabilistic distance for
665 |         each observation in the input data with respect to the cluster the
666 |         observation belongs to.
667 |         :param data_store: the storage matrix that collects information on
668 |         each observation.
669 |         :return: the updated storage matrix that collects information on
670 |         each observation.
671 |         """
672 |         prob_set_distance_ev = np.empty([self._n_observations(), 1])
673 |         for cluster_id in self.cluster_labels_u:
674 |             indices = np.where(data_store[:, 0] == cluster_id)[0]
675 |             for index in indices:
676 |                 # Global neighbor indices for the current point
677 |                 nbrhood = data_store[index][2].astype(int)  # Ensure global indices
678 |                 nbrhood_prob_distances = np.take(data_store[:, 5], nbrhood).astype(
679 |                     float
680 |                 )
681 |                 nbrhood_prob_distances_nonan = nbrhood_prob_distances[
682 |                     np.logical_not(np.isnan(nbrhood_prob_distances))
683 |                 ]
684 |                 prob_set_distance_ev[index] = nbrhood_prob_distances_nonan.mean()
685 | 
686 |         self.prob_distances_ev = prob_set_distance_ev
687 |         return np.hstack((data_store, prob_set_distance_ev))
688 | 
689 |     def _prob_local_outlier_factors(self, data_store: np.ndarray) -> np.ndarray:
690 |         """
691 |         Calculates the probabilistic local outlier factor for each
692 |         observation in the input data.
693 |         :param data_store: the storage matrix that collects information on
694 |         each observation.
695 |         :return: the updated storage matrix that collects information on
696 |         each observation.
697 |         """
698 |         return np.hstack(
699 |             (
700 |                 data_store,
701 |                 np.array(
702 |                     [
703 |                         np.apply_along_axis(
704 |                             self._prob_outlier_factor,
705 |                             0,
706 |                             data_store[:, 5],
707 |                             data_store[:, 6],
708 |                         )
709 |                     ]
710 |                 ).T,
711 |             )
712 |         )
713 | 
714 |     def _prob_local_outlier_factors_ev(self, data_store: np.ndarray) -> np.ndarray:
715 |         """
716 |         Calculates the expected value of the probabilistic local outlier factor
717 |         for each observation in the input data with respect to the cluster the
718 |         observation belongs to.
719 |         :param data_store: the storage matrix that collects information on
720 |         each observation.
721 |         :return: the updated storage matrix that collects information on
722 |         each observation.
723 |         """
724 |         prob_local_outlier_factor_ev_dict = {}
725 |         for cluster_id in self.cluster_labels_u:
726 |             indices = np.where(data_store[:, 0] == cluster_id)
727 |             prob_local_outlier_factors = np.take(data_store[:, 7], indices).astype(
728 |                 float
729 |             )
730 |             prob_local_outlier_factors_nonan = prob_local_outlier_factors[
731 |                 np.logical_not(np.isnan(prob_local_outlier_factors))
732 |             ]
733 |             prob_local_outlier_factor_ev_dict[cluster_id] = np.power(
734 |                 prob_local_outlier_factors_nonan, 2
735 |             ).sum() / float(prob_local_outlier_factors_nonan.size)
736 |         data_store = np.hstack(
737 |             (
738 |                 data_store,
739 |                 np.array(
740 |                     [
741 |                         [
742 |                             prob_local_outlier_factor_ev_dict[x]
743 |                             for x in data_store[:, 0].tolist()
744 |                         ]
745 |                     ]
746 |                 ).T,
747 |             )
748 |         )
749 |         return data_store
750 | 
751 |     def _norm_prob_local_outlier_factors(self, data_store: np.ndarray) -> np.ndarray:
752 |         """
753 |         Calculates the normalized probabilistic local outlier factor for each
754 |         observation in the input data.
755 |         :param data_store: the storage matrix that collects information on
756 |         each observation.
757 |         :return: the updated storage matrix that collects information on
758 |         each observation.
759 |         """
760 |         return np.hstack(
761 |             (
762 |                 data_store,
763 |                 np.array(
764 |                     [
765 |                         self._norm_prob_outlier_factor(
766 |                             self.extent, data_store[:, 8].tolist()
767 |                         )
768 |                     ]
769 |                 ).T,
770 |             )
771 |         )
772 | 
773 |     def _local_outlier_probabilities(self, data_store: np.ndarray) -> np.ndarray:
774 |         """
775 |         Calculates the local outlier probability for each observation in the
776 |         input data.
777 |         :param data_store: the storage matrix that collects information on
778 |         each observation.
779 |         :return: the updated storage matrix that collects information on
780 |         each observation.
781 |         """
782 |         return np.hstack(
783 |             (
784 |                 data_store,
785 |                 np.array(
786 |                     [
787 |                         np.apply_along_axis(
788 |                             self._local_outlier_probability,
789 |                             0,
790 |                             data_store[:, 7],
791 |                             data_store[:, 9],
792 |                         )
793 |                     ]
794 |                 ).T,
795 |             )
796 |         )
797 | 
798 |     """
799 |     Public methods
800 |     """
801 | 
802 |     def fit(self) -> "LocalOutlierProbability":
803 |         """
804 |         Calculates the local outlier probability for each observation in the
805 |         input data according to the input parameters extent, n_neighbors, and
806 |         cluster_labels.
807 |         :return: self, which contains the local outlier probabilities as
808 |         self.local_outlier_probabilities.
809 |         """
810 | 
811 |         self.Validate._n_neighbors(self)
812 |         if self.Validate._cluster_size(self) is False:
813 |             sys.exit()
814 |         if self.data is not None and self.Validate._missing_values(self) is False:
815 |             sys.exit()
816 | 
817 |         store = self._store()
818 |         if self.data is not None:
819 |             self._distances(progress_bar=self.progress_bar)
820 |         store = self._assign_distances(store)
821 |         store = self._ssd(store)
822 |         store = self._standard_distances(store)
823 |         store = self._prob_distances(store)
824 |         self.prob_distances = store[:, 5]
825 |         store = self._prob_distances_ev(store)
826 |         store = self._prob_local_outlier_factors(store)
827 |         store = self._prob_local_outlier_factors_ev(store)
828 |         store = self._norm_prob_local_outlier_factors(store)
829 |         self.norm_prob_local_outlier_factor = store[:, 9].max()
830 |         store = self._local_outlier_probabilities(store)
831 |         self.local_outlier_probabilities = store[:, 10]
832 | 
833 |         self.is_fit = True
834 | 
835 |         return self
836 | 
837 |     def stream(self, x: np.ndarray) -> np.ndarray:
838 |         """
839 |         Calculates the local outlier probability for an individual sample
840 |         according to the input parameters extent, n_neighbors, and
841 |         cluster_labels after first calling fit(). Observations are assigned
842 |         a local outlier probability against the mean of expected values of
843 |         probabilistic distance and the normalized probabilistic outlier
844 |         factor from the earlier model, provided when calling fit().
845 |         distance
846 |         :param x: an observation to score for its local outlier probability.
847 |         :return: the local outlier probability of the input observation.
848 |         """
849 | 
850 |         orig_cluster_labels = None
851 |         if self.Validate._no_cluster_labels(self) is False:
852 |             orig_cluster_labels = self.cluster_labels
853 |             self.cluster_labels = np.array([0] * len(self.data))
854 | 
855 |         if self.Validate._fit(self) is False:
856 |             self.fit()
857 | 
858 |         point_vector = self.Validate._data(x)
859 |         distances = np.full([1, self.n_neighbors], 9e10, dtype=float)
860 |         if self.data is not None:
861 |             matrix = self.points_vector
862 |         else:
863 |             matrix = self.distance_matrix
864 |         for p in range(0, matrix.shape[0]):
865 |             if self.data is not None:
866 |                 d = self._euclidean(matrix[p, :], point_vector)
867 |             else:
868 |                 d = point_vector
869 |             idx_max = distances[0].argmax()
870 |             if d < distances[0][idx_max]:
871 |                 distances[0][idx_max] = d
872 | 
873 |         ssd = np.power(distances, 2).sum()
874 |         std_dist = np.sqrt(np.divide(ssd, self.n_neighbors))
875 |         prob_dist = self._prob_distance(self.extent, std_dist)
876 |         plof = self._prob_outlier_factor(
877 |             np.array(prob_dist), np.array(self.prob_distances_ev.mean())
878 |         )
879 |         loop = self._local_outlier_probability(
880 |             plof, self.norm_prob_local_outlier_factor
881 |         )
882 | 
883 |         if orig_cluster_labels is not None:
884 |             self.cluster_labels = orig_cluster_labels
885 | 
886 |         return loop
887 | 


--------------------------------------------------------------------------------
/changelog.md:
--------------------------------------------------------------------------------
  1 | # Changelog
  2 | All notable changes to PyNomaly will be documented in this Changelog.
  3 | 
  4 | The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) 
  5 | and adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
  6 | 
  7 | ## 0.3.4 
  8 | ### Changed 
  9 | - Changed source code as necessary to address a [user-reported issue](https://github.com/vc1492a/PyNomaly/issues/49), corrected in [this commit](https://github.com/vc1492a/PyNomaly/commit/bbdd12a318316ca9c7e0272a5b06909f3fc4f9b0)
 10 | 
 11 | ## 0.3.3
 12 | ### Changed
 13 | - The implementation of the progress bar to support use when the number of 
 14 | observations is less than the width of the Python console in which the code 
 15 | is being executed (tracked in [this issue](https://github.com/vc1492a/PyNomaly/issues/35)).
 16 | ### Added
 17 | - Docstring to the testing functions to provide some additional documentation 
 18 | of the testing (tracked in [this issue](https://github.com/vc1492a/PyNomaly/issues/41)).
 19 | 
 20 | ## 0.3.2
 21 | ### Changed
 22 | - Removed numba as a strict dependency, which is now an optional dependency 
 23 | that is not needed to use PyNomaly but which provides performance enhancements 
 24 | when functions are called repeatedly, such as when the number of observations 
 25 | is large. This relaxes the numba requirement introduced in version 0.3.0. 
 26 | ### Added
 27 | - Added progress bar functionality that can be called using 
 28 | `LocalOutlierProbability(progress_bar=True)` in both native 
 29 | Python and numba just-in-time (JIT) compiled modes. 
 30 | This is helpful in cases where PyNomaly is processing a large amount 
 31 | of observations.  
 32 | 
 33 | 
 34 | ## 0.3.1
 35 | ### Changed
 36 | - Removed Numba JIT compilation from the `_standard_distance` and 
 37 | `_prob_distance` calculations. Using Numba JIT compilation there does 
 38 | not result in a speed improvement and only add compilation overhead.
 39 | - Integrated [pull request #33](https://github.com/vc1492a/PyNomaly/pull/33) 
 40 | which decreases runtime about 30 to more than 90 percent in some cases, in 
 41 | particular on repeated calls with larger datasets. 
 42 | ### Added
 43 | - Type hinting for unit tests in `tests/test_loop.py`.
 44 | 
 45 | ## 0.3.0
 46 | ### Changed
 47 | - The manner in which the standard distance is calculated from list 
 48 | comprehension to a vectorized Numpy implementation, reducing compute 
 49 | time for that specific calculation by approximately 75%. 
 50 | - Removed formal testing and support for Python 3.4 
 51 | ([Python 3 adoption rates](https://rushter.com/blog/python-3-adoption/)).
 52 | - Raised the minimum numpy version requirement from 1.12.0 to 1.16.3.
 53 | ### Added 
 54 | - Numba just in time (JIT) compilation to improve the speed of some 
 55 | of the core functionality, consistently achieving a further 20% reduction 
 56 | in compute time when _n_ = 1000. Future optimizations could yield 
 57 | further reductions in computation time. For now, requiring a strict numba version of `0.43.1` 
 58 | in anticipation of [this deprecation](http://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-reflection-for-list-and-set-types) - 
 59 | which does not yet have an implemented solution. 
 60 | 
 61 | ## 0.2.7
 62 | ### Changed
 63 | - Integrated various performance enhancements as described in 
 64 | [pull request #30](https://github.com/vc1492a/PyNomaly/pull/30) that 
 65 | increase PyNomaly's performance by at least up to 50% in some cases.
 66 | - The Validate classes functions from public to private, as they are only 
 67 | used in validating specification and data input into PyNomaly.
 68 | ### Added
 69 | - [Issue #27](https://github.com/vc1492a/PyNomaly/issues/27) - Added 
 70 | docstring to key functions in PyNomaly to ease future development and 
 71 | provide additional information.
 72 | - Additional unit tests to raise code coverage from 96% to 100%.
 73 | 
 74 | ## 0.2.6
 75 | ### Fixed
 76 | - [Issue #25](https://github.com/vc1492a/PyNomaly/issues/25) - Fixed an issue
 77 | that caused zero division errors when all the values in a neighborhood are
 78 | duplicate samples.
 79 | ### Changed
 80 | - The error behavior when attempting to use the stream approach
 81 | before calling `fit`. While the previous implementation resulted in a
 82 | warning and system exit, PyNomaly now attempts to `fit` (assumes data or a
 83 | distance matrix is available) and then later calls `stream`. If no
 84 | data or distance matrix is provided, a warning is raised.
 85 | ### Added
 86 | - [Issue #24](https://github.com/vc1492a/PyNomaly/issues/24) - Added
 87 | the ability to use one's own distance matrix,
 88 | provided a neighbor index matrix is also provided. This ensures
 89 | PyNomaly can be used with distances other than the euclidean.
 90 | See the file `iris_dist_grid.py` for examples.
 91 | - [Issue #23](https://github.com/vc1492a/PyNomaly/issues/23) - Added
 92 | Python 3.7 to the tested distributions in Travis CI and passed tests.
 93 | - Unit tests to monitor the issues and features covered
 94 | in issues [24](https://github.com/vc1492a/PyNomaly/issues/24) and
 95 | [25](https://github.com/vc1492a/PyNomaly/issues/25).
 96 | 
 97 | 
 98 | ## 0.2.5
 99 | ### Fixed
100 | - [Issue #20](https://github.com/vc1492a/PyNomaly/issues/20) - Fixed
101 | a bug that inadvertently used global means of the probabilistic distance
102 | as the expected value of the probabilistic distance, as opposed to the
103 | expected value of the probabilistic distance within a neighborhood of
104 | a point.
105 | - Integrated [pull request #21](https://github.com/vc1492a/PyNomaly/pull/21) -
106 | This pull request addressed the issue noted above.
107 | ### Changed
108 | - Changed the default behavior to strictly not supporting the
109 | use of missing values in the input data, as opposed to the soft enforcement
110 | (a simple user warning) used in the previous behavior.
111 | 
112 | ## 0.2.4
113 | ### Fixed
114 | - [Issue #17](https://github.com/vc1492a/PyNomaly/issues/17) - Fixed
115 | a bug that allowed for a column of empty values in the primary data store.
116 | - Integrated [pull request #18](https://github.com/vc1492a/PyNomaly/pull/18) -
117 | Fixed a bug that was not causing dependencies such as numpy to skip
118 | installation when installing PyNomaly via pip.
119 | 
120 | ## 0.2.3
121 | ### Fixed
122 | - [Issue #14](https://github.com/vc1492a/PyNomaly/issues/14) - Fixed an issue
123 | that was causing a ZeroDivisionError when the specified neighborhood size
124 | is larger than the total number of observations in the smallest cluster.
125 | 
126 | ## 0.2.2
127 | ### Changed
128 | - This implementation to align more closely with the specification of the
129 | approach in the original paper. The extent parameter now takes an integer
130 | value of 1, 2, or 3 that corresponds to the lambda parameter specified
131 | in the paper. See the [readme](https://github.com/vc1492a/PyNomaly/blob/master/readme.md) for more details.
132 | - Refactored the code base and created the Validate class, which includes
133 | checks for data type, correct specification, and other dependencies.
134 | ### Added
135 | - Automated tests to ensure the desired functionality is being met can now be
136 | found in the `PyNomaly/tests` directory.
137 | - Code for the examples in the readme can now be found in the `examples` directory.
138 | - Additional information for parameter selection in the [readme](https://github.com/vc1492a/PyNomaly/blob/master/readme.md).
139 | 
140 | ## 0.2.1
141 | ### Fixed
142 | - [Issue #10](https://github.com/vc1492a/PyNomaly/issues/10) - Fixed error on line
143 | 142 which was causing the class to fail. More explicit examples
144 | were also included in the readme for using numpy arrays.
145 | 
146 | ### Added
147 | - An improvement to the Euclidean distance calculation by [MichaelSchreier](https://github.com/MichaelSchreier)
148 | which brings a over a 50% reduction in computation time.
149 | 
150 | ## 0.2.0
151 | ### Added
152 | - Added new functionality to PyNomaly by integrating a modified LoOP
153 | approach introduced by Hamlet et al. which can be used for streaming
154 | data applications or in the case where computational expense is a concern.
155 | Data is first fit to a "training set", with any additional observations
156 | considered for outlierness against this initial set.
157 | 
158 | ## 0.1.8
159 | ### Fixed
160 | - Fixed an issue which allowed the number of neighbors considered to exceed the number of observations. Added a check
161 | to ensure this is no longer possible.
162 | 
163 | ## 0.1.7
164 | ### Fixed
165 | - Fixed an issue inadvertently introduced in 0.1.6 that caused distance calculations to be incorrect, 
166 | thus resulting in incorrect LoOP values.  
167 | 
168 | ## 0.1.6
169 | ### Fixed
170 | - Updated the distance calculation such that the euclidean distance calculation has been separated from 
171 | the main distance calculation function.
172 | - Fixed an error in the calculation of the standard distance. 
173 | 
174 | ### Changed
175 | - .fit() now returns a fitted object instead of local_outlier_probabilities. Local outlier probabilities can 
176 | be now be retrieved by calling .local_outlier_probabilities. See the readme for an example. 
177 | - Some private functions have been renamed. 
178 | 
179 | ## 0.1.5
180 | ### Fixed
181 | - [Issue #4](https://github.com/vc1492a/PyNomaly/issues/4) - Separated parameter type checks 
182 | from checks for invalid parameter values.
183 |     - @accepts decorator verifies LocalOutlierProbability parameters are of correct type.
184 |     - Parameter value checks moved from .fit() to init.
185 | - Fixed parameter check to ensure extent value is in the range (0., 1.] instead of [0, 1] (extent cannot be zero). 
186 | - [Issue #1](https://github.com/vc1492a/PyNomaly/issues/1) -  Added type check using @accepts decorator for cluster_labels.    
187 | 
188 | ## 0.1.4
189 | ### Fixed
190 | - [Issue #3](https://github.com/vc1492a/PyNomaly/issues/3) - .fit() fails if the sum of squared distances sums to 0.
191 |     - Added check to ensure the sum of square distances is greater than zero.
192 |     - Added UserWarning to increase the neighborhood size if all neighbors in n_neighbors are 
193 |     zero distance from an observation. 
194 | - Added UserWarning to check for integer type n_neighbor conditions versus float type.
195 | - Changed calculation of the probabilistic local outlier factor expected value to Numpy operation
196 |     from base Python. 
197 |     
198 | ## 0.1.3
199 | ### Fixed
200 | - Altered the distance matrix computation to return a triangular matrix instead of a 
201 | fully populated matrix. This was made to ensure no duplicate neighbors were present 
202 | in computing the neighborhood distance for each observation. 
203 | 
204 | ## 0.1.2
205 | ### Added
206 | - LICENSE.txt file of Apache License, Version 2.0.
207 | - setup.py, setup.cfg files configured for release to PyPi.
208 | - Changed name throughout code base from PyLoOP to PyNomaly.
209 | 
210 | ### Other
211 | - Initial release to PyPi.
212 | 
213 | ## 0.1.1
214 | ### Other
215 | - A bad push to PyPi necessitated the need to skip a version number. 
216 |     - Chosen name of PyLoOP not present on test index but present on production PyPi index. 
217 |     - Issue not known until push was made to the test index.
218 |     - Skipped version number to align test and production PyPi indices.
219 | 
220 | ## 0.1.0 - 2017-05-19
221 | ### Added
222 | - readme.md file documenting methodology, package dependencies, use cases, 
223 | how to contribute, and acknowledgements.
224 | - Initial open release of PyNomaly codebase on Github. 
225 | 


--------------------------------------------------------------------------------
/dist/PyNomaly-0.1.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.1.0.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.1.1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.1.1.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.1.2.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.1.2.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.1.3.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.1.3.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.1.4.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.1.4.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.1.5.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.1.5.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.1.6.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.1.6.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.1.7.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.1.7.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.1.8.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.1.8.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.2.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.2.0.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.2.1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.2.1.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.2.2.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.2.2.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.2.4.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.2.4.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.2.5.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.2.5.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.2.6.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.2.6.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.2.7.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.2.7.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.3.0-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.3.0-py3-none-any.whl


--------------------------------------------------------------------------------
/dist/PyNomaly-0.3.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.3.0.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.3.1-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.3.1-py3-none-any.whl


--------------------------------------------------------------------------------
/dist/PyNomaly-0.3.1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.3.1.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.3.2-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.3.2-py3-none-any.whl


--------------------------------------------------------------------------------
/dist/PyNomaly-0.3.2.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.3.2.tar.gz


--------------------------------------------------------------------------------
/dist/PyNomaly-0.3.3-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.3.3-py3-none-any.whl


--------------------------------------------------------------------------------
/dist/PyNomaly-0.3.3.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/dist/PyNomaly-0.3.3.tar.gz


--------------------------------------------------------------------------------
/examples/iris.py:
--------------------------------------------------------------------------------
 1 | from PyNomaly import loop
 2 | import pandas as pd
 3 | from pydataset import data
 4 | from sklearn.cluster import DBSCAN
 5 | import matplotlib.pyplot as plt
 6 | from mpl_toolkits.mplot3d import Axes3D
 7 | 
 8 | 
 9 | iris = pd.DataFrame(data('iris'))
10 | iris = pd.DataFrame(iris.drop('Species', 1))
11 | 
12 | 
13 | db = DBSCAN(eps=0.9, min_samples=10).fit(iris)
14 | m = loop.LocalOutlierProbability(iris).fit()
15 | scores_noclust = m.local_outlier_probabilities
16 | m_clust = loop.LocalOutlierProbability(iris, cluster_labels=list(db.labels_)).fit()
17 | scores_clust = m_clust.local_outlier_probabilities
18 | 
19 | 
20 | iris_clust = pd.DataFrame(iris.copy())
21 | iris_clust['scores'] = scores_clust
22 | iris_clust['labels'] = db.labels_
23 | 
24 | iris['scores'] = scores_noclust
25 | 
26 | 
27 | fig = plt.figure(figsize=(7, 7))
28 | ax = fig.add_subplot(111, projection='3d')
29 | ax.scatter(iris['Sepal.Width'], iris['Petal.Width'], iris['Sepal.Length'],
30 | c=iris['scores'], cmap='seismic', s=50)
31 | ax.set_xlabel('Sepal.Width')
32 | ax.set_ylabel('Petal.Width')
33 | ax.set_zlabel('Sepal.Length')
34 | plt.show()
35 | plt.clf()
36 | plt.cla()
37 | plt.close()
38 | 
39 | fig = plt.figure(figsize=(7, 7))
40 | ax = fig.add_subplot(111, projection='3d')
41 | ax.scatter(iris_clust['Sepal.Width'], iris_clust['Petal.Width'], iris_clust['Sepal.Length'],
42 | c=iris_clust['scores'], cmap='seismic', s=50)
43 | ax.set_xlabel('Sepal.Width')
44 | ax.set_ylabel('Petal.Width')
45 | ax.set_zlabel('Sepal.Length')
46 | plt.show()
47 | plt.clf()
48 | plt.cla()
49 | plt.close()
50 | 
51 | fig = plt.figure(figsize=(7, 7))
52 | ax = fig.add_subplot(111, projection='3d')
53 | ax.scatter(iris_clust['Sepal.Width'], iris_clust['Petal.Width'], iris_clust['Sepal.Length'],
54 | c=iris_clust['labels'], cmap='Set1', s=50)
55 | ax.set_xlabel('Sepal.Width')
56 | ax.set_ylabel('Petal.Width')
57 | ax.set_zlabel('Sepal.Length')
58 | plt.show()
59 | plt.clf()
60 | plt.cla()
61 | plt.close()
62 | 


--------------------------------------------------------------------------------
/examples/iris_dist_grid.py:
--------------------------------------------------------------------------------
 1 | from PyNomaly import loop
 2 | import pandas as pd
 3 | from pydataset import data
 4 | from sklearn.neighbors import NearestNeighbors
 5 | import matplotlib.pyplot as plt
 6 | from mpl_toolkits.mplot3d import Axes3D
 7 | 
 8 | 
 9 | iris = pd.DataFrame(data('iris'))
10 | iris = pd.DataFrame(iris.drop('Species', 1))
11 | 
12 | distance_metrics = [
13 |     'braycurtis',
14 |     'canberra',
15 |     'cityblock',
16 |     'chebyshev',
17 |     'cosine',
18 |     'euclidean',
19 |     'hamming',
20 |     'l1',
21 |     'manhattan'
22 | ]
23 | 
24 | fig = plt.figure(figsize=(17, 17))
25 | 
26 | for i in range(1, 10):
27 | 
28 |     neigh = NearestNeighbors(n_neighbors=10, metric=distance_metrics[i-1])
29 |     neigh.fit(iris)
30 |     d, idx = neigh.kneighbors(iris, return_distance=True)
31 | 
32 |     m = loop.LocalOutlierProbability(distance_matrix=d,
33 |                                      neighbor_matrix=idx).fit()
34 |     iris['scores'] = m.local_outlier_probabilities
35 | 
36 |     ax = fig.add_subplot(3, 3, i, projection='3d')
37 |     plt.title(distance_metrics[i-1], loc='left', fontsize=18)
38 |     ax.scatter(iris['Sepal.Width'], iris['Petal.Width'], iris['Sepal.Length'],
39 |                c=iris['scores'], cmap='seismic', s=50)
40 |     ax.set_xlabel('Sepal.Width')
41 |     ax.set_ylabel('Petal.Width')
42 |     ax.set_zlabel('Sepal.Length')
43 | 
44 | 
45 | plt.show()
46 | plt.clf()
47 | plt.cla()
48 | plt.close()
49 | 
50 | 


--------------------------------------------------------------------------------
/examples/multiple_gaussian_2d.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib.pyplot as plt
 3 | from PyNomaly import loop
 4 | import pandas as pd
 5 | 
 6 | # import the multiple gaussian data #
 7 | df = pd.read_csv('../data/multiple-gaussian-2d-data-only.csv')
 8 | print(df)
 9 | 
10 | # fit LoOP according to the original settings outlined in the paper #
11 | m = loop.LocalOutlierProbability(df[['x', 'y']], n_neighbors=20, extent=3).fit()
12 | scores = m.local_outlier_probabilities
13 | print(scores)
14 | 
15 | # plot the results #
16 | # base 3 width, then set as multiple
17 | threshold = 0.1
18 | color = np.where(scores > threshold, "white", "black")
19 | label_mask = np.where(scores > threshold)
20 | area = (20 * scores) ** 2
21 | plt.scatter(df['x'], df['y'], c=color, s=area.astype(float), edgecolor='red', linewidth=1)
22 | plt.scatter(df['x'], df['y'], c='black', s=3)
23 | for i in range(len(scores)):
24 |     if scores[i] > threshold:
25 |         plt.text(df['x'].loc[i] * (1 + 0.01), df['y'].loc[i] * (1 + 0.01), round(scores[i], 2), fontsize=8)
26 | 
27 | plt.show()
28 | 
29 | 


--------------------------------------------------------------------------------
/examples/numba_speed_diff.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from PyNomaly import loop
 3 | import time
 4 | 
 5 | # generate a large set of data
 6 | data = np.ones(shape=(10000, 4))
 7 | 
 8 | # first time the process without Numba
 9 | # use the progress bar to track progress
10 | 
11 | t1 = time.time()
12 | scores_numpy = loop.LocalOutlierProbability(
13 |     data,
14 |     n_neighbors=3,
15 |     use_numba=False,
16 |     progress_bar=True
17 | ).fit().local_outlier_probabilities
18 | t2 = time.time()
19 | seconds_no_numba = t2 - t1
20 | print("\nComputation took " + str(seconds_no_numba) + " seconds without Numba JIT.")
21 | 
22 | t3 = time.time()
23 | scores_numba = loop.LocalOutlierProbability(
24 |     data,
25 |     n_neighbors=3,
26 |     use_numba=True,
27 |     progress_bar=True
28 | ).fit().local_outlier_probabilities
29 | t4 = time.time()
30 | seconds_numba = t4 - t3
31 | print("\nComputation took " + str(seconds_numba) + " seconds with Numba JIT.")
32 | 


--------------------------------------------------------------------------------
/examples/numpy_array.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/examples/numpy_array.py


--------------------------------------------------------------------------------
/examples/stream.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from PyNomaly import loop
 3 | import pandas as pd
 4 | from pydataset import data
 5 | import matplotlib.pyplot as plt
 6 | from mpl_toolkits.mplot3d import Axes3D
 7 | 
 8 | 
 9 | iris = pd.DataFrame(data('iris'))
10 | iris = pd.DataFrame(iris.drop('Species', 1))
11 | 
12 | iris_train = iris.iloc[:, 0:4].head(120)
13 | iris_test = iris.iloc[:, 0:4].tail(30)
14 | 
15 | m = loop.LocalOutlierProbability(iris).fit()
16 | scores_noclust = m.local_outlier_probabilities
17 | iris['scores'] = scores_noclust
18 | 
19 | m_train = loop.LocalOutlierProbability(iris_train, n_neighbors=10)
20 | m_train.fit()
21 | iris_train_scores = m_train.local_outlier_probabilities
22 | 
23 | iris_test_scores = []
24 | for index, row in iris_test.iterrows():
25 |     array = np.array([row['Sepal.Length'], row['Sepal.Width'], row['Petal.Length'], row['Petal.Width']])
26 |     iris_test_scores.append(m_train.stream(array))
27 | iris_test_scores = np.array(iris_test_scores)
28 | 
29 | iris['stream_scores'] = np.hstack((iris_train_scores, iris_test_scores))
30 | # iris['scores'] from earlier example
31 | rmse = np.sqrt(((iris['scores'] - iris['stream_scores']) ** 2).mean(axis=None))
32 | print(rmse)
33 | 
34 | fig = plt.figure(figsize=(7, 7))
35 | ax = fig.add_subplot(111, projection='3d')
36 | ax.scatter(iris['Sepal.Width'], iris['Petal.Width'], iris['Sepal.Length'],
37 | c=iris['stream_scores'], cmap='seismic', s=50)
38 | ax.set_xlabel('Sepal.Width')
39 | ax.set_ylabel('Petal.Width')
40 | ax.set_zlabel('Sepal.Length')
41 | plt.show()
42 | plt.clf()
43 | plt.cla()
44 | plt.close()
45 | 


--------------------------------------------------------------------------------
/images/animation/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/1.png


--------------------------------------------------------------------------------
/images/animation/10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/10.png


--------------------------------------------------------------------------------
/images/animation/11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/11.png


--------------------------------------------------------------------------------
/images/animation/12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/12.png


--------------------------------------------------------------------------------
/images/animation/13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/13.png


--------------------------------------------------------------------------------
/images/animation/14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/14.png


--------------------------------------------------------------------------------
/images/animation/15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/15.png


--------------------------------------------------------------------------------
/images/animation/16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/16.png


--------------------------------------------------------------------------------
/images/animation/17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/17.png


--------------------------------------------------------------------------------
/images/animation/18.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/18.png


--------------------------------------------------------------------------------
/images/animation/19.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/19.png


--------------------------------------------------------------------------------
/images/animation/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/2.png


--------------------------------------------------------------------------------
/images/animation/20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/20.png


--------------------------------------------------------------------------------
/images/animation/21.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/21.png


--------------------------------------------------------------------------------
/images/animation/22.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/22.png


--------------------------------------------------------------------------------
/images/animation/23.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/23.png


--------------------------------------------------------------------------------
/images/animation/24.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/24.png


--------------------------------------------------------------------------------
/images/animation/25.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/25.png


--------------------------------------------------------------------------------
/images/animation/26.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/26.png


--------------------------------------------------------------------------------
/images/animation/27.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/27.png


--------------------------------------------------------------------------------
/images/animation/28.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/28.png


--------------------------------------------------------------------------------
/images/animation/29.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/29.png


--------------------------------------------------------------------------------
/images/animation/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/3.png


--------------------------------------------------------------------------------
/images/animation/30.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/30.png


--------------------------------------------------------------------------------
/images/animation/31.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/31.png


--------------------------------------------------------------------------------
/images/animation/32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/32.png


--------------------------------------------------------------------------------
/images/animation/33.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/33.png


--------------------------------------------------------------------------------
/images/animation/34.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/34.png


--------------------------------------------------------------------------------
/images/animation/35.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/35.png


--------------------------------------------------------------------------------
/images/animation/36.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/36.png


--------------------------------------------------------------------------------
/images/animation/37.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/37.png


--------------------------------------------------------------------------------
/images/animation/38.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/38.png


--------------------------------------------------------------------------------
/images/animation/39.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/39.png


--------------------------------------------------------------------------------
/images/animation/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/4.png


--------------------------------------------------------------------------------
/images/animation/40.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/40.png


--------------------------------------------------------------------------------
/images/animation/41.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/41.png


--------------------------------------------------------------------------------
/images/animation/42.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/42.png


--------------------------------------------------------------------------------
/images/animation/43.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/43.png


--------------------------------------------------------------------------------
/images/animation/44.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/44.png


--------------------------------------------------------------------------------
/images/animation/45.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/45.png


--------------------------------------------------------------------------------
/images/animation/46.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/46.png


--------------------------------------------------------------------------------
/images/animation/47.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/47.png


--------------------------------------------------------------------------------
/images/animation/48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/48.png


--------------------------------------------------------------------------------
/images/animation/49.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/49.png


--------------------------------------------------------------------------------
/images/animation/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/5.png


--------------------------------------------------------------------------------
/images/animation/50.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/50.png


--------------------------------------------------------------------------------
/images/animation/51.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/51.png


--------------------------------------------------------------------------------
/images/animation/52.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/52.png


--------------------------------------------------------------------------------
/images/animation/53.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/53.png


--------------------------------------------------------------------------------
/images/animation/54.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/54.png


--------------------------------------------------------------------------------
/images/animation/55.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/55.png


--------------------------------------------------------------------------------
/images/animation/56.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/56.png


--------------------------------------------------------------------------------
/images/animation/57.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/57.png


--------------------------------------------------------------------------------
/images/animation/58.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/58.png


--------------------------------------------------------------------------------
/images/animation/59.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/59.png


--------------------------------------------------------------------------------
/images/animation/6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/6.png


--------------------------------------------------------------------------------
/images/animation/60.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/60.png


--------------------------------------------------------------------------------
/images/animation/7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/7.png


--------------------------------------------------------------------------------
/images/animation/8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/8.png


--------------------------------------------------------------------------------
/images/animation/9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/animation/9.png


--------------------------------------------------------------------------------
/images/cluster_assignments.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/cluster_assignments.png


--------------------------------------------------------------------------------
/images/logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/logo.jpg


--------------------------------------------------------------------------------
/images/logo100.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/logo100.jpg


--------------------------------------------------------------------------------
/images/logo150.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/logo150.jpg


--------------------------------------------------------------------------------
/images/logo200.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/logo200.jpg


--------------------------------------------------------------------------------
/images/logo300.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/logo300.jpg


--------------------------------------------------------------------------------
/images/scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/scores.png


--------------------------------------------------------------------------------
/images/scores_by_distance_metric.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/scores_by_distance_metric.png


--------------------------------------------------------------------------------
/images/scores_clust.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/scores_clust.png


--------------------------------------------------------------------------------
/images/scores_stream.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/scores_stream.gif


--------------------------------------------------------------------------------
/images/scores_stream.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/images/scores_stream.png


--------------------------------------------------------------------------------
/paper/codemeta.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
 3 |   "@type": "Code",
 4 |   "author": [
 5 |     {
 6 |       "@id": "http://orcid.org/0000-0002-5279-4143",
 7 |       "@type": "Person",
 8 |       "email": "vconstan@jpl.caltech.edu",
 9 |       "name": "Valentino Constantinou",
10 |       "affiliation": "NASA Jet Propulsion Laboratory"
11 |     }
12 |   ],
13 |   "identifier": "",
14 |   "codeRepository": "https://www.github.com/vc1492a/PyNomaly",
15 |   "datePublished": "2018-05-07",
16 |   "dateModified": "2018-05-07",
17 |   "dateCreated": "2018-05-07",
18 |   "description": "Anomaly detection using Local Outlier Probabilities (LoOP).",
19 |   "keywords": "machine learning, unsupervised learning, outlier detection, anomaly detection, nearest neighbors, statistics, probability",
20 |   "license": "Apache 2.0",
21 |   "title": "PyNomaly",
22 |   "version": "v0.2.0"
23 | }


--------------------------------------------------------------------------------
/paper/paper.bib:
--------------------------------------------------------------------------------
 1 | @inproceedings{Breunig,
 2 |  author = {Breunig, Markus M. and Kriegel, Hans-Peter and Ng, Raymond T. and Sander, J\"{o}rg},
 3 |  title = {LOF: Identifying Density-based Local Outliers},
 4 |  booktitle = {Proceedings of the 2000 ACM SIGMOD International Conference on Management of Data},
 5 |  series = {SIGMOD '00},
 6 |  year = {2000},
 7 |  isbn = {1-58113-217-4},
 8 |  location = {Dallas, Texas, USA},
 9 |  pages = {93--104},
10 |  numpages = {12},
11 |  url = {http://doi.acm.org/10.1145/342009.335388},
12 |  doi = {10.1145/342009.335388},
13 |  acmid = {335388},
14 |  publisher = {ACM},
15 |  address = {New York, NY, USA},
16 |  keywords = {database mining, outlier detection},
17 | }
18 | 
19 | @inproceedings{Kriegel,
20 |  author = {Kriegel, Hans-Peter and Kr\"{o}ger, Peer and Schubert, Erich and Zimek, Arthur},
21 |  title = {LoOP: Local Outlier Probabilities},
22 |  booktitle = {Proceedings of the 18th ACM Conference on Information and Knowledge Management},
23 |  series = {CIKM '09},
24 |  year = {2009},
25 |  isbn = {978-1-60558-512-3},
26 |  location = {Hong Kong, China},
27 |  pages = {1649--1652},
28 |  numpages = {4},
29 |  url = {http://doi.acm.org/10.1145/1645953.1646195},
30 |  doi = {10.1145/1645953.1646195},
31 |  acmid = {1646195},
32 |  publisher = {ACM},
33 |  address = {New York, NY, USA},
34 |  keywords = {outlier detection},
35 | }
36 | 
37 | @article{Hamlet,
38 |  doi= {10.1080/23742917.2016.1226651},
39 |  author = {Connor Hamlet and Jeremy Straub and Matthew Russell and Scott Kerlin},
40 |  title = {An incremental and approximate local outlier probability algorithm for intrusion detection and its evaluation},
41 |  journal = {Journal of Cyber Security Technology},
42 |  volume = {1},
43 |  number = {2},
44 |  pages = {75-87},
45 |  year  = {2017},
46 |  publisher = {Taylor & Francis},
47 |  doi = {10.1080/23742917.2016.1226651},
48 |  URL = {https://doi.org/10.1080/23742917.2016.1226651},
49 |  eprint = {https://doi.org/10.1080/23742917.2016.1226651}
50 | }


--------------------------------------------------------------------------------
/paper/paper.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'PyNomaly: Anomaly detection using Local Outlier Probabilities (LoOP).'
 3 | tags:
 4 |   - outlier detection
 5 |   - anomaly detection
 6 |   - probability
 7 |   - nearest neighbors
 8 |   - unsupervised learning
 9 |   - machine learning
10 |   - statistics
11 | authors:
12 |  - name: Valentino Constantinou
13 |    orcid: 0000-0002-5279-4143
14 |    affiliation: 1
15 | affiliations:
16 |  - name: NASA Jet Propulsion Laboratory
17 |    index: 1
18 | date: 7 May 2018
19 | bibliography: paper.bib
20 | ---
21 | 
22 | # Summary
23 | 
24 | ``PyNomaly`` is a Python 3 implementation of LoOP (Local Outlier
25 | Probabilities) [@Kriegel]. LoOP is a local density based outlier detection
26 | method by Kriegel, Kröger, Schubert, and Zimek which provides
27 | outlier scores in the range of [0,1] that are directly
28 | interpretable as the probability of a sample being an outlier.
29 | ``PyNomaly`` also implements a modified approach to LoOP [@Hamlet], which may be used for applications involving
30 | streaming data or where rapid calculations may be necessary.
31 | 
32 | The outlier score of each sample is called the Local Outlier
33 | Probability. It measures the local deviation of density of a
34 | given sample with respect to its neighbors as Local Outlier
35 | Factor (LOF) [@Breunig], but provides normalized outlier scores in the
36 | range [0,1]. These outlier scores are directly interpretable
37 | as a probability of an object being an outlier. Since Local
38 | Outlier Probabilities provides scores in the range [0,1],
39 | practitioners are free to interpret the results according to
40 | the application.
41 | 
42 | Like LOF, it is local in that the anomaly score depends on
43 | how isolated the sample is with respect to the surrounding
44 | neighborhood. Locality is given by k-nearest neighbors,
45 | whose distance is used to estimate the local density.
46 | By comparing the local density of a sample to the local
47 | densities of its neighbors, one can identify samples that
48 | lie in regions of lower density compared to their neighbors
49 | and thus identify samples that may be outliers according to
50 | their Local Outlier Probability.
51 | 
52 | ``PyNomaly`` includes an optional _cluster_labels_ parameter.
53 | This is useful in cases where regions of varying density
54 | occur within the same set of data. When using _cluster_labels_,
55 | the Local Outlier Probability of a sample is calculated with
56 | respect to its cluster assignment.
57 | 
58 | ## Research
59 | 
60 | PyNomaly is currently being used in the following research:
61 | 
62 | - Y. Zhao and M.K. Hryniewicki, "XGBOD: Improving Supervised
63 | Outlier Detection with Unsupervised Representation Learning,"
64 | International Joint Conference on Neural Networks (IJCNN),
65 | IEEE, 2018.
66 | 
67 | ## Acknowledgements
68 | 
69 | The authors recognize the support of Kyle Hundman and Ian Colwell.
70 | 
71 | # References


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
  1 | # PyNomaly
  2 | 
  3 | PyNomaly is a Python 3 implementation of LoOP (Local Outlier Probabilities).
  4 | LoOP is a local density based outlier detection method by Kriegel, Kröger, Schubert, and Zimek which provides outlier
  5 | scores in the range of [0,1] that are directly interpretable as the probability of a sample being an outlier. 
  6 | 
  7 | PyNomaly is a core library of [deepchecks](https://github.com/deepchecks/deepchecks) and [pysad](https://github.com/selimfirat/pysad). 
  8 | 
  9 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
 10 | [![PyPi](https://img.shields.io/badge/pypi-0.3.4-blue.svg)](https://pypi.python.org/pypi/PyNomaly/0.3.4)
 11 | [![Downloads](https://img.shields.io/pypi/dm/PyNomaly.svg?logoColor=blue)](https://pypistats.org/packages/pynomaly)
 12 | ![Tests](https://github.com/vc1492a/PyNomaly/actions/workflows/tests.yml/badge.svg)
 13 | [![Coverage Status](https://coveralls.io/repos/github/vc1492a/PyNomaly/badge.svg?branch=main)](https://coveralls.io/github/vc1492a/PyNomaly?branch=main)
 14 | [![JOSS](http://joss.theoj.org/papers/f4d2cfe680768526da7c1f6a2c103266/status.svg)](http://joss.theoj.org/papers/f4d2cfe680768526da7c1f6a2c103266)
 15 | 
 16 | The outlier score of each sample is called the Local Outlier Probability.
 17 | It measures the local deviation of density of a given sample with
 18 | respect to its neighbors as Local Outlier Factor (LOF), but provides normalized
 19 | outlier scores in the range [0,1]. These outlier scores are directly interpretable
 20 | as a probability of an object being an outlier. Since Local Outlier Probabilities provides scores in the
 21 | range [0,1], practitioners are free to interpret the results according to the application.
 22 | 
 23 | Like LOF, it is local in that the anomaly score depends on how isolated the sample is
 24 | with respect to the surrounding neighborhood. Locality is given by k-nearest neighbors,
 25 | whose distance is used to estimate the local density. By comparing the local density of a sample to the
 26 | local densities of its neighbors, one can identify samples that lie in regions of lower
 27 | density compared to their neighbors and thus identify samples that may be outliers according to their Local
 28 | Outlier Probability.
 29 | 
 30 | The authors' 2009 paper detailing LoOP's theory, formulation, and application is provided by
 31 | Ludwig-Maximilians University Munich - Institute for Informatics;
 32 | [LoOP: Local Outlier Probabilities](http://www.dbs.ifi.lmu.de/Publikationen/Papers/LoOP1649.pdf).
 33 | 
 34 | ## Implementation
 35 | 
 36 | This Python 3 implementation uses Numpy and the formulas outlined in
 37 | [LoOP: Local Outlier Probabilities](http://www.dbs.ifi.lmu.de/Publikationen/Papers/LoOP1649.pdf)
 38 | to calculate the Local Outlier Probability of each sample.
 39 | 
 40 | ## Dependencies
 41 | - Python 3.6 - 3.13
 42 | - numpy >= 1.16.3
 43 | - python-utils >= 2.3.0
 44 | - (optional) numba >= 0.45.1
 45 | 
 46 | Numba just-in-time (JIT) compiles the function with calculates the Euclidean 
 47 | distance between observations, providing a reduction in computation time 
 48 | (significantly when a large number of observations are scored). Numba is not a 
 49 | requirement and PyNomaly may still be used solely with numpy if desired
 50 | (details below). 
 51 | 
 52 | ## Quick Start
 53 | 
 54 | First install the package from the Python Package Index:
 55 | 
 56 | ```shell
 57 | pip install PyNomaly # or pip3 install ... if you're using both Python 3 and 2.
 58 | ```
 59 | 
 60 | Alternatively, you can use conda to install the package from conda-forge:
 61 | 
 62 | ```shell
 63 | conda install conda-forge::pynomaly
 64 | ```
 65 | Then you can do something like this:
 66 | 
 67 | ```python
 68 | from PyNomaly import loop
 69 | m = loop.LocalOutlierProbability(data).fit()
 70 | scores = m.local_outlier_probabilities
 71 | print(scores)
 72 | ```
 73 | where *data* is a NxM (N rows, M columns; 2-dimensional) set of data as either a Pandas DataFrame or Numpy array.
 74 | 
 75 | LocalOutlierProbability sets the *extent* (in integer in value of 1, 2, or 3) and *n_neighbors* (must be greater than 0) parameters with the default
 76 | values of 3 and 10, respectively. You're free to set these parameters on your own as below:
 77 | 
 78 | ```python
 79 | from PyNomaly import loop
 80 | m = loop.LocalOutlierProbability(data, extent=2, n_neighbors=20).fit()
 81 | scores = m.local_outlier_probabilities
 82 | print(scores)
 83 | ```
 84 | 
 85 | This implementation of LoOP also includes an optional *cluster_labels* parameter. This is useful in cases where regions
 86 | of varying density occur within the same set of data. When using *cluster_labels*, the Local Outlier Probability of a
 87 | sample is calculated with respect to its cluster assignment.
 88 | 
 89 | ```python
 90 | from PyNomaly import loop
 91 | from sklearn.cluster import DBSCAN
 92 | db = DBSCAN(eps=0.6, min_samples=50).fit(data)
 93 | m = loop.LocalOutlierProbability(data, extent=2, n_neighbors=20, cluster_labels=list(db.labels_)).fit()
 94 | scores = m.local_outlier_probabilities
 95 | print(scores)
 96 | ```
 97 | 
 98 | **NOTE**: Unless your data is all the same scale, it may be a good idea to normalize your data with z-scores or another
 99 | normalization scheme prior to using LoOP, especially when working with multiple dimensions of varying scale.
100 | Users must also appropriately handle missing values prior to using LoOP, as LoOP does not support Pandas
101 | DataFrames or Numpy arrays with missing values.
102 | 
103 | ### Utilizing Numba and Progress Bars
104 | 
105 | It may be helpful to use just-in-time (JIT) compilation in the cases where a lot of 
106 | observations are scored. Numba, a JIT compiler for Python, may be used 
107 | with PyNomaly by setting `use_numba=True`:
108 | 
109 | ```python
110 | from PyNomaly import loop
111 | m = loop.LocalOutlierProbability(data, extent=2, n_neighbors=20, use_numba=True, progress_bar=True).fit()
112 | scores = m.local_outlier_probabilities
113 | print(scores)
114 | ```
115 | 
116 | Numba must be installed if the above to use JIT compilation and improve the 
117 | speed of multiple calls to `LocalOutlierProbability()`, and PyNomaly has been 
118 | tested with Numba version 0.45.1. An example of the speed difference that can 
119 | be realized with using Numba is avaialble in `examples/numba_speed_diff.py`. 
120 | 
121 | You may also choose to print progress bars _with our without_ the use of numba 
122 | by passing `progress_bar=True` to the `LocalOutlierProbability()` method as above.
123 | 
124 | ### Choosing Parameters
125 | 
126 | The *extent* parameter controls the sensitivity of the scoring in practice. The parameter corresponds to
127 | the statistical notion of an outlier defined as an object deviating more than a given lambda (*extent*)
128 | times the standard deviation from the mean. A value of 2 implies outliers deviating more than 2 standard deviations
129 | from the mean, and corresponds to 95.0% in the empirical "three-sigma" rule. The appropriate parameter should be selected
130 | according to the level of sensitivity needed for the input data and application. The question to ask is whether it is
131 | more reasonable to assume outliers in your data are 1, 2, or 3 standard deviations from the mean, and select the value
132 | likely most appropriate to your data and application.
133 | 
134 | The *n_neighbors* parameter defines the number of neighbors to consider about
135 | each sample (neighborhood size) when determining its Local Outlier Probability with respect to the density
136 | of the sample's defined neighborhood. The idea number of neighbors to consider is dependent on the
137 | input data. However, the notion of an outlier implies it would be considered as such regardless of the number
138 | of neighbors considered. One potential approach is to use a number of different neighborhood sizes and average
139 | the results for reach observation. Those observations which rank highly with varying neighborhood sizes are
140 | more than likely outliers. This is one potential approach of selecting the neighborhood size. Another is to
141 | select a value proportional to the number of observations, such an odd-valued integer close to the square root
142 | of the number of observations in your data (*sqrt(n_observations*).
143 | 
144 | ## Iris Data Example
145 | 
146 | We'll be using the well-known Iris dataset to show LoOP's capabilities. There's a few things you'll need for this
147 | example beyond the standard prerequisites listed above:
148 | - matplotlib 2.0.0 or greater
149 | - PyDataset 0.2.0 or greater
150 | - scikit-learn 0.18.1 or greater
151 | 
152 | First, let's import the packages and libraries we will need for this example.
153 | 
154 | ```python
155 | from PyNomaly import loop
156 | import pandas as pd
157 | from pydataset import data
158 | import numpy as np
159 | from sklearn.cluster import DBSCAN
160 | import matplotlib.pyplot as plt
161 | from mpl_toolkits.mplot3d import Axes3D
162 | ```
163 | 
164 | Now let's create two sets of Iris data for scoring; one with clustering and the other without.
165 | 
166 | ```python
167 | # import the data and remove any non-numeric columns
168 | iris = pd.DataFrame(data('iris').drop(columns=['Species']))
169 | ```
170 | 
171 | Next, let's cluster the data using DBSCAN and generate two sets of scores. On both cases, we will use the default
172 | values for both *extent* (0.997) and *n_neighbors* (10).
173 | 
174 | ```python
175 | db = DBSCAN(eps=0.9, min_samples=10).fit(iris)
176 | m = loop.LocalOutlierProbability(iris).fit()
177 | scores_noclust = m.local_outlier_probabilities
178 | m_clust = loop.LocalOutlierProbability(iris, cluster_labels=list(db.labels_)).fit()
179 | scores_clust = m_clust.local_outlier_probabilities
180 | ```
181 | 
182 | Organize the data into two separate Pandas DataFrames.
183 | 
184 | ```python
185 | iris_clust = pd.DataFrame(iris.copy())
186 | iris_clust['scores'] = scores_clust
187 | iris_clust['labels'] = db.labels_
188 | iris['scores'] = scores_noclust
189 | ```
190 | 
191 | And finally, let's visualize the scores provided by LoOP in both cases (with and without clustering).
192 | 
193 | ```python
194 | fig = plt.figure(figsize=(7, 7))
195 | ax = fig.add_subplot(111, projection='3d')
196 | ax.scatter(iris['Sepal.Width'], iris['Petal.Width'], iris['Sepal.Length'],
197 | c=iris['scores'], cmap='seismic', s=50)
198 | ax.set_xlabel('Sepal.Width')
199 | ax.set_ylabel('Petal.Width')
200 | ax.set_zlabel('Sepal.Length')
201 | plt.show()
202 | plt.clf()
203 | plt.cla()
204 | plt.close()
205 | 
206 | fig = plt.figure(figsize=(7, 7))
207 | ax = fig.add_subplot(111, projection='3d')
208 | ax.scatter(iris_clust['Sepal.Width'], iris_clust['Petal.Width'], iris_clust['Sepal.Length'],
209 | c=iris_clust['scores'], cmap='seismic', s=50)
210 | ax.set_xlabel('Sepal.Width')
211 | ax.set_ylabel('Petal.Width')
212 | ax.set_zlabel('Sepal.Length')
213 | plt.show()
214 | plt.clf()
215 | plt.cla()
216 | plt.close()
217 | 
218 | fig = plt.figure(figsize=(7, 7))
219 | ax = fig.add_subplot(111, projection='3d')
220 | ax.scatter(iris_clust['Sepal.Width'], iris_clust['Petal.Width'], iris_clust['Sepal.Length'],
221 | c=iris_clust['labels'], cmap='Set1', s=50)
222 | ax.set_xlabel('Sepal.Width')
223 | ax.set_ylabel('Petal.Width')
224 | ax.set_zlabel('Sepal.Length')
225 | plt.show()
226 | plt.clf()
227 | plt.cla()
228 | plt.close()
229 | ```
230 | 
231 | Your results should look like the following:
232 | 
233 | **LoOP Scores without Clustering**
234 | ![LoOP Scores without Clustering](https://github.com/vc1492a/PyNomaly/blob/main/images/scores.png)
235 | 
236 | **LoOP Scores with Clustering**
237 | ![LoOP Scores with Clustering](https://github.com/vc1492a/PyNomaly/blob/main/images/scores_clust.png)
238 | 
239 | **DBSCAN Cluster Assignments**
240 | ![DBSCAN Cluster Assignments](https://github.com/vc1492a/PyNomaly/blob/main/images/cluster_assignments.png)
241 | 
242 | 
243 | Note the differences between using LocalOutlierProbability with and without clustering. In the example without clustering, samples are
244 | scored according to the distribution of the entire data set. In the example with clustering, each sample is scored
245 | according to the distribution of each cluster. Which approach is suitable depends on the use case.
246 | 
247 | **NOTE**: Data was not normalized in this example, but it's probably a good idea to do so in practice.
248 | 
249 | ## Using Numpy
250 | 
251 | When using numpy, make sure to use 2-dimensional arrays in tabular format:
252 | 
253 | ```python
254 | data = np.array([
255 |     [43.3, 30.2, 90.2],
256 |     [62.9, 58.3, 49.3],
257 |     [55.2, 56.2, 134.2],
258 |     [48.6, 80.3, 50.3],
259 |     [67.1, 60.0, 55.9],
260 |     [421.5, 90.3, 50.0]
261 | ])
262 | 
263 | scores = loop.LocalOutlierProbability(data, n_neighbors=3).fit().local_outlier_probabilities
264 | print(scores)
265 | 
266 | ```
267 | 
268 | The shape of the input array shape corresponds to the rows (observations) and columns (features) in the data:
269 | 
270 | ```python
271 | print(data.shape)
272 | # (6,3), which matches number of observations and features in the above example
273 | ```
274 | 
275 | Similar to the above:
276 | 
277 | ```python
278 | data = np.random.rand(100, 5)
279 | scores = loop.LocalOutlierProbability(data).fit().local_outlier_probabilities
280 | print(scores)
281 | ```
282 | 
283 | ## Specifying a Distance Matrix
284 | 
285 | PyNomaly provides the ability to specify a distance matrix so that any
286 | distance metric can be used (a neighbor index matrix must also be provided).
287 | This can be useful when wanting to use a distance other than the euclidean.
288 | 
289 | Note that in order to maintain alignment with the LoOP definition of closest neighbors, 
290 | an additional neighbor is added when using [scikit-learn's NearestNeighbors](https://scikit-learn.org/1.5/modules/neighbors.html) since `NearestNeighbors` 
291 | includes the point itself when calculating the cloest neighbors (whereas the LoOP method does not include distances to point itself). 
292 | 
293 | ```python
294 | import numpy as np
295 | from sklearn.neighbors import NearestNeighbors
296 | 
297 | data = np.array([
298 |     [43.3, 30.2, 90.2],
299 |     [62.9, 58.3, 49.3],
300 |     [55.2, 56.2, 134.2],
301 |     [48.6, 80.3, 50.3],
302 |     [67.1, 60.0, 55.9],
303 |     [421.5, 90.3, 50.0]
304 | ])
305 | 
306 | # Generate distance and neighbor matrices
307 | n_neighbors = 3 # the number of neighbors according to the LoOP definition 
308 | neigh = NearestNeighbors(n_neighbors=n_neighbors+1, metric='hamming')
309 | neigh.fit(data)
310 | d, idx = neigh.kneighbors(data, return_distance=True)
311 | 
312 | # Remove self-distances - you MUST do this to preserve the same results as intended by the definition of LoOP
313 | indices = np.delete(indices, 0, 1)
314 | distances = np.delete(distances, 0, 1)
315 | 
316 | # Fit and return scores
317 | m = loop.LocalOutlierProbability(distance_matrix=d, neighbor_matrix=idx, n_neighbors=n_neighbors+1).fit()
318 | scores = m.local_outlier_probabilities
319 | ```
320 | 
321 | The below visualization shows the results by a few known distance metrics:
322 | 
323 | **LoOP Scores by Distance Metric**
324 | ![DBSCAN Cluster Assignments](https://github.com/vc1492a/PyNomaly/blob/main/images/scores_by_distance_metric.png)
325 | 
326 | ## Streaming Data
327 | 
328 | PyNomaly also contains an implementation of Hamlet et. al.'s modifications
329 | to the original LoOP approach [[4](http://www.tandfonline.com/doi/abs/10.1080/23742917.2016.1226651?journalCode=tsec20)],
330 | which may be used for applications involving streaming data or where rapid calculations may be necessary.
331 | First, the standard LoOP algorithm is used on "training" data, with certain attributes of the fitted data
332 | stored from the original LoOP approach. Then, as new points are considered, these fitted attributes are
333 | called when calculating the score of the incoming streaming data due to the use of averages from the initial
334 | fit, such as the use of a global value for the expected value of the probabilistic distance. Despite the potential
335 | for increased error when compared to the standard approach, it may be effective in streaming applications where
336 | refitting the standard approach over all points could be computationally expensive.
337 | 
338 | While the iris dataset is not streaming data, we'll use it in this example by taking the first 120 observations
339 | as training data and take the remaining 30 observations as a stream, scoring each observation
340 | individually.
341 | 
342 | Split the data.
343 | ```python
344 | iris = iris.sample(frac=1) # shuffle data
345 | iris_train = iris.iloc[:, 0:4].head(120)
346 | iris_test = iris.iloc[:, 0:4].tail(30)
347 | ```
348 | 
349 | Fit to each set.
350 | ```python
351 | m = loop.LocalOutlierProbability(iris).fit()
352 | scores_noclust = m.local_outlier_probabilities
353 | iris['scores'] = scores_noclust
354 | 
355 | m_train = loop.LocalOutlierProbability(iris_train, n_neighbors=10)
356 | m_train.fit()
357 | iris_train_scores = m_train.local_outlier_probabilities
358 | ```
359 | 
360 | ```python
361 | iris_test_scores = []
362 | for index, row in iris_test.iterrows():
363 |     array = np.array([row['Sepal.Length'], row['Sepal.Width'], row['Petal.Length'], row['Petal.Width']])
364 |     iris_test_scores.append(m_train.stream(array))
365 | iris_test_scores = np.array(iris_test_scores)
366 | ```
367 | 
368 | Concatenate the scores and assess.
369 | 
370 | ```python
371 | iris['stream_scores'] = np.hstack((iris_train_scores, iris_test_scores))
372 | # iris['scores'] from earlier example
373 | rmse = np.sqrt(((iris['scores'] - iris['stream_scores']) ** 2).mean(axis=None))
374 | print(rmse)
375 | ```
376 | 
377 | The root mean squared error (RMSE) between the two approaches is approximately 0.199 (your scores will vary depending on the data and specification).
378 | The plot below shows the scores from the stream approach.
379 | 
380 | ```python
381 | fig = plt.figure(figsize=(7, 7))
382 | ax = fig.add_subplot(111, projection='3d')
383 | ax.scatter(iris['Sepal.Width'], iris['Petal.Width'], iris['Sepal.Length'],
384 | c=iris['stream_scores'], cmap='seismic', s=50)
385 | ax.set_xlabel('Sepal.Width')
386 | ax.set_ylabel('Petal.Width')
387 | ax.set_zlabel('Sepal.Length')
388 | plt.show()
389 | plt.clf()
390 | plt.cla()
391 | plt.close()
392 | ```
393 | 
394 | **LoOP Scores using Stream Approach with n=10**
395 | ![LoOP Scores using Stream Approach with n=10](https://github.com/vc1492a/PyNomaly/blob/main/images/scores_stream.png)
396 | 
397 | ### Notes
398 | When calculating the LoOP score of incoming data, the original fitted scores are not updated.
399 | In some applications, it may be beneficial to refit the data periodically. The stream functionality
400 | also assumes that either data or a distance matrix (or value) will be used across in both fitting
401 | and streaming, with no changes in specification between steps.
402 | 
403 | ## Contributing
404 | 
405 | Please use the issue tracker to report any erroneous behavior or desired 
406 | feature requests. 
407 | 
408 | If you would like to contribute to development, please fork the repository and make 
409 | any changes to a branch which corresponds to an open issue. Hot fixes 
410 | and bug fixes can be represented by branches with the prefix `fix/` versus 
411 | `feature/` for new capabilities or code improvements. Pull requests will 
412 | then be made from these branches into the repository's `dev` branch 
413 | prior to being pulled into `main`. 
414 | 
415 | ### Commit Messages and Releases
416 | 
417 | **Your commit messages are important** - here's why. 
418 | 
419 | PyNomaly leverages [release-please](https://github.com/googleapis/release-please-action) to help automate the release process using the [Conventional Commits](https://www.conventionalcommits.org/) specification. When pull requests are opened to the `main` branch, release-please will collate the git commit messages and prepare an organized changelog and release notes. This process can be completed because of the Conventional Commits specification. 
420 | 
421 | Conventional Commits provides an easy set of rules for creating an explicit commit history; which makes it easier to write automated tools on top of. This convention dovetails with SemVer, by describing the features, fixes, and breaking changes made in commit messages. You can check out examples [here](https://www.conventionalcommits.org/en/v1.0.0/#examples). Make a best effort to use the specification when contributing to Infactory code as it dramatically eases the documentation around releases and their features, breaking changes, bug fixes and documentation updates. 
422 | 
423 | ### Tests
424 | When contributing, please ensure to run unit tests and add additional tests as 
425 | necessary if adding new functionality. To run the unit tests, use `pytest`: 
426 | 
427 | ```
428 | python3 -m pytest --cov=PyNomaly -s -v
429 | ```
430 | 
431 | To run the tests with Numba enabled, simply set the flag `NUMBA` in `test_loop.py` 
432 | to `True`. Note that a drop in coverage is expected due to portions of the code 
433 | being compiled upon code execution. 
434 | 
435 | ## Versioning
436 | [Semantic versioning](http://semver.org/) is used for this project. If contributing, please conform to semantic
437 | versioning guidelines when submitting a pull request.
438 | 
439 | ## License
440 | This project is licensed under the Apache 2.0 license.
441 | 
442 | ## Research
443 | If citing PyNomaly, use the following: 
444 | 
445 | ```
446 | @article{Constantinou2018,
447 |   doi = {10.21105/joss.00845},
448 |   url = {https://doi.org/10.21105/joss.00845},
449 |   year  = {2018},
450 |   month = {oct},
451 |   publisher = {The Open Journal},
452 |   volume = {3},
453 |   number = {30},
454 |   pages = {845},
455 |   author = {Valentino Constantinou},
456 |   title = {{PyNomaly}: Anomaly detection using Local Outlier Probabilities ({LoOP}).},
457 |   journal = {Journal of Open Source Software}
458 | }
459 | ```
460 | 
461 | 
462 | ## References
463 | 1. Breunig M., Kriegel H.-P., Ng R., Sander, J. LOF: Identifying Density-based Local Outliers. ACM SIGMOD International Conference on Management of Data (2000). [PDF](http://www.dbs.ifi.lmu.de/Publikationen/Papers/LOF.pdf).
464 | 2. Kriegel H., Kröger P., Schubert E., Zimek A. LoOP: Local Outlier Probabilities. 18th ACM conference on Information and knowledge management, CIKM (2009). [PDF](http://www.dbs.ifi.lmu.de/Publikationen/Papers/LoOP1649.pdf).
465 | 3. Goldstein M., Uchida S. A Comparative Evaluation of Unsupervised Anomaly Detection Algorithms for Multivariate Data. PLoS ONE 11(4): e0152173 (2016).
466 | 4. Hamlet C., Straub J., Russell M., Kerlin S. An incremental and approximate local outlier probability algorithm for intrusion detection and its evaluation. Journal of Cyber Security Technology (2016). [DOI](http://www.tandfonline.com/doi/abs/10.1080/23742917.2016.1226651?journalCode=tsec20).
467 | 
468 | ## Acknowledgements
469 | - The authors of LoOP (Local Outlier Probabilities)
470 |     - Hans-Peter Kriegel
471 |     - Peer Kröger
472 |     - Erich Schubert
473 |     - Arthur Zimek
474 | - [NASA Jet Propulsion Laboratory](https://jpl.nasa.gov/)
475 |     - [Kyle Hundman](https://github.com/khundman)
476 |     - [Ian Colwell](https://github.com/iancolwell)
477 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.12.0
2 | python-utils>=2.3.0


--------------------------------------------------------------------------------
/requirements_ci.txt:
--------------------------------------------------------------------------------
1 | coveralls>=1.8.0
2 | pandas>=0.24.2
3 | pytest>=4.6.2
4 | pytest-cov>=2.7.1
5 | scikit-learn>=0.21.2
6 | scipy>=1.3.0
7 | wheel>=0.33.4


--------------------------------------------------------------------------------
/requirements_examples.txt:
--------------------------------------------------------------------------------
1 | matplotlib==3.1.0
2 | pandas>=0.24.2
3 | pydataset>=0.2.0
4 | scikit-learn>=0.21.2
5 | scipy>=1.3.0


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | from pathlib import Path
 4 | this_directory = Path(__file__).parent
 5 | long_description = (this_directory / "README.md").read_text()
 6 | 
 7 | setup(
 8 |     name='PyNomaly',
 9 |     packages=['PyNomaly'],
10 |     version='0.3.4',
11 |     description='A Python 3 implementation of LoOP: Local Outlier '
12 |                 'Probabilities, a local density based outlier detection '
13 |                 'method providing an outlier score in the range of [0,1].',
14 |     author='Valentino Constantinou',
15 |     author_email='vc@valentino.io',
16 |     long_description=long_description,
17 |     long_description_content_type='text/markdown',
18 |     url='https://github.com/vc1492a/PyNomaly',
19 |     download_url='https://github.com/vc1492a/PyNomaly/archive/0.3.4.tar.gz',
20 |     keywords=['outlier', 'anomaly', 'detection', 'machine', 'learning',
21 |               'probability'],
22 |     classifiers=[],
23 |     license='Apache License, Version 2.0',
24 |     install_requires=['numpy', 'python-utils']
25 | )
26 | 


--------------------------------------------------------------------------------
/tests/.coverage:
--------------------------------------------------------------------------------
1 | !coverage.py: This is a private format, don't read it directly!{"lines": {"/Users/valentinoconstantinou/Files/Coding_Stuff/Data Science/PyNomaly/PyNomaly/loop.py": [128, 1, 2, 3, 4, 5, 7, 8, 9, 138, 139, 12, 19, 84, 85, 98, 36, 38, 108, 175, 177, 178, 179, 118, 73, 57, 140], "/Users/valentinoconstantinou/Files/Coding_Stuff/Data Science/PyNomaly/PyNomaly/__init__.py": [1]}}


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vc1492a/PyNomaly/684dd3916b15d015d9b5e80a284dab75018ec278/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_loop.py:
--------------------------------------------------------------------------------
  1 | # Authors: Valentino Constantinou <vc@valentino.io>
  2 | # License: Apache 2.0
  3 | 
  4 | from PyNomaly import loop
  5 | 
  6 | import logging
  7 | from typing import Tuple
  8 | import numpy as np
  9 | from numpy.testing import assert_array_equal, assert_array_almost_equal
 10 | import pandas as pd
 11 | import pytest
 12 | from sklearn.datasets import load_iris
 13 | from sklearn.metrics import roc_auc_score
 14 | from sklearn.neighbors import NearestNeighbors
 15 | from sklearn.utils import check_random_state
 16 | import sys
 17 | 
 18 | logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 19 | 
 20 | # flag to enable or disable NUMBA
 21 | NUMBA = False
 22 | 
 23 | if NUMBA is False:
 24 |     logging.info(
 25 |         "Numba is disabled. Coverage statistics are reflective of "
 26 |         "testing native Python code. Consider also testing with numba"
 27 |         " enabled."
 28 |     )
 29 | else:
 30 |     logging.warning(
 31 |         "Numba is enabled. Coverage statistics will be impacted (reduced) to"
 32 |         " due the just-in-time compilation of native Python code."
 33 |     )
 34 | 
 35 | # load the iris dataset
 36 | # and randomly permute it
 37 | rng = check_random_state(0)
 38 | iris = load_iris()
 39 | perm = rng.permutation(iris.target.size)
 40 | iris.data = iris.data[perm]
 41 | iris.target = iris.target[perm]
 42 | 
 43 | 
 44 | # fixtures
 45 | @pytest.fixture()
 46 | def X_n8() -> np.ndarray:
 47 |     """
 48 |     Fixture that generates a small Numpy array with two anomalous values
 49 |     (last two observations).
 50 |     :return: a Numpy array.
 51 |     """
 52 |     # Toy sample (the last two samples are outliers):
 53 |     X = np.array(
 54 |         [[-2, -1], [-1, -1], [-1, -2], [1, 2], [1, 2], [2, 1], [5, 3], [-4, 2]]
 55 |     )
 56 |     return X
 57 | 
 58 | 
 59 | @pytest.fixture()
 60 | def X_n20_scores() -> Tuple[np.ndarray, np.ndarray]:
 61 |     """
 62 |     Fixture that returns a tuple containing a 20 element numpy array
 63 |     and the precalculated loOP scores based on that array.
 64 |     :return: tuple(input_data,exptected_scores)
 65 |     """
 66 |     input_data = np.array(
 67 |         [
 68 |             0.02059752,
 69 |             0.32629926,
 70 |             0.63036653,
 71 |             0.94409321,
 72 |             0.63251097,
 73 |             0.47598494,
 74 |             0.80204026,
 75 |             0.34845067,
 76 |             0.81556468,
 77 |             0.89183,
 78 |             0.25210317,
 79 |             0.11460502,
 80 |             0.19953434,
 81 |             0.36955067,
 82 |             0.06038041,
 83 |             0.34527368,
 84 |             0.56621582,
 85 |             0.90533649,
 86 |             0.33773613,
 87 |             0.71573306,
 88 |         ]
 89 |     )
 90 | 
 91 |     expected_scores = np.array(
 92 |         [
 93 |             0.6356276742921594,
 94 |             0.0,
 95 |             0.0,
 96 |             0.48490790006974044,
 97 |             0.0,
 98 |             0.0,
 99 |             0.0,
100 |             0.0,
101 |             0.021728288376168012,
102 |             0.28285086151683225,
103 |             0.0,
104 |             0.18881886507113213,
105 |             0.0,
106 |             0.0,
107 |             0.45350246469681843,
108 |             0.0,
109 |             0.07886635748113013,
110 |             0.3349068501560546,
111 |             0.0,
112 |             0.0,
113 |         ]
114 |     )
115 |     return (input_data, expected_scores)
116 | 
117 | 
118 | @pytest.fixture()
119 | def X_n120() -> np.ndarray:
120 |     """
121 |     Fixture that generates a Numpy array with 120 observations. Each
122 |     observation contains two float values.
123 |     :return: a Numpy array.
124 |     """
125 |     # Generate train/test data
126 |     rng = check_random_state(2)
127 |     X = 0.3 * rng.randn(120, 2)
128 |     return X
129 | 
130 | 
131 | @pytest.fixture()
132 | def X_n140_outliers(X_n120) -> np.ndarray:
133 |     """
134 |     Fixture that generates a Numpy array with 140 observations, where the
135 |     first 120 observations are "normal" and the last 20 considered anomalous.
136 |     :param X_n120: A pytest Fixture that generates the first 120 observations.
137 |     :return: A Numpy array.
138 |     """
139 |     # Generate some abnormal novel observations
140 |     X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
141 |     X = np.r_[X_n120, X_outliers]
142 |     return X
143 | 
144 | 
145 | @pytest.fixture()
146 | def X_n1000() -> np.ndarray:
147 |     """
148 |     Fixture that generates a Numpy array with 1000 observations.
149 |     :return: A Numpy array.
150 |     """
151 |     # Generate train/test data
152 |     rng = check_random_state(2)
153 |     X = 0.3 * rng.randn(1000, 2)
154 |     return X
155 | 
156 | 
157 | def test_loop(X_n8) -> None:
158 |     """
159 |     Tests the basic functionality and asserts that the anomalous observations
160 |     are detected as anomalies. Tests the functionality using inputs
161 |     as Numpy arrays and as Pandas dataframes.
162 |     :param X_n8: A pytest Fixture that generates the 8 observations.
163 |     :return: None
164 |     """
165 |     # Test LocalOutlierProbability:
166 |     clf = loop.LocalOutlierProbability(X_n8, n_neighbors=5, use_numba=NUMBA)
167 |     score = clf.fit().local_outlier_probabilities
168 |     share_outlier = 2.0 / 8.0
169 |     predictions = [-1 if s > share_outlier else 1 for s in score]
170 |     assert_array_equal(predictions, 6 * [1] + 2 * [-1])
171 | 
172 |     # Assert smallest outlier score is greater than largest inlier score:
173 |     assert np.min(score[-2:]) > np.max(score[:-2])
174 | 
175 |     # Test the DataFrame functionality
176 |     X_df = pd.DataFrame(X_n8)
177 | 
178 |     # Test LocalOutlierProbability:
179 |     clf = loop.LocalOutlierProbability(X_df, n_neighbors=5, use_numba=NUMBA)
180 |     score = clf.fit().local_outlier_probabilities
181 |     share_outlier = 2.0 / 8.0
182 |     predictions = [-1 if s > share_outlier else 1 for s in score]
183 |     assert_array_equal(predictions, 6 * [1] + 2 * [-1])
184 | 
185 |     # Assert smallest outlier score is greater than largest inlier score:
186 |     assert np.min(score[-2:]) > np.max(score[:-2])
187 | 
188 | 
189 | def test_regression(X_n20_scores) -> None:
190 |     """
191 |     Tests for potential regression errors by comparing current results
192 |     to the exptected results. Any changes to the code should still return
193 |     the same result given the same dataset
194 |     """
195 |     input_data, expected_scores = X_n20_scores
196 |     clf = loop.LocalOutlierProbability(input_data).fit()
197 |     scores = clf.local_outlier_probabilities
198 |     assert_array_almost_equal(scores, expected_scores, 6)
199 | 
200 | 
201 | def test_loop_performance(X_n120) -> None:
202 |     """
203 |     Using a set of known anomalies (labels), tests the performance (using
204 |     ROC / AUC score) of the software and ensures it is able to capture most
205 |     anomalies under this basic scenario.
206 |     :param X_n120: A pytest Fixture that generates the 120 observations.
207 |     :return: None
208 |     """
209 |     # Generate some abnormal novel observations
210 |     X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
211 |     X_test = np.r_[X_n120, X_outliers]
212 |     X_labels = np.r_[np.repeat(1, X_n120.shape[0]), np.repeat(-1, X_outliers.shape[0])]
213 | 
214 |     # fit the model
215 |     clf = loop.LocalOutlierProbability(
216 |         X_test,
217 |         n_neighbors=X_test.shape[0] - 1,
218 |         # test the progress bar
219 |         progress_bar=True,
220 |         use_numba=NUMBA,
221 |     )
222 | 
223 |     # predict scores (the lower, the more normal)
224 |     score = clf.fit().local_outlier_probabilities
225 |     share_outlier = X_outliers.shape[0] / X_test.shape[0]
226 |     X_pred = [-1 if s > share_outlier else 1 for s in score]
227 | 
228 |     # check that roc_auc is good
229 |     assert roc_auc_score(X_pred, X_labels) >= 0.98
230 | 
231 | 
232 | def test_input_nodata(X_n140_outliers) -> None:
233 |     """
234 |     Test to ensure that the proper warning is issued if no data is
235 |     provided.
236 |     :param X_n140_outliers: A pytest Fixture that generates 140 observations.
237 |     :return: None
238 |     """
239 |     with pytest.warns(UserWarning) as record:
240 |         # attempt to fit loop without data or a distance matrix
241 |         loop.LocalOutlierProbability(
242 |             n_neighbors=X_n140_outliers.shape[0] - 1, use_numba=NUMBA
243 |         )
244 | 
245 |     # check that only one warning was raised
246 |     assert len(record) == 1
247 |     # check that the message matches
248 |     assert record[0].message.args[0] == "Data or a distance matrix must be provided."
249 | 
250 | 
251 | def test_input_incorrect_type(X_n140_outliers) -> None:
252 |     """
253 |     Test to ensure that the proper warning is issued if the type of an
254 |     argument is the incorrect type.
255 |     :param X_n140_outliers: A pytest Fixture that generates 140 observations.
256 |     :return: None
257 |     """
258 |     with pytest.warns(UserWarning) as record:
259 |         # attempt to fit loop with a string input for n_neighbors
260 |         loop.LocalOutlierProbability(
261 |             X_n140_outliers,
262 |             n_neighbors=str(X_n140_outliers.shape[0] - 1),
263 |             use_numba=NUMBA,
264 |         )
265 | 
266 |     # check that only one warning was raised
267 |     assert len(record) == 1
268 |     # check that the message matches
269 |     assert (
270 |         record[0].message.args[0]
271 |         == "Argument 'n_neighbors' is not of type (<class 'int'>, "
272 |         "<class 'numpy.integer'>)."
273 |     )
274 | 
275 | 
276 | def test_input_neighbor_zero(X_n120) -> None:
277 |     """
278 |     Test to ensure that the proper warning is issued if the neighbor size
279 |     is specified as 0 (must be greater than 0).
280 |     :param X_n120: A pytest Fixture that generates 120 observations.
281 |     :return: None
282 |     """
283 |     clf = loop.LocalOutlierProbability(X_n120, n_neighbors=0, use_numba=NUMBA)
284 | 
285 |     with pytest.warns(UserWarning) as record:
286 |         # attempt to fit loop with a 0 neighbor count
287 |         clf.fit()
288 | 
289 |     # check that only one warning was raised
290 |     assert len(record) == 1
291 |     # check that the message matches
292 |     assert (
293 |         record[0].message.args[0]
294 |         == "n_neighbors must be greater than 0. Fit with 10 instead."
295 |     )
296 | 
297 | 
298 | def test_input_distonly(X_n120) -> None:
299 |     """
300 |     Test to ensure that the proper warning is issued if only a distance
301 |     matrix is provided (without a neighbor matrix).
302 |     :param X_n120: A pytest Fixture that generates 120 observations.
303 |     :return: None
304 |     """
305 |     # generate distance and neighbor indices
306 |     neigh = NearestNeighbors(metric="euclidean")
307 |     neigh.fit(X_n120)
308 |     d, idx = neigh.kneighbors(X_n120, n_neighbors=10, return_distance=True)
309 | 
310 |     with pytest.warns(UserWarning) as record:
311 |         # attempt to fit loop with a distance matrix and no neighbor matrix
312 |         loop.LocalOutlierProbability(distance_matrix=d, use_numba=NUMBA)
313 | 
314 |     # check that only one warning was raised
315 |     assert len(record) == 1
316 |     # check that the message matches
317 |     assert (
318 |         record[0].message.args[0]
319 |         == "A neighbor index matrix and distance matrix must both "
320 |         "be provided when not using raw input data."
321 |     )
322 | 
323 | 
324 | def test_input_neighboronly(X_n120) -> None:
325 |     """
326 |     Test to ensure that the proper warning is issued if only a neighbor
327 |     matrix is provided (without a distance matrix).
328 |     :param X_n120: A pytest Fixture that generates 120 observations.
329 |     :return: None
330 |     """
331 |     # generate distance and neighbor indices
332 |     neigh = NearestNeighbors(metric="euclidean")
333 |     neigh.fit(X_n120)
334 |     d, idx = neigh.kneighbors(X_n120, n_neighbors=10, return_distance=True)
335 | 
336 |     with pytest.warns(UserWarning) as record:
337 |         # attempt to fit loop with a neighbor matrix and no distance matrix
338 |         loop.LocalOutlierProbability(neighbor_matrix=idx, use_numba=NUMBA)
339 | 
340 |     # check that only one warning was raised
341 |     assert len(record) == 1
342 |     # check that the message matches
343 |     assert record[0].message.args[0] == "Data or a distance matrix must be provided."
344 | 
345 | 
346 | def test_input_too_many(X_n120) -> None:
347 |     """
348 |     Test to ensure that the proper warning is issued if both a data matrix
349 |     and a distance matrix are provided (can only be data matrix).
350 |     :param X_n120: A pytest Fixture that generates 120 observations.
351 |     :return: None
352 |     """
353 |     # generate distance and neighbor indices
354 |     neigh = NearestNeighbors(metric="euclidean")
355 |     neigh.fit(X_n120)
356 |     d, idx = neigh.kneighbors(X_n120, n_neighbors=10, return_distance=True)
357 | 
358 |     with pytest.warns(UserWarning) as record:
359 |         # attempt to fit loop with data and a distance matrix
360 |         loop.LocalOutlierProbability(
361 |             X_n120, distance_matrix=d, neighbor_matrix=idx, use_numba=NUMBA
362 |         )
363 | 
364 |     # check that only one warning was raised
365 |     assert len(record) == 1
366 |     # check that the message matches
367 |     assert (
368 |         record[0].message.args[0]
369 |         == "Only one of the following may be provided: data or a "
370 |         "distance matrix (not both)."
371 |     )
372 | 
373 | 
374 | def test_distance_neighbor_shape_mismatch(X_n120) -> None:
375 |     """
376 |     Test to ensure that the proper warning is issued if there is a mismatch
377 |     between the shape of the provided distance and neighbor matrices.
378 |     :param X_n120: A pytest Fixture that generates 120 observations.
379 |     :return: None
380 |     """
381 |     # generate distance and neighbor indices
382 |     neigh = NearestNeighbors(metric="euclidean")
383 |     neigh.fit(X_n120)
384 |     d, idx = neigh.kneighbors(X_n120, n_neighbors=10, return_distance=True)
385 | 
386 |     # generate distance and neighbor indices of a different shape
387 |     neigh_2 = NearestNeighbors(metric="euclidean")
388 |     neigh_2.fit(X_n120)
389 |     d_2, idx_2 = neigh.kneighbors(X_n120, n_neighbors=5, return_distance=True)
390 | 
391 |     with pytest.warns(UserWarning) as record:
392 |         # attempt to fit loop with a mismatch in shapes
393 |         loop.LocalOutlierProbability(
394 |             distance_matrix=d, neighbor_matrix=idx_2, n_neighbors=5, use_numba=NUMBA
395 |         )
396 | 
397 |     # check that only one warning was raised
398 |     assert len(record) == 1
399 |     # check that the message matches
400 |     assert (
401 |         record[0].message.args[0] == "The shape of the distance and neighbor "
402 |         "index matrices must match."
403 |     )
404 | 
405 | 
406 | def test_input_neighbor_mismatch(X_n120) -> None:
407 |     """
408 |     Test to ensure that the proper warning is issued if the supplied distance
409 |     (and neighbor) matrix and specified number of neighbors do not match.
410 |     :param X_n120: A pytest Fixture that generates 120 observations.
411 |     :return: None
412 |     """
413 |     # generate distance and neighbor indices
414 |     neigh = NearestNeighbors(metric="euclidean")
415 |     neigh.fit(X_n120)
416 |     d, idx = neigh.kneighbors(X_n120, n_neighbors=5, return_distance=True)
417 | 
418 |     with pytest.warns(UserWarning) as record:
419 |         # attempt to fit loop with a neighbor size mismatch
420 |         loop.LocalOutlierProbability(
421 |             distance_matrix=d, neighbor_matrix=idx, n_neighbors=10, use_numba=NUMBA
422 |         )
423 | 
424 |     # check that only one warning was raised
425 |     assert len(record) == 1
426 |     # check that the message matches
427 |     assert (
428 |         record[0].message.args[0] == "The shape of the distance or "
429 |         "neighbor index matrix does not "
430 |         "match the number of neighbors "
431 |         "specified."
432 |     )
433 | 
434 | 
435 | def test_loop_dist_matrix(X_n120) -> None:
436 |     """
437 |     Tests to ensure the proper results are returned when supplying the
438 |     appropriate format distance and neighbor matrices.
439 |     :param X_n120: A pytest Fixture that generates 120 observations.
440 |     :return: None
441 |     """
442 |     # generate distance and neighbor indices
443 |     neigh = NearestNeighbors(metric="euclidean")
444 |     neigh.fit(X_n120)
445 |     d, idx = neigh.kneighbors(X_n120, n_neighbors=10, return_distance=True)
446 | 
447 |     # fit loop using data and distance matrix
448 |     clf1 = loop.LocalOutlierProbability(X_n120, use_numba=NUMBA)
449 |     clf2 = loop.LocalOutlierProbability(
450 |         distance_matrix=d, neighbor_matrix=idx, use_numba=NUMBA
451 |     )
452 |     scores1 = clf1.fit().local_outlier_probabilities
453 |     scores2 = clf2.fit().local_outlier_probabilities
454 | 
455 |     # compare the agreement between the results
456 |     assert np.abs(scores2 - scores1).all() <= 0.1
457 | 
458 | 
459 | def test_lambda_values(X_n140_outliers) -> None:
460 |     """
461 |     Test to ensure results are returned which correspond to what is expected
462 |     when varying the extent parameter (we expect larger extent values to
463 |     result in more constrained scores).
464 |     :param X_n140_outliers: A pytest Fixture that generates 140 observations.
465 |     :return: None
466 |     """
467 |     # Fit the model with different extent (lambda) values
468 |     clf1 = loop.LocalOutlierProbability(X_n140_outliers, extent=1, use_numba=NUMBA)
469 |     clf2 = loop.LocalOutlierProbability(X_n140_outliers, extent=2, use_numba=NUMBA)
470 |     clf3 = loop.LocalOutlierProbability(X_n140_outliers, extent=3, use_numba=NUMBA)
471 | 
472 |     # predict scores (the lower, the more normal)
473 |     score1 = clf1.fit().local_outlier_probabilities
474 |     score2 = clf2.fit().local_outlier_probabilities
475 |     score3 = clf3.fit().local_outlier_probabilities
476 | 
477 |     # Get the mean of all the scores
478 |     score_mean1 = np.mean(score1)
479 |     score_mean2 = np.mean(score2)
480 |     score_mean3 = np.mean(score3)
481 | 
482 |     # check that expected the means align with expectation
483 |     assert score_mean1 > score_mean2
484 |     assert score_mean2 > score_mean3
485 | 
486 | 
487 | def test_parameters(X_n120) -> None:
488 |     """
489 |     Test to ensure that the model object contains the needed attributes after
490 |     the model is fit. This is important in the context of the streaming
491 |     functionality.
492 |     :param X_n120: A pytest Fixture that generates 120 observations.
493 |     :return: None
494 |     """
495 |     # fit the model
496 |     clf = loop.LocalOutlierProbability(X_n120, use_numba=NUMBA).fit()
497 | 
498 |     # check that the model has attributes post fit
499 |     assert hasattr(clf, "n_neighbors") and clf.n_neighbors is not None
500 |     assert hasattr(clf, "extent") and clf.extent is not None
501 |     assert hasattr(clf, "cluster_labels") and clf._cluster_labels() is not None
502 |     assert hasattr(clf, "prob_distances") and clf.prob_distances is not None
503 |     assert hasattr(clf, "prob_distances_ev") and clf.prob_distances_ev is not None
504 |     assert (
505 |         hasattr(clf, "norm_prob_local_outlier_factor")
506 |         and clf.norm_prob_local_outlier_factor is not None
507 |     )
508 |     assert (
509 |         hasattr(clf, "local_outlier_probabilities")
510 |         and clf.local_outlier_probabilities is not None
511 |     )
512 | 
513 | 
514 | def test_n_neighbors() -> None:
515 |     """
516 |     Tests the functionality of providing a large number of neighbors that
517 |     is greater than the number of observations (software defaults to the
518 |     data input size and provides a UserWarning).
519 |     :return: None
520 |     """
521 |     X = iris.data
522 |     clf = loop.LocalOutlierProbability(X, n_neighbors=500, use_numba=NUMBA).fit()
523 |     assert clf.n_neighbors == X.shape[0] - 1
524 | 
525 |     clf = loop.LocalOutlierProbability(X, n_neighbors=500, use_numba=NUMBA)
526 | 
527 |     with pytest.warns(UserWarning) as record:
528 |         clf.fit()
529 | 
530 |     # check that only one warning was raised
531 |     assert len(record) == 1
532 | 
533 |     assert clf.n_neighbors == X.shape[0] - 1
534 | 
535 | 
536 | def test_extent() -> None:
537 |     """
538 |     Test to ensure that a UserWarning is issued when providing an invalid
539 |     extent parameter value (can be 1, 2, or 3).
540 |     :return: None
541 |     """
542 |     X = np.array([[1, 1], [1, 0]])
543 |     clf = loop.LocalOutlierProbability(X, n_neighbors=2, extent=4, use_numba=NUMBA)
544 | 
545 |     with pytest.warns(UserWarning) as record:
546 |         clf.fit()
547 | 
548 |     # check that only one warning was raised
549 |     assert len(record) == 1
550 | 
551 | 
552 | def test_data_format() -> None:
553 |     """
554 |     Test to ensure that a UserWarning is issued when the shape of the input
555 |     data is not explicitly correct. This is corrected by the software when
556 |     possible.
557 |     :return: None
558 |     """
559 |     X = [1.3, 1.1, 0.9, 1.4, 1.5, 3.2]
560 |     clf = loop.LocalOutlierProbability(X, n_neighbors=3, use_numba=NUMBA)
561 | 
562 |     with pytest.warns(UserWarning) as record:
563 |         clf.fit()
564 | 
565 |     # check that only one warning was raised
566 |     assert len(record) == 1
567 | 
568 | 
569 | def test_missing_values() -> None:
570 |     """
571 |     Test to ensure that the program exits of a missing value is encountered
572 |     in the input data, as this is not allowable.
573 |     :return: None
574 |     """
575 |     X = np.array([1.3, 1.1, 0.9, 1.4, 1.5, np.nan, 3.2])
576 |     clf = loop.LocalOutlierProbability(X, n_neighbors=3, use_numba=NUMBA)
577 | 
578 |     with pytest.raises(SystemExit) as record_a, pytest.warns(UserWarning) as record_b:
579 |         clf.fit()
580 | 
581 |     assert record_a.type == SystemExit
582 | 
583 |     # check that only one warning was raised
584 |     assert len(record_b) == 1
585 |     # check that the message matches
586 |     assert (
587 |         record_b[0].message.args[0]
588 |         == "Method does not support missing values in input data."
589 |     )
590 | 
591 | 
592 | def test_small_cluster_size(X_n140_outliers) -> None:
593 |     """
594 |     Test to ensure that the program exits when the specified number of
595 |     neighbors is larger than the smallest cluster size in the input data.
596 |     :param X_n140_outliers: A pytest Fixture that generates 140 observations.
597 |     :return: None
598 |     """
599 |     # Generate cluster labels
600 |     a = [0] * 120
601 |     b = [1] * 18
602 |     cluster_labels = a + b
603 | 
604 |     clf = loop.LocalOutlierProbability(
605 |         X_n140_outliers, n_neighbors=50, cluster_labels=cluster_labels, use_numba=NUMBA
606 |     )
607 | 
608 |     with pytest.raises(SystemExit) as record_a, pytest.warns(UserWarning) as record_b:
609 |         clf.fit()
610 | 
611 |     assert record_a.type == SystemExit
612 | 
613 |     # check that only one warning was raised
614 |     assert len(record_b) == 1
615 |     # check that the message matches
616 |     assert (
617 |         record_b[0].message.args[0]
618 |         == "Number of neighbors specified larger than smallest "
619 |         "cluster. Specify a number of neighbors smaller than "
620 |         "the smallest cluster size (observations in smallest "
621 |         "cluster minus one)."
622 |     )
623 | 
624 | 
625 | def test_stream_fit(X_n140_outliers) -> None:
626 |     """
627 |     Test to ensure that the proper warning is issued if the user attempts
628 |     to use the streaming approach prior to the classical approach being fit.
629 |     :param X_n140_outliers: A pytest Fixture that generates 140 observations.
630 |     :return: None
631 |     """
632 |     # Fit the model
633 |     X_train = X_n140_outliers[0:138]
634 |     X_test = X_n140_outliers[139]
635 |     clf = loop.LocalOutlierProbability(X_train, use_numba=NUMBA)
636 | 
637 |     with pytest.warns(UserWarning) as record:
638 |         clf.stream(X_test)
639 | 
640 |     # check that the message matches
641 |     messages = [i.message.args[0] for i in record]
642 |     assert (
643 |         "Must fit on historical data by calling fit() prior to "
644 |         "calling stream(x)." in messages
645 |     )
646 | 
647 | 
648 | def test_stream_distance(X_n140_outliers) -> None:
649 |     """
650 |     Test to ensure that the streaming approach functions as desired when
651 |     providing matrices for use and that the returned results are within some
652 |     margin of error when compared to the classical approach (using the RMSE).
653 |     :param X_n140_outliers: A pytest Fixture that generates 140 observations.
654 |     :return: None
655 |     """
656 |     X_train = X_n140_outliers[0:100]
657 |     X_test = X_n140_outliers[100:140]
658 | 
659 |     # generate distance and neighbor indices
660 |     neigh = NearestNeighbors(metric="euclidean")
661 |     neigh.fit(X_train)
662 |     d, idx = neigh.kneighbors(X_train, n_neighbors=10, return_distance=True)
663 | 
664 |     # Fit the models in standard and distance matrix form
665 |     m = loop.LocalOutlierProbability(X_train, use_numba=NUMBA).fit()
666 |     m_dist = loop.LocalOutlierProbability(
667 |         distance_matrix=d, neighbor_matrix=idx, use_numba=NUMBA
668 |     ).fit()
669 | 
670 |     # Collect the scores
671 |     X_test_scores = []
672 |     for i in range(X_test.shape[0]):
673 |         X_test_scores.append(m.stream(np.array(X_test[i])))
674 |     X_test_scores = np.array(X_test_scores)
675 | 
676 |     X_test_dist_scores = []
677 |     for i in range(X_test.shape[0]):
678 |         dd, ii = neigh.kneighbors(np.array([X_test[i]]), return_distance=True)
679 |         X_test_dist_scores.append(m_dist.stream(np.mean(dd)))
680 |     X_test_dist_scores = np.array(X_test_dist_scores)
681 | 
682 |     # calculate the rmse and ensure score is below threshold
683 |     rmse = np.sqrt(((X_test_scores - X_test_dist_scores) ** 2).mean(axis=None))
684 |     assert 0.075 >= rmse
685 | 
686 | 
687 | def test_stream_cluster(X_n140_outliers) -> None:
688 |     """
689 |     Test to ensure that the proper warning is issued if the streaming approach
690 |     is called on clustered data, as the streaming approach does not support
691 |     this functionality.
692 |     :param X_n140_outliers: A pytest Fixture that generates 140 observations.
693 |     :return: None
694 |     """
695 |     # Generate cluster labels
696 |     a = [0] * 120
697 |     b = [1] * 18
698 |     cluster_labels = a + b
699 | 
700 |     # Fit the model
701 |     X_train = X_n140_outliers[0:138]
702 |     X_test = X_n140_outliers[139]
703 |     clf = loop.LocalOutlierProbability(
704 |         X_train, cluster_labels=cluster_labels, use_numba=NUMBA
705 |     ).fit()
706 | 
707 |     with pytest.warns(UserWarning) as record:
708 |         clf.stream(X_test)
709 | 
710 |     # check that only one warning was raised
711 |     assert len(record) == 1
712 |     # check that the message matches
713 |     assert (
714 |         record[0].message.args[0] == "Stream approach does not support clustered data. "
715 |         "Automatically refit using single cluster of points."
716 |     )
717 | 
718 | 
719 | def test_stream_performance(X_n140_outliers) -> None:
720 |     """
721 |     Test to ensure that the streaming approach works as desired when using
722 |     a regular set of input data (no distance and neighbor matrices) and that
723 |     the result is within some expected level of error when compared to the
724 |     classical approach.
725 |     :param X_n140_outliers: A pytest Fixture that generates 140 observations.
726 |     :return:
727 |     """
728 |     X_train = X_n140_outliers[0:100]
729 |     X_test = X_n140_outliers[100:140]
730 | 
731 |     # Fit the models in standard and stream form
732 |     m = loop.LocalOutlierProbability(X_n140_outliers, use_numba=NUMBA).fit()
733 |     scores_noclust = m.local_outlier_probabilities
734 | 
735 |     m_train = loop.LocalOutlierProbability(X_train, use_numba=NUMBA)
736 |     m_train.fit()
737 |     X_train_scores = m_train.local_outlier_probabilities
738 | 
739 |     X_test_scores = []
740 |     for idx in range(X_test.shape[0]):
741 |         X_test_scores.append(m_train.stream(X_test[idx]))
742 |     X_test_scores = np.array(X_test_scores)
743 | 
744 |     stream_scores = np.hstack((X_train_scores, X_test_scores))
745 | 
746 |     # calculate the rmse and ensure score is below threshold
747 |     rmse = np.sqrt(((scores_noclust - stream_scores) ** 2).mean(axis=None))
748 |     assert 0.35 > rmse
749 | 
750 | 
751 | def test_progress_bar(X_n8) -> None:
752 |     """
753 |     Tests the progress bar functionality on a small number of observations,
754 |     when the number of observations is less than the width of the console
755 |     window.
756 |     :param X_n8: a numpy array with 8 observations.
757 |     :return: None
758 |     """
759 | 
760 |     # attempt to use the progress bar on a small number of observations
761 |     loop.LocalOutlierProbability(X_n8, use_numba=NUMBA, progress_bar=True).fit()
762 | 
763 | 
764 | def test_data_flipping() -> None:
765 |     """
766 |     Tests the flipping of data and cluster labels and ensures that the
767 |     :return: None
768 |     """
769 |     np.random.seed(1)
770 |     n = 9
771 |     data = np.append(
772 |         np.random.normal(2, 1, [n, 2]), np.random.normal(8, 1, [n, 2]), axis=0
773 |     )
774 |     clus = np.append(np.ones(n), 2 * np.ones(n)).tolist()
775 |     model = loop.LocalOutlierProbability(data, n_neighbors=5, cluster_labels=clus)
776 |     fit = model.fit()
777 |     res = fit.local_outlier_probabilities
778 | 
779 |     data_flipped = np.flipud(data)
780 |     clus_flipped = np.flipud(clus).tolist()
781 |     model2 = loop.LocalOutlierProbability(
782 |         data_flipped, n_neighbors=5, cluster_labels=clus_flipped
783 |     )
784 |     fit2 = model2.fit()
785 |     res2 = np.flipud(fit2.local_outlier_probabilities)
786 | 
787 |     assert_array_almost_equal(res, res2, decimal=6)
788 |     assert_array_almost_equal(
789 |         fit.norm_prob_local_outlier_factor,
790 |         fit2.norm_prob_local_outlier_factor,
791 |         decimal=6,
792 |     )
793 | 
794 | 
795 | def test_distance_matrix_consistency(X_n120) -> None:
796 |     """
797 |     Test to ensure that the distance matrix is consistent with the neighbor
798 |     matrix and that the software is able to handle self-distances.
799 |     :return: None
800 |     """
801 | 
802 |     neigh = NearestNeighbors(metric='euclidean')
803 |     neigh.fit(X_n120)
804 |     distances, indices = neigh.kneighbors(X_n120, n_neighbors=11, return_distance=True)
805 | 
806 |     # remove the closest neighbor (its the point itself) from each row in the indices matrix and distances matrix
807 |     indices = np.delete(indices, 0, 1)
808 |     distances = np.delete(distances, 0, 1)
809 | 
810 |     # Fit LoOP with and without distance matrix
811 |     clf_data = loop.LocalOutlierProbability(X_n120, n_neighbors=10)
812 |     clf_dist = loop.LocalOutlierProbability(distance_matrix=distances, neighbor_matrix=indices, n_neighbors=11)
813 | 
814 |     # Attempt to retrieve scores and check types
815 |     scores_data = clf_data.fit().local_outlier_probabilities
816 |     scores_dist = clf_dist.fit().local_outlier_probabilities
817 | 
818 |     # Debugging prints to investigate types and contents
819 |     print("Type of scores_data:", type(scores_data))
820 |     print("Type of scores_dist:", type(scores_dist))
821 |     print("Value of scores_data:", scores_data)
822 |     print("Value of scores_dist:", scores_dist)
823 |     print("Shape of scores_data:", scores_data.shape)
824 |     print("Shape of scores_dist:", scores_dist.shape)
825 | 
826 |     # Convert to arrays if they aren't already
827 |     scores_data = np.array(scores_data) if not isinstance(scores_data, np.ndarray) else scores_data
828 |     scores_dist = np.array(scores_dist) if not isinstance(scores_dist, np.ndarray) else scores_dist
829 | 
830 |     # Check shapes and types before assertion
831 |     assert scores_data.shape == scores_dist.shape, "Score shapes mismatch"
832 |     assert isinstance(scores_data, np.ndarray), "Expected scores_data to be a numpy array"
833 |     assert isinstance(scores_dist, np.ndarray), "Expected scores_dist to be a numpy array"
834 | 
835 |     # Compare scores allowing for minor floating-point differences
836 |     assert_array_almost_equal(scores_data, scores_dist, decimal=10, err_msg="Inconsistent LoOP scores due to self-distances")
837 | 


--------------------------------------------------------------------------------