├── .gitattributes
├── .github
└── workflows
│ ├── docs.yml
│ ├── python-publish.yml
│ └── test.yml
├── .gitignore
├── .vscode
└── settings.json
├── LICENSE
├── README.md
├── docs
├── Makefile
├── api.rst
├── conf.py
├── design.rst
├── getting_started.rst
├── index.rst
└── make.bat
├── images
├── active_learning_app.png
├── anomaly.png
└── logo
│ ├── tsod.eps
│ ├── tsod.png
│ └── tsod.svg
├── notebooks
├── Example Water Level.ipynb
├── Getting started.ipynb
├── SMHI_hydrology.ipynb
└── cmems.ipynb
├── pyproject.toml
├── tests
├── __init__.py
├── data
│ ├── BO_TS_MO_FINO2.nc
│ ├── Ballen_20150218-20201222.csv
│ ├── combined.joblib
│ └── example.csv
├── data_generation.py
├── test_detectors.py
└── test_persistence.py
└── tsod
├── __init__.py
├── base.py
├── custom_exceptions.py
├── detectors.py
├── features.py
└── hampel.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb linguist-language=Python
--------------------------------------------------------------------------------
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
1 | name: Documentation
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 |
8 | jobs:
9 | build:
10 |
11 | runs-on: ubuntu-latest
12 |
13 | steps:
14 | - uses: actions/checkout@v2
15 | - name: Set up Python
16 | uses: actions/setup-python@v2
17 | with:
18 | python-version: 3.12
19 |
20 | - name: Install package
21 | run: |
22 | pip install .[dev]
23 |
24 | - name: Sphinx Build
25 | run: |
26 | cd docs
27 | make html
28 |
29 | - name: Publish to GitHub Pages
30 | uses: peaceiris/actions-gh-pages@v3.6.1
31 | with:
32 | github_token: ${{ secrets.GITHUB_TOKEN }}
33 | publish_dir: docs/_build/html
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 |
4 | name: Upload Python Package
5 |
6 | on:
7 | workflow_dispatch:
8 |
9 | release:
10 | types: [created]
11 |
12 | jobs:
13 | deploy:
14 |
15 | runs-on: ubuntu-latest
16 |
17 | steps:
18 | - uses: actions/checkout@v2
19 | - name: Set up Python
20 | uses: actions/setup-python@v2
21 | with:
22 | python-version: '3.x'
23 | - name: Install dependencies
24 | run: |
25 | python -m pip install --upgrade pip
26 | pip install setuptools wheel twine
27 | - name: Build and publish
28 | env:
29 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
30 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
31 | run: |
32 | python setup.py sdist bdist_wheel
33 | twine upload dist/*
34 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, lint, and run tests for every Python version and operating system in the build matrix
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Full test
5 |
6 | on:
7 | push:
8 | branches: [ main ]
9 | pull_request:
10 | branches: [ main ]
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ${{ matrix.os }}
16 | strategy:
17 | matrix:
18 | os: [ubuntu-latest, windows-latest]
19 | python-version: ["3.10", "3.13"]
20 |
21 | steps:
22 | - uses: actions/checkout@v3
23 | - uses: chartboost/ruff-action@v1 # Fail fast if there are any linting errors
24 | - name: Set up Python ${{ matrix.python-version }}
25 | uses: actions/setup-python@v4
26 | with:
27 | python-version: ${{ matrix.python-version }}
28 | - name: Install dependencies
29 | run: |
30 | python -m pip install --upgrade pip
31 | pip install wheel pytest pytest-cov
32 |
33 | - name: Install tsod
34 | run: |
35 | pip install .[test]
36 | - name: Test with pytest
37 | run: |
38 | pytest --cov=tsod tests
39 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # PyCharm
2 | .idea/
3 |
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__/
6 | *.py[cod]
7 | *$py.class
8 |
9 | # C extensions
10 | *.so
11 |
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | pip-wheel-metadata/
27 | share/python-wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .nox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | *.py,cover
54 | .hypothesis/
55 | .pytest_cache/
56 |
57 | # Translations
58 | *.mo
59 | *.pot
60 |
61 | # Django stuff:
62 | *.log
63 | local_settings.py
64 | db.sqlite3
65 | db.sqlite3-journal
66 |
67 | # Flask stuff:
68 | instance/
69 | .webassets-cache
70 |
71 | # Scrapy stuff:
72 | .scrapy
73 |
74 | # Sphinx documentation
75 | docs/_build/
76 |
77 | # PyBuilder
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
134 | # Sphinx
135 | /doc/_build/
136 |
137 | data/
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.testing.pytestArgs": [
3 | "tests"
4 | ],
5 | "python.formatting.provider": "none",
6 | "editor.formatOnSave": true,
7 | "python.testing.unittestEnabled": false,
8 | "python.testing.nosetestsEnabled": false,
9 | "python.testing.pytestEnabled": true,
10 | "restructuredtext.confPath": "${workspaceFolder}\\doc",
11 | "[python]": {
12 | "editor.defaultFormatter": "ms-python.black-formatter"
13 | }
14 | }
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 DHI
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # tsod: Anomaly Detection for time series data.
4 |
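5 | [![Full test](https://github.com/DHI/tsod/actions/workflows/test.yml/badge.svg)](https://github.com/DHI/tsod/actions/workflows/test.yml)
6 | [![PyPI version](https://badge.fury.io/py/tsod.svg)](https://badge.fury.io/py/tsod)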
7 | 
8 |
9 | 
10 |
11 | Sensors often provide faulty or missing observations. These anomalies must be detected automatically and replaced with more feasible values before feeding the data to numerical simulation engines as boundary conditions or real time decision systems.
12 |
13 | This package aims to provide examples and algorithms for detecting anomalies in time series data specifically tailored to DHI users and the water domain. It is simple to install and deploy operationally and is accessible to everyone (open-source).
14 |
15 | ## Getting Started
16 |
17 | * [Documentation](https://dhi.github.io/tsod/getting_started.html)
18 | * [Notebook](https://github.com/DHI/tsod/blob/main/notebooks/Getting%20started.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](http://colab.research.google.com/github/DHI/tsod/blob/main/notebooks/Getting%20started.ipynb)
19 |
20 |
21 |
22 | ## Installation
23 |
24 | `tsod` is a pure Python library and runs on Windows, Linux and Mac.
25 |
26 | From PyPI:
27 |
28 | `pip install tsod`
29 |
30 | Or development version:
31 |
32 | `pip install https://github.com/DHI/tsod/archive/main.zip`
33 |
34 |
38 |
39 |
40 | ## Vision
41 | * A simple and consistent API for anomaly detection of timeseries
42 | * The computational speed will be good for typical timeseries data found in the water domain, to support realtime detection
43 | * It will have a suite of different algorithms ranging from simple rule-based to more advanced based on e.g. neural networks
44 |
45 | ## Definitions
46 | Note that we distinguish between [two types of anomaly detection](https://scikit-learn.org/stable/modules/outlier_detection.html)
47 |
48 | - Outlier detection (unsupervised anomaly detection)
49 | The training data may contain outliers, i.e. observations far from most other observations. Outlier detectors try to concentrate on the observations in the training data that are similar and close together, and ignore observations further away.
50 |
51 | - Novelty detection (semi-supervised anomaly detection)
52 | The training data is considered "normal" and is not polluted by outliers. A new test observation can be categorized as an outlier, and is in this context called a novelty.
53 |
54 |
55 | ## Contribute to `tsod`
56 | [Open in Visual Studio Code](https://open.vscode.dev/DHI/tsod)
57 | - Follow PEP8 code style. This is automatically checked during Pull Requests.
58 |
59 | - If citing or re-using other code please make sure their license is also consistent with our policy.
60 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
1 | .. _api:
2 |
3 | API Reference
4 | =================
5 |
6 | * :class:`tsod.RangeDetector`
7 | * :class:`tsod.ConstantValueDetector`
8 | * :class:`tsod.ConstantGradientDetector`
9 | * :class:`tsod.GradientDetector`
10 | * :class:`tsod.DiffDetector`
11 | * :class:`tsod.CombinedDetector`
12 |
13 | Generic
14 | -------
15 | .. autoclass:: tsod.RangeDetector
16 | :members:
17 | :undoc-members:
18 | :inherited-members:
19 |
20 | .. autoclass:: tsod.ConstantValueDetector
21 | :members:
22 | :undoc-members:
23 | :inherited-members:
24 |
25 | .. autoclass:: tsod.ConstantGradientDetector
26 | :members:
27 | :undoc-members:
28 | :inherited-members:
29 |
30 | .. autoclass:: tsod.GradientDetector
31 | :members:
32 | :undoc-members:
33 | :inherited-members:
34 |
35 | .. autoclass:: tsod.DiffDetector
36 | :members:
37 | :undoc-members:
38 | :inherited-members:
39 |
40 | .. autoclass:: tsod.CombinedDetector
41 | :members:
42 | :undoc-members:
43 | :inherited-members:
44 |
45 | Hampel
46 | -------
47 | .. autoclass:: tsod.hampel.HampelDetector
48 | :members:
49 | :undoc-members:
50 | :inherited-members:
51 |
52 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 |
17 |
18 | # -- Project information -----------------------------------------------------
19 |
20 | project = "tsod"
21 | copyright = (
22 | "2021, Henrik Andersson, Rasmus Halvgaard, Laura Frøhlich, Jesper Mariegaard"
23 | )
24 | author = "Henrik Andersson, Rasmus Halvgaard, Laura Frøhlich, Jesper Mariegaard"
25 |
26 | # The full version, including alpha/beta/rc tags
27 | release = ""
28 |
29 |
30 | # -- General configuration ---------------------------------------------------
31 |
32 | # Add any Sphinx extension module names here, as strings. They can be
33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
34 | # ones.
35 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon"]
36 |
37 | # Add any paths that contain templates here, relative to this directory.
38 | templates_path = ["_templates"]
39 |
40 | # List of patterns, relative to source directory, that match files and
41 | # directories to ignore when looking for source files.
42 | # This pattern also affects html_static_path and html_extra_path.
43 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
44 |
45 |
46 | # -- Options for HTML output -------------------------------------------------
47 |
48 | # The theme to use for HTML and HTML Help pages. See the documentation for
49 | # a list of builtin themes.
50 | #
51 | html_theme = "sphinx_book_theme"
52 |
53 | # Add any paths that contain custom static files (such as style sheets) here,
54 | # relative to this directory. They are copied after the builtin static files,
55 | # so a file named "default.css" will overwrite the builtin "default.css".
56 | # html_static_path = ['_static']
57 |
58 | # autoclass_content = 'both'
59 |
--------------------------------------------------------------------------------
/docs/design.rst:
--------------------------------------------------------------------------------
1 | .. _design:
2 |
3 | Design philosophy
4 | =================
5 |
6 | * Easy to use
7 | * Easy to install
8 | * Easy to get started
9 | * Open Source
10 | * Easy to collaborate
11 | * Reproducible
12 | * Easy access to new features
13 |
14 |
15 | Easy to use
16 | -----------
17 | Common operations such as reading a file should only need a few lines of code.
18 |
19 | Make extensive use of existing standard libraries for scientific computing such as numpy, matplotlib and pandas.
20 |
21 |
22 | Easy to install
23 | ---------------
24 |
25 | From PyPI::
26 |
27 | pip install tsod
28 |
29 |
30 | Easy to get started
31 | -------------------
32 | By providing many examples to cut/paste from.
33 |
34 | Examples are available in two forms:
35 |
36 | * `Unit tests <https://github.com/DHI/tsod/tree/main/tests>`_
37 | * `Jupyter notebooks <https://github.com/DHI/tsod/tree/main/notebooks>`_
38 |
39 | Open Source
40 | ------------
41 |
42 | tsod is an open source project licensed under the `MIT license <https://github.com/DHI/tsod/blob/main/LICENSE>`_.
43 | The software is provided free of charge with the source code available for inspection and modification.
44 |
45 |
46 | Easy to collaborate
47 | --------------------
48 |
49 | By developing `tsod` on GitHub along with a completely open discussion, we believe that the collaboration between developers and end-users results in a useful library.
50 |
51 | Reproducible
52 | ------------
53 |
54 | By providing the historical versions of `tsod` on PyPI it is possible to reproduce the behaviour of an older existing system, based on an older version.
55 |
56 | Install specific version::
57 |
58 | pip install tsod==0.1.2
59 |
60 |
61 | Install development version::
62 |
63 | pip install https://github.com/DHI/tsod/archive/main.zip
64 |
--------------------------------------------------------------------------------
/docs/getting_started.rst:
--------------------------------------------------------------------------------
1 | .. _getting_started:
2 |
3 | Getting started
4 | ===============
5 |
6 | `tsod` is a library for timeseries data. The format of a timeseries is always a :py:class:`~pandas.Series`, in some cases with a :py:class:`~pandas.DatetimeIndex`.
7 |
8 | 1. Get data in the form of a :py:class:`~pandas.Series` (see Data formats below)
9 | 2. Select one or more detectors e.g. :class:`RangeDetector <tsod.RangeDetector>` or :class:`ConstantValueDetector <tsod.ConstantValueDetector>`
10 | 3. Define parameters (e.g. min/max, max rate of change) or...
11 | 4. Fit parameters based on normal data, i.e. without outliers
12 | 5. Detect outliers in any dataset
13 |
14 | .. image:: https://colab.research.google.com/assets/colab-badge.svg
15 | :target: http://colab.research.google.com/github/DHI/tsod/blob/main/notebooks/Getting%20started.ipynb
16 |
17 | Example
18 | -------
19 |
20 | >>> import pandas as pd
21 | >>> from tsod import RangeDetector
22 | >>> rd = RangeDetector(max_value=2.0)
23 | >>> data = pd.Series([0.0, 1.0, 3.0]) # 3.0 is out of range i.e. an anomaly
24 | >>> anom = rd.detect(data)
25 | >>> anom
26 | 0 False
27 | 1 False
28 | 2 True
29 | dtype: bool
30 | >>> data[anom] # get anomalous data
31 | 2 3.0
32 | dtype: float64
33 | >>> data[~anom] # get normal data
34 | 0 0.0
35 | 1 1.0
36 | dtype: float64
37 | >>>
38 |
39 |
40 | Saving and loading
41 | ------------------
42 | .. code-block:: python
43 |
44 | # save a configured detector
45 | cd = CombinedDetector([ConstantValueDetector(), RangeDetector()])
46 | cd.fit(normal_data)
47 | cd.save("detector.joblib")
48 |
49 | # ... and then later load it from disk
50 | my_detector = tsod.load("detector.joblib")
51 | my_detector.detect(some_data)
52 |
53 | Data formats
54 | ------------
55 |
56 | Converting data to a :py:class:`~pandas.Series`
57 |
58 | .. code-block:: python
59 |
60 | import pandas as pd
61 | df = pd.read_csv("mydata.csv", parse_dates=True, index_col=0)
62 | my_series = df['water_level']
63 |
64 | from mikeio import Dfs0
65 | dfs = Dfs0('simple.dfs0')
66 | df = dfs.to_dataframe()
67 | my_series_2 = df['rainfall']
68 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. tsod documentation master file, created by
2 | sphinx-quickstart on Thu Mar 25 08:11:16 2021.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | .. image:: https://raw.githubusercontent.com/DHI/tsod/main/images/logo/tsod.png
7 | :width: 600
8 |
9 | tsod: Anomaly Detection for time series data.
10 | =============================================
11 |
12 |
13 | .. image:: https://raw.githubusercontent.com/DHI/tsod/main/images/anomaly.png
14 |
15 | Sensors often provide faulty or missing observations. These anomalies must be detected automatically and replaced with more feasible values before feeding the data to numerical simulation engines as boundary conditions or real time decision systems.
16 |
17 | This package aims to provide examples and algorithms for detecting anomalies in time series data specifically tailored to DHI users and the water domain. It is simple to install and deploy operationally and is accessible to everyone (open-source).
18 |
19 | .. toctree::
20 | :maxdepth: 2
21 | :caption: Contents:
22 | :hidden:
23 |
24 | getting_started
25 | design
26 | api
27 |
28 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/images/active_learning_app.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DHI/tsod/07669cfb4d92daf28e6ee1495f0090ae4046a57e/images/active_learning_app.png
--------------------------------------------------------------------------------
/images/anomaly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DHI/tsod/07669cfb4d92daf28e6ee1495f0090ae4046a57e/images/anomaly.png
--------------------------------------------------------------------------------
/images/logo/tsod.eps:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DHI/tsod/07669cfb4d92daf28e6ee1495f0090ae4046a57e/images/logo/tsod.eps
--------------------------------------------------------------------------------
/images/logo/tsod.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DHI/tsod/07669cfb4d92daf28e6ee1495f0090ae4046a57e/images/logo/tsod.png
--------------------------------------------------------------------------------
/images/logo/tsod.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
31 |
--------------------------------------------------------------------------------
/notebooks/Example Water Level.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Load water level data from DMI"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import os\n",
17 | "import pandas as pd\n",
18 | "import numpy as np"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "metadata": {},
25 | "outputs": [
26 | {
27 | "name": "stderr",
28 | "output_type": "stream",
29 | "text": [
30 | "C:\\Users\\JAN\\AppData\\Local\\Temp/ipykernel_21332/396824514.py:3: DeprecationWarning: `set_matplotlib_formats` is deprecated since IPython 7.23, directly use `matplotlib_inline.backend_inline.set_matplotlib_formats()`\n",
31 | " set_matplotlib_formats('png')\n"
32 | ]
33 | }
34 | ],
35 | "source": [
36 | "from IPython.display import set_matplotlib_formats\n",
37 | "set_matplotlib_formats('png')"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 3,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "from tsod.detectors import CombinedDetector, RangeDetector, DiffDetector, RollingStandardDeviationDetector\n",
47 | "from tsod.hampel import HampelDetector"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 4,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "file_path = os.path.join(\"..\", \"tests\", \"data\", \"Ballen_20150218-20201222.csv\")\n",
57 | "df = pd.read_csv(file_path, index_col=0, parse_dates=True)"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 5,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "data = df.water_level"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 6,
72 | "metadata": {},
73 | "outputs": [
74 | {
75 | "data": {
76 | "text/plain": [
77 | ""
78 | ]
79 | },
80 | "execution_count": 6,
81 | "metadata": {},
82 | "output_type": "execute_result"
83 | },
84 | {
85 | "data": {
86 | "image/png": "\n",
87 | "text/plain": [
88 | ""
89 | ]
90 | },
91 | "metadata": {
92 | "needs_background": "light"
93 | },
94 | "output_type": "display_data"
95 | }
96 | ],
97 | "source": [
98 | "data.plot()"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "metadata": {},
104 | "source": [
105 | "# Detect anomalies outside manually set range"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 7,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "range_anomalies = RangeDetector(-1, 1.3).detect(data)"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 8,
120 | "metadata": {},
121 | "outputs": [
122 | {
123 | "data": {
124 | "text/plain": [
125 | ""
126 | ]
127 | },
128 | "execution_count": 8,
129 | "metadata": {},
130 | "output_type": "execute_result"
131 | },
132 | {
133 | "data": {
134 | "image/png": "\n",
135 | "text/plain": [
136 | ""
137 | ]
138 | },
139 | "metadata": {
140 | "needs_background": "light"
141 | },
142 | "output_type": "display_data"
143 | }
144 | ],
145 | "source": [
146 | "detected = data.to_frame()\n",
147 | "detected[\"anomalies\"] = data[range_anomalies.values]\n",
148 | "detected.plot(style=['-', 'o'], figsize=(8,3), title=f\"Anomalies detected: {range_anomalies.sum()}\")"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 9,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "data_clean = data.copy()\n",
158 | "data_clean[range_anomalies.values] = np.nan"
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "metadata": {},
164 | "source": [
165 | "# Detect anomalies outside automatically set range"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 10,
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "N = 1000\n",
175 | "normal_data, test_data = data[:N], data[N:]"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 11,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "anomaly_detector = CombinedDetector([RangeDetector(), DiffDetector()])\n",
185 | "anomaly_detector.fit(normal_data)\n",
186 | "detected_anomalies = anomaly_detector.detect(test_data)"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": 12,
192 | "metadata": {},
193 | "outputs": [
194 | {
195 | "data": {
196 | "text/plain": [
197 | "2015-02-25 12:30:00 False\n",
198 | "2015-02-25 12:40:00 False\n",
199 | "2015-02-25 12:50:00 False\n",
200 | "2015-02-25 13:00:00 False\n",
201 | "2015-02-25 13:20:00 False\n",
202 | "dtype: bool"
203 | ]
204 | },
205 | "execution_count": 12,
206 | "metadata": {},
207 | "output_type": "execute_result"
208 | }
209 | ],
210 | "source": [
211 | "detected_anomalies.head()"
212 | ]
213 | },
214 | {
215 | "cell_type": "markdown",
216 | "metadata": {},
217 | "source": [
218 | "# Detect peaks"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 13,
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 | "detector = RollingStandardDeviationDetector(10, 0.1)\n",
228 | "std_anomalies = detector.detect(data)"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 14,
234 | "metadata": {},
235 | "outputs": [
236 | {
237 | "data": {
238 | "text/plain": [
239 | ""
240 | ]
241 | },
242 | "execution_count": 14,
243 | "metadata": {},
244 | "output_type": "execute_result"
245 | },
246 | {
247 | "data": {
248 | "image/png": "\n",
249 | "text/plain": [
250 | ""
251 | ]
252 | },
253 | "metadata": {
254 | "needs_background": "light"
255 | },
256 | "output_type": "display_data"
257 | }
258 | ],
259 | "source": [
260 | "detected = data.to_frame()\n",
261 | "detected[\"anomalies\"] = data[std_anomalies.values]\n",
262 | "detected.plot(style=['-', 'o'], figsize=(8,3))"
263 | ]
264 | },
265 | {
266 | "cell_type": "markdown",
267 | "metadata": {},
268 | "source": [
269 | "# Hampel filter"
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "metadata": {},
275 | "source": [
276 | "The default threshold of the HampelDetector is 3, which means that a sample that deviates by more than three times the rolling window's standard deviation is marked as an anomaly. **Increasing** the threshold marks **fewer** samples as anomalies, **decreasing** the threshold marks **more**."
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": 16,
282 | "metadata": {},
283 | "outputs": [],
284 | "source": [
285 | "detector = HampelDetector(window_size=20, threshold=3)"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 17,
291 | "metadata": {},
292 | "outputs": [],
293 | "source": [
294 | "anomalies = detector.detect(data)"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 18,
300 | "metadata": {},
301 | "outputs": [
302 | {
303 | "data": {
304 | "text/plain": [
305 | ""
306 | ]
307 | },
308 | "execution_count": 18,
309 | "metadata": {},
310 | "output_type": "execute_result"
311 | },
312 | {
313 | "data": {
314 | "image/png": "\n",
315 | "text/plain": [
316 | ""
317 | ]
318 | },
319 | "metadata": {
320 | "needs_background": "light"
321 | },
322 | "output_type": "display_data"
323 | }
324 | ],
325 | "source": [
326 | "detected = data.to_frame()\n",
327 | "detected[\"anomalies\"] = data[anomalies]\n",
328 | "detected.plot(style=['-', 'o'], figsize=(8,3), title=f'Anomalies detected: {sum(anomalies)}')"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": null,
334 | "metadata": {},
335 | "outputs": [],
336 | "source": []
337 | }
338 | ],
339 | "metadata": {
340 | "kernelspec": {
341 | "display_name": "Python 3 (ipykernel)",
342 | "language": "python",
343 | "name": "python3"
344 | },
345 | "language_info": {
346 | "codemirror_mode": {
347 | "name": "ipython",
348 | "version": 3
349 | },
350 | "file_extension": ".py",
351 | "mimetype": "text/x-python",
352 | "name": "python",
353 | "nbconvert_exporter": "python",
354 | "pygments_lexer": "ipython3",
355 | "version": "3.9.6"
356 | }
357 | },
358 | "nbformat": 4,
359 | "nbformat_minor": 4
360 | }
361 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["hatchling"]
3 | build-backend = "hatchling.build"
4 |
5 | [tool.hatch.build]
6 | include = [
7 | "tsod/**/*",
8 | ]
9 |
10 | [project]
11 | name = "tsod"
12 | version = "0.3.dev0"
13 | description = "Time series anomaly detection."
14 | authors = [{ name = "Henrik Andersson", email = "jan@dhigroup.com" }]
15 | license = { text = "MIT" }
16 | readme = "README.md"
17 | requires-python = ">=3.10"
18 | dependencies = [
19 | "pandas>=1.0.0",
20 | "joblib",
21 | "numba",
22 | ]
23 |
24 | classifiers = [
25 | "License :: OSI Approved :: MIT License",
26 | "Development Status :: 2 - Pre-Alpha",
27 | "Intended Audience :: Science/Research",
28 | "Programming Language :: Python",
29 | "Programming Language :: Python :: 3",
30 | "Programming Language :: Python :: 3.10",
31 | "Programming Language :: Python :: 3.11",
32 | "Programming Language :: Python :: 3.12",
33 | "Operating System :: OS Independent",
34 | "Topic :: Scientific/Engineering",
35 | ]
36 |
37 | [project.optional-dependencies]
38 | dev = [
39 | "ruff",
40 | "pytest>=6",
41 | "pytest-cov>=4",
42 | "sphinx<7,>=4",
43 | "sphinx-book-theme",
44 | ]
45 |
46 | [project.urls]
47 | "Homepage" = "https://github.com/DHI/tsod"
48 | "Bug Tracker" = "https://github.com/DHI/tsod/issues"
49 |
50 |
51 |
52 | [tool.ruff]
53 | lint.ignore = ["E501", "E741"]
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DHI/tsod/07669cfb4d92daf28e6ee1495f0090ae4046a57e/tests/__init__.py
--------------------------------------------------------------------------------
/tests/data/BO_TS_MO_FINO2.nc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DHI/tsod/07669cfb4d92daf28e6ee1495f0090ae4046a57e/tests/data/BO_TS_MO_FINO2.nc
--------------------------------------------------------------------------------
/tests/data/combined.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DHI/tsod/07669cfb4d92daf28e6ee1495f0090ae4046a57e/tests/data/combined.joblib
--------------------------------------------------------------------------------
/tests/data/example.csv:
--------------------------------------------------------------------------------
1 | datetime,value
2 | 2000-01-01 00:00,0
3 | 2000-01-01 01:00,1.152781471
4 | 2000-01-01 02:00,1.450555612
5 | 2000-01-01 03:00,0.958642996
6 | 2000-01-01 04:00,1.174894688
7 | 2000-01-01 05:00,1.873848691
8 | 2000-01-01 06:00,1.84439191
9 | 2000-01-01 07:00,1.042026059
10 | 2000-01-01 08:00,0.787750065
11 | 2000-01-01 09:00,0.748739534
12 | 2000-01-01 10:00,0.821560744
13 | 2000-01-01 11:00,1.225795469
14 | 2000-01-01 12:00,0.957734013
15 | 2000-01-01 13:00,3
16 | 2000-01-01 14:00,3
17 | 2000-01-01 15:00,0.41953078
18 | 2000-01-01 16:00,0.612788648
19 | 2000-01-01 17:00,0.960619748
20 | 2000-01-01 18:00,0.922445351
21 | 2000-01-01 19:00,0.317372902
22 | 2000-01-01 20:00,0.683232022
23 | 2000-01-01 21:00,1.59095915
24 | 2000-01-01 22:00,1.284410756
25 | 2000-01-01 23:00,0.506858535
26 | 2000-01-02 00:00,1
27 | 2000-01-02 01:00,1
28 | 2000-01-02 02:00,1
29 | 2000-01-02 03:00,1
30 | 2000-01-02 04:00,1
31 | 2000-01-02 05:00,1
32 | 2000-01-02 06:00,1
33 | 2000-01-02 07:00,1
34 | 2000-01-02 08:00,1
35 | 2000-01-02 09:00,1
36 | 2000-01-02 10:00,1
37 | 2000-01-02 11:00,1
38 | 2000-01-02 12:00,1
39 | 2000-01-02 13:00,1.586141486
40 | 2000-01-02 14:00,1.314504228
41 | 2000-01-02 15:00,1.201552756
42 | 2000-01-02 16:00,1.445814101
43 | 2000-01-02 17:00,1.162387057
44 | 2000-01-02 18:00,1.066841287
45 | 2000-01-02 19:00,1.00248802
46 | 2000-01-02 20:00,0.503450841
47 | 2000-01-02 21:00,0.872922217
48 | 2000-01-02 22:00,1.580114567
49 | 2000-01-02 23:00,1.045514877
50 | 2000-01-03 00:00,1.012183073
51 | 2000-01-03 01:00,1.425633166
52 | 2000-01-03 02:00,1.191899682
53 | 2000-01-03 03:00,1.267666114
54 | 2000-01-03 04:00,1.061485161
55 | 2000-01-03 05:00,0.665546206
56 | 2000-01-03 06:00,0.424668666
57 | 2000-01-03 07:00,2.5
58 | 2000-01-03 08:00,0.4
59 | 2000-01-03 09:00,1.295534815
60 | 2000-01-03 10:00,1.514240194
61 | 2000-01-03 11:00,1.083260333
62 | 2000-01-03 12:00,1.1584635
63 | 2000-01-03 13:00,1.679222803
64 | 2000-01-03 14:00,1.081591441
65 | 2000-01-03 15:00,0.454411928
66 | 2000-01-03 16:00,0.937110802
67 | 2000-01-03 17:00,1.020471646
68 | 2000-01-03 18:00,1.285019944
69 | 2000-01-03 19:00,1.450649173
70 | 2000-01-03 20:00,0.937287208
71 | 2000-01-03 21:00,0.506385868
72 | 2000-01-03 22:00,0.921499469
73 | 2000-01-03 23:00,1.081260008
74 | 2000-01-04 00:00,1.015525661
75 | 2000-01-04 01:00,1.319109975
76 | 2000-01-04 02:00,1.187151435
77 | 2000-01-04 03:00,1.267982362
78 | 2000-01-04 04:00,0.896385507
79 | 2000-01-04 05:00,1
80 | 2000-01-04 06:00,0.9
81 | 2000-01-04 07:00,0.8
82 | 2000-01-04 08:00,0.7
83 | 2000-01-04 09:00,0.6
84 | 2000-01-04 10:00,0.5
85 | 2000-01-04 11:00,0.4
86 | 2000-01-04 12:00,0.3
87 | 2000-01-04 13:00,0.2
88 | 2000-01-04 14:00,0.1
89 | 2000-01-04 15:00,0
90 | 2000-01-04 16:00,0.768838193
91 | 2000-01-04 17:00,1.156824441
92 | 2000-01-04 18:00,1.399798202
93 | 2000-01-04 19:00,1.112722702
94 | 2000-01-04 20:00,1.221512379
95 | 2000-01-04 21:00,0.859352212
96 | 2000-01-04 22:00,0.247665553
97 | 2000-01-04 23:00,0.784977135
98 | 2000-01-05 00:00,0.634419463
99 | 2000-01-05 01:00,0.239960571
100 | 2000-01-05 02:00,0.422304927
101 | 2000-01-05 03:00,0.606980415
102 | 2000-01-05 03:59,0.776914226
103 | 2000-01-05 04:59,0.96975411
104 | 2000-01-05 05:59,0.774306839
105 | 2000-01-05 06:59,0.967961138
106 | 2000-01-05 07:59,1.145144565
107 | 2000-01-05 08:59,0.464866706
108 | 2000-01-05 09:59,0.13530199
109 | 2000-01-05 10:59,0.738101172
110 | 2000-01-05 11:59,0.832450078
111 | 2000-01-05 12:59,1.082938156
112 | 2000-01-05 13:59,0.948400949
113 | 2000-01-05 14:59,0.65301421
114 | 2000-01-05 15:59,1.526724574
115 | 2000-01-05 16:59,1.842639141
116 | 2000-01-05 17:59,1.869540157
117 | 2000-01-05 18:59,1.697764564
118 | 2000-01-05 19:59,0.784910422
119 | 2000-01-05 20:59,1.040359074
120 | 2000-01-05 21:59,1.802531081
121 | 2000-01-05 22:59,0.834522078
122 | 2000-01-05 23:59,0.179125049
123 | 2000-01-06 00:59,1.064403211
124 | 2000-01-06 01:59,1.859795992
125 | 2000-01-06 02:59,1.58075584
126 | 2000-01-06 03:59,1.058748553
127 | 2000-01-06 04:59,0.503596224
128 | 2000-01-06 05:59,0.503728592
129 | 2000-01-06 06:59,0.917595552
130 | 2000-01-06 07:59,1.322622102
131 | 2000-01-06 08:59,0.968904598
132 | 2000-01-06 09:59,1.080349742
133 | 2000-01-06 10:59,1.401408437
134 | 2000-01-06 11:59,1.44165518
135 | 2000-01-06 12:59,1.745179237
136 |
--------------------------------------------------------------------------------
/tests/data_generation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
def create_random_walk_with_outliers(
    n_steps, t0=0, outlier_fraction=0.1, outlier_scale=10, seed=42
):
    """
    Build a +/-1 random walk and a copy of it with outlier peaks injected.

    Parameters
    ------------
    n_steps : int
        Length of the time series to be generated.
    t0 : int
        Time series initial value.
    outlier_fraction : float
        Fraction of outliers to be generated in series [0-1].
    outlier_scale : float
        Scalar by which to multiply the RW increment to create an outlier.
    seed : int
        Random seed

    Returns
    -------
    random_walk_with_outliers : np.ndarray
        The generated random walk time series with outliers.
    outlier_indices : list
        Sorted indices of the introduced outliers. Indices are drawn with
        replacement, so the list may contain duplicates.
    random_walk : np.ndarray
        The same random walk without any outliers.
    """
    assert 0 <= outlier_fraction <= 1

    # Seed first: the order of the random draws below determines the output.
    np.random.seed(seed)
    steps = np.random.choice(a=[-1, 1], size=n_steps)
    # The walk starts at t0; the final step is not applied.
    clean_walk = np.cumsum(np.append(t0, steps[:-1]), axis=0)

    outlier_count = int(outlier_fraction * n_steps)
    picked = np.random.randint(0, n_steps, outlier_count)

    noisy_walk = clean_walk.copy()
    noisy_walk[picked] += steps[picked] * outlier_scale

    return noisy_walk, sorted(picked), clean_walk
47 |
--------------------------------------------------------------------------------
/tests/test_detectors.py:
--------------------------------------------------------------------------------
1 | from tsod.base import Detector
2 | import pytest
3 | import numpy as np
4 | import pandas as pd
5 | import os
6 |
7 | from tsod.custom_exceptions import WrongInputDataTypeError
8 | from tsod.detectors import (
9 | RangeDetector,
10 | DiffDetector,
11 | CombinedDetector,
12 | RollingStandardDeviationDetector,
13 | ConstantValueDetector,
14 | ConstantGradientDetector,
15 | GradientDetector,
16 | )
17 |
18 | from tsod.features import create_dataset
19 | from tsod.hampel import HampelDetector
20 |
21 |
22 | from tests.data_generation import create_random_walk_with_outliers
23 |
24 |
@pytest.fixture
def data_series():
    """Random-walk fixture on an hourly index starting in 2020.

    Returns a tuple ``(series_with_outliers, outlier_indices, clean_series)``
    built from ``create_random_walk_with_outliers`` with 100 steps.
    """
    n_steps = 100
    (
        time_series_with_outliers,
        outlier_indices,
        random_walk,
    ) = create_random_walk_with_outliers(n_steps)
    time = pd.date_range(start="2020", periods=n_steps, freq="1h")
    return (
        pd.Series(time_series_with_outliers, index=time),
        outlier_indices,
        pd.Series(random_walk, index=time),
    )
39 |
40 |
41 | @pytest.fixture
42 | def range_data():
43 | normal_data = np.array([0, np.nan, 1, 0, 2, np.nan, 3.14, 4])
44 | abnormal_data = np.array([-1.0, np.nan, 2.0, np.nan, 1.0, 0.0, 4.1, 10.0])
45 | expected_anomalies = np.array([True, False, False, False, False, False, True, True])
46 | assert len(expected_anomalies) == len(abnormal_data)
47 | return normal_data, abnormal_data, expected_anomalies
48 |
49 |
@pytest.fixture
def range_data_series(range_data):
    """The ``range_data`` arrays wrapped as pandas Series on an hourly index.

    The expected-anomaly mask is passed through unchanged (still a numpy array).
    """
    normal_data, abnormal_data, expected_anomalies = range_data
    time = pd.date_range(start="2020", periods=len(normal_data), freq="1h")
    return (
        pd.Series(normal_data, index=time),
        pd.Series(abnormal_data, index=time),
        expected_anomalies,
    )
59 |
60 |
@pytest.fixture
def constant_gradient_data_series(range_data):
    """Hourly series for constant-gradient tests.

    The abnormal series rises by 0.1 per step over indices 1-5, which is the
    stretch marked in the expected mask.
    """
    # NOTE(review): the ``range_data`` argument is requested but never used here.
    normal_data = np.array([0, np.nan, 1, 1.1, 1.4, 1.5555, 3.14, 4])
    abnormal_data = np.array([-1, 2.0, 2.1, 2.2, 2.3, 2.4, 4, 10])
    expected_anomalies = np.array([False, True, True, True, True, True, False, False])
    time = pd.date_range(start="2020", periods=len(normal_data), freq="1h")
    return (
        pd.Series(normal_data, index=time),
        pd.Series(abnormal_data, index=time),
        expected_anomalies,
    )
72 |
73 |
@pytest.fixture
def constant_data_series(range_data):
    """Hourly series for constant-value tests.

    The abnormal series holds the value 1 over indices 2-5, which is the
    stretch marked in the expected mask.
    """
    # NOTE(review): the ``range_data`` argument is requested but never used here.
    normal_data = np.array([0, np.nan, 1, 1.1, 1.4, 1.5555, 3.14, 4])
    abnormal_data = np.array([-1, np.nan, 1, 1, 1, 1, 4, 10])
    expected_anomalies = np.array([False, False, True, True, True, True, False, False])
    time = pd.date_range(start="2020", periods=len(normal_data), freq="1h")
    return (
        pd.Series(normal_data, index=time),
        pd.Series(abnormal_data, index=time),
        expected_anomalies,
    )
85 |
86 |
def test_base_detector_exceptions(range_data, range_data_series):
    """Fitting on a raw numpy array must raise WrongInputDataTypeError."""
    data, _, _ = range_data
    # NOTE(review): ``data_series`` is unpacked but not used by any assertion below.
    data_series, _, _ = range_data_series

    detector = RangeDetector()
    pytest.raises(WrongInputDataTypeError, detector.fit, data)
93 |
94 |
def test_range_detector(range_data_series):
    """An explicit [0, 2] range flags only the two values above 2 (3.14 and 4)."""
    data, _, _ = range_data_series

    detector = RangeDetector(0, 2)
    anomalies = detector.detect(data)
    # NaN samples compare False against both bounds, so they are not flagged.
    expected_anomalies = [False, False, False, False, False, False, True, True]
    assert len(anomalies) == len(data)
    assert sum(anomalies) == 2
    assert all(expected_anomalies == anomalies)
104 |
105 |
def test_range_detector_autoset(range_data_series):
    """Specifying only one bound leaves the other side unrestricted."""
    data, _, _ = range_data_series

    # Only a lower bound: the four values below 3 (0, 1, 0, 2) are flagged.
    anomalies = RangeDetector(min_value=3).detect(data)
    assert sum(anomalies) == 4

    # Only an upper bound: the two values above 3 (3.14, 4) are flagged.
    anomalies = RangeDetector(max_value=3).detect(data)
    assert sum(anomalies) == 2
114 |
115 |
def test_combined_fit(range_data_series):
    """A fitted CombinedDetector reproduces the expected mask of the range fixture."""
    normal_data, abnormal_data, labels = range_data_series
    cd = CombinedDetector([ConstantValueDetector(), RangeDetector()])
    # Fitting the combination fits every member detector on the normal data.
    cd.fit(normal_data)

    anomalies = cd.detect(abnormal_data)
    assert all(anomalies == labels)
123 |
124 |
def test_combined_wrong_type():
    """Passing a detector class instead of an instance must raise ValueError."""
    with pytest.raises(ValueError):
        CombinedDetector([ConstantValueDetector, RangeDetector()])  # first item is a class, not an instance
128 |
129 |
def test_combined_access_items():
    """CombinedDetector supports positional (including negative) indexing."""

    cd = CombinedDetector([ConstantValueDetector(), RangeDetector()])

    assert isinstance(cd[0], Detector)
    assert isinstance(cd[0], ConstantValueDetector)
    assert isinstance(cd[1], RangeDetector)
    assert isinstance(cd[-1], RangeDetector)
138 |
139 |
def test_range_detector_quantile():
    """Fitting with quantiles tightens the range so training outliers are ignored."""
    # Seeded draws: the order of the two ``normal`` calls fixes the data.
    np.random.seed(42)
    train = np.random.normal(size=1000)
    test = np.random.normal(size=1000)

    # Two extreme values in the training data ...
    train[42] = -6.5
    train[560] = 10.5

    # ... and two less extreme ones in the test data.
    test[142] = -4.5
    test[960] = 5.5

    normal_data_incl_two_outliers = pd.Series(train)
    test_data = pd.Series(test)

    # all test data is within range of train data, no anomalies detected
    nqdetector = RangeDetector().fit(normal_data_incl_two_outliers)
    detected_anomalies = nqdetector.detect(test_data)
    assert sum(detected_anomalies) == 0

    # exclude extreme values
    detector = RangeDetector(quantiles=[0.001, 0.999]).fit(
        normal_data_incl_two_outliers
    )
    detected_anomalies = detector.detect(test_data)
    assert sum(detected_anomalies) == 2
    # The fitted bounds must lie strictly inside the raw training min/max.
    assert detector._min > normal_data_incl_two_outliers.min()
    assert detector._max < normal_data_incl_two_outliers.max()
167 |
168 |
def test_diff_detector_autoset(range_data_series):
    """A DiffDetector fitted on normal data flags two jumps in the abnormal data."""
    # NOTE(review): ``expected_anomalies`` is unpacked but only the count is asserted.
    normal_data, abnormal_data, expected_anomalies = range_data_series

    detector = DiffDetector().fit(normal_data)
    detected_anomalies = detector.detect(abnormal_data)
    assert sum(detected_anomalies) == 2
175 |
176 |
def test_combined_detector():
    """A CombinedDetector run on the example CSV returns a pandas Series."""
    # Resolve the data file relative to this test module. Splitting the
    # absolute path on the substring "tests" breaks as soon as any parent
    # directory name happens to contain "tests".
    tests_dir = os.path.dirname(os.path.abspath(__file__))
    df = pd.read_csv(
        os.path.join(tests_dir, "data", "example.csv"),
        parse_dates=True,
        index_col=0,
    )
    combined = CombinedDetector(
        [
            ConstantValueDetector(),
            RangeDetector(max_value=2.0),
        ]
    )

    series = df.value
    res = combined.detect(series)

    assert isinstance(res, pd.Series)
195 |
196 |
def test_rollingstddev_detector():
    """Rolling-std detection with default, fitted and manually set thresholds."""

    # Seeded draws: the order of the two ``normal`` calls fixes the data.
    np.random.seed(42)
    normal_data = pd.Series(np.random.normal(scale=1.0, size=1000)) + 10.0 * np.sin(
        np.linspace(0, 10, num=1000)
    )
    abnormal_data = pd.Series(np.random.normal(scale=2.0, size=100))

    # NOTE: concat keeps both original integer indexes, so labels 0-99 occur twice.
    all_data = pd.concat([normal_data, abnormal_data])

    # Default max_std is infinite, so nothing can be flagged before fitting.
    detector = RollingStandardDeviationDetector()
    anomalies = detector.detect(normal_data)
    assert sum(anomalies) == 0

    # After fitting, the training data itself must still be clean.
    detector.fit(normal_data)
    anomalies = detector.detect(normal_data)
    assert sum(anomalies) == 0

    anomalies = detector.detect(all_data)
    assert sum(anomalies) > 0

    # Manual specification
    detector = RollingStandardDeviationDetector(max_std=2.0)
    anomalies = detector.detect(normal_data)
    assert sum(anomalies) == 0

    anomalies = detector.detect(all_data)
    assert sum(anomalies) > 0
225 |
226 |
def test_hampel_detector(data_series):
    """Every anomaly found by a default HampelDetector must be an injected outlier."""
    data_with_anomalies, expected_anomalies_indices, _ = data_series
    detector = HampelDetector()
    anomalies = detector.detect(data_with_anomalies)
    # Positional indices of the samples flagged True.
    anomalies_indices = np.array(np.where(anomalies)).flatten()
    # Validate if the found anomalies are also in the expected anomaly set
    # NB Not necessarily all of them
    assert all(i in expected_anomalies_indices for i in anomalies_indices)
235 |
236 |
237 |
def test_constant_value_detector(constant_data_series):
    """Varying data yields no anomalies; the run of four equal values is flagged."""
    good_data, abnormal_data, _ = constant_data_series

    # Positional arguments are (window_size, threshold).
    detector = ConstantValueDetector(2, 0.0001)
    anomalies = detector.detect(good_data)

    assert len(anomalies) == len(good_data)
    assert sum(anomalies) == 0

    detector = ConstantValueDetector(3, 0.0001)
    anomalies = detector.detect(abnormal_data)

    assert len(anomalies) == len(abnormal_data)
    assert sum(anomalies) == 4
252 |
253 |
def test_constant_gradient_detector(constant_gradient_data_series):
    """Irregular data yields no anomalies; the five-sample linear ramp is flagged."""
    good_data, abnormal_data, _ = constant_gradient_data_series

    detector = ConstantGradientDetector(3)
    anomalies = detector.detect(good_data)

    assert len(anomalies) == len(good_data)
    assert sum(anomalies) == 0

    detector = ConstantGradientDetector(3)
    anomalies = detector.detect(abnormal_data)

    assert len(anomalies) == len(abnormal_data)
    assert sum(anomalies) == 5
268 |
269 |
def test_gradient_detector_constant_gradient(constant_gradient_data_series):
    """A GradientDetector constructed with 1.0 flags nothing in the normal series."""
    good_data, _, _ = constant_gradient_data_series

    detector = GradientDetector(1.0)
    anomalies = detector.detect(good_data)

    assert len(anomalies) == len(good_data)
    assert sum(anomalies) == 0
278 |
279 |
def test_gradient_detector_sudden_jump():
    """A fitted GradientDetector flags the single sudden jump in the abnormal data.

    The two arrays differ only at indices 1 and 2, where the abnormal data
    jumps from -1.5 to 1.5.
    """

    normal_data = np.array(
        [
            -0.5,
            -0.6,
            0.6,
            0.6,
            0.1,
            0.6,
            0.4,
            0.8,
            0.7,
            1.5,
            1.6,
            1.1,
            0.3,
            2.1,
            0.7,
            0.3,
            -1.7,
            -0.3,
            0.0,
            -1.0,
        ]
    )
    abnormal_data = np.array(
        [
            -0.5,
            -1.5,
            1.5,
            0.6,
            0.1,
            0.6,
            0.4,
            0.8,
            0.7,
            1.5,
            1.6,
            1.1,
            0.3,
            2.1,
            0.7,
            0.3,
            -1.7,
            -0.3,
            0.0,
            -1.0,
        ]
    )

    # NOTE(review): ``expected_anomalies`` is built but never compared against;
    # only the anomaly count is asserted below.
    expected_anomalies = np.repeat(False, len(normal_data))
    expected_anomalies[2] = True
    time = pd.date_range(start="2020", periods=len(normal_data), freq="1h")

    normal_data = pd.Series(normal_data, index=time)
    abnormal_data = pd.Series(abnormal_data, index=time)

    detector = GradientDetector()

    anomalies = detector.detect(normal_data)
    assert sum(anomalies) == 0

    # Default is to accept any gradient
    anomalies = detector.detect(abnormal_data)
    assert sum(anomalies) == 0

    # Max gradient 2.0/h
    detector.fit(normal_data)
    anomalies = detector.detect(abnormal_data)

    assert sum(anomalies) == 1
352 |
353 |
def test_create_dataset(data_series):
    """create_dataset drops ``time_steps`` rows and yields ``time_steps`` columns."""
    data_with_anomalies, _, _ = data_series
    # Name the series so it becomes column "y" of the frame.
    data_with_anomalies.name = "y"
    data = data_with_anomalies.to_frame()
    time_steps = 2
    predictors, y = create_dataset(data[["y"]], data.y, time_steps)
    assert len(y) == len(data) - time_steps
    assert predictors.shape[0] == len(data) - time_steps
    assert predictors.shape[1] == time_steps
363 |
--------------------------------------------------------------------------------
/tests/test_persistence.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tsod
3 | from tsod import RangeDetector, ConstantValueDetector, CombinedDetector
4 |
5 |
def test_save_and_load(tmp_path):
    """Round-trip a CombinedDetector through save/load using a ``pathlib.Path``."""

    combined = CombinedDetector(
        [
            ConstantValueDetector(),
            RangeDetector(max_value=2.0),
        ]
    )

    # ``tmp_path`` is a Path, so this exercises the Path branch of the API.
    path = tmp_path / "combined.joblib"
    combined.save(path)

    loaded = tsod.load(path)

    assert isinstance(loaded, CombinedDetector)
21 |
22 |
def test_load():
    """Load the detector checked in under ``tests/data`` and verify its type."""
    # Resolve the file relative to this test module. Splitting the absolute
    # path on the substring "tests" breaks as soon as any parent directory
    # name happens to contain "tests".
    tests_dir = os.path.dirname(os.path.abspath(__file__))
    filename = os.path.join(tests_dir, "data", "combined.joblib")

    loaded = tsod.load(filename)

    assert isinstance(loaded, CombinedDetector)
32 |
33 |
def test_save_and_load_filename(tmpdir):
    """Round-trip a CombinedDetector through save/load using a plain string path."""

    combined = CombinedDetector(
        [
            ConstantValueDetector(),
            RangeDetector(max_value=2.0),
        ]
    )

    # ``os.path.join`` yields a str, so this exercises the string branch of the API.
    filename = os.path.join(tmpdir, "combined.joblib")
    combined.save(filename)

    loaded = tsod.load(filename)

    assert isinstance(loaded, CombinedDetector)
49 |
--------------------------------------------------------------------------------
/tsod/__init__.py:
--------------------------------------------------------------------------------
1 | from .detectors import (
2 | RangeDetector,
3 | DiffDetector,
4 | ConstantGradientDetector,
5 | GradientDetector,
6 | ConstantValueDetector,
7 | CombinedDetector,
8 | RollingStandardDeviationDetector,
9 | )
10 |
11 |
12 | from .base import load
13 |
# NOTE(review): pyproject.toml declares version "0.3.dev0" while this string
# says "0.2.0" - confirm which one is authoritative and keep them in sync.
__version__ = "0.2.0"

# Names re-exported as the public API of the package.
__all__ = [
    "RangeDetector",
    "DiffDetector",
    "ConstantGradientDetector",
    "GradientDetector",
    "ConstantValueDetector",
    "CombinedDetector",
    "RollingStandardDeviationDetector",
    "load",
]
26 |
--------------------------------------------------------------------------------
/tsod/base.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Union
3 |
4 | from pathlib import Path
5 | import joblib
6 |
7 | import pandas as pd
8 |
9 |
10 | from .custom_exceptions import WrongInputDataTypeError
11 |
12 |
def load(path: Union[str, Path]):
    """Load a detector from disk that was saved with `Detector.save`.

    Parameters
    ----------
    path: str or Path
        Location of the file to load the detector from.

    Returns
    -------
    The object stored in the file, as returned by ``joblib.load``.
    """

    # NOTE(security): joblib files are pickle-based; only load files from
    # trusted sources.
    return joblib.load(path)
23 |
24 |
class Detector(ABC):
    """Abstract base class for all detectors.

    Subclasses must implement ``_detect``. ``_fit`` and ``_postprocess`` are
    optional hooks whose default implementations do nothing.
    """

    def __init__(self):
        pass

    def fit(self, data: pd.Series):
        """Set detector parameters based on data.

        Parameters
        ----------
        data: pd.Series
            Normal time series data.

        Returns
        -------
        Detector
            The detector itself, to allow call chaining.
        """
        self._fit(self.validate(data))
        return self

    def _fit(self, data: pd.Series):
        # Hook for subclasses; by default fitting does nothing.
        return self

    def detect(self, data: pd.Series) -> pd.Series:
        """Detect anomalies

        Parameters
        ----------
        data: pd.Series
            Time series data with possible anomalies

        Returns
        -------
        pd.Series
            Time series with bools, True == anomaly
        """
        checked = self.validate(data)
        raw_prediction = self._detect(checked)
        return self._postprocess(raw_prediction)

    def _postprocess(self, pred: pd.Series) -> pd.Series:
        # TODO implement
        return pred

    @abstractmethod
    def _detect(self, data: pd.Series) -> pd.Series:
        """Detect anomalies"""

    def validate(
        self, data: Union[pd.Series, pd.DataFrame]
    ) -> Union[pd.Series, pd.DataFrame]:
        """Return ``data`` unchanged if it is a pandas Series or DataFrame.

        Raises
        ------
        WrongInputDataTypeError
            If ``data`` is of any other type.
        """
        if isinstance(data, (pd.Series, pd.DataFrame)):
            return data
        raise WrongInputDataTypeError()

    def _gradient(
        self, data: Union[pd.Series, pd.DataFrame], periods: int = 1
    ) -> pd.Series:
        """Return the change in ``data`` per second of elapsed index time.

        Raises
        ------
        ValueError
            If two consecutive time stamps are equal or go backwards.
        """
        # Seconds between each sample and the one directly before it.
        # NOTE(review): the time step always spans one sample, even when
        # ``periods`` > 1 - confirm this is intended.
        seconds_between = data.index.to_series().diff().dt.total_seconds()
        if seconds_between.min() < 1e-15:
            raise ValueError("Index must be monotonically increasing")

        return data.diff(periods=periods) / seconds_between

    def __str__(self):
        return f"{self.__class__.__name__}"

    def save(self, path: Union[str, Path]) -> None:
        """Save a detector for later use

        Parameters
        ----------
        path: str or Path
            Location of the file to write the detector to.
        """

        joblib.dump(self, path)
105 |
--------------------------------------------------------------------------------
/tsod/custom_exceptions.py:
--------------------------------------------------------------------------------
class WrongInputDataTypeError(Exception):
    """Raised when input data is not a pandas object accepted by a detector."""

    # The default text names both accepted types: Detector.validate lets
    # pandas.DataFrame through as well as pandas.Series.
    def __init__(
        self, message="Input data must be a pandas.Series or pandas.DataFrame."
    ):
        self.message = message
        super().__init__(self.message)
5 |
6 |
class NotFittedError(Exception):
    """Raised when a detector is used before it has been fitted.

    Parameters
    ----------
    message : str
        Main error text.
    tip : str
        Optional hint appended after the message, separated by one space.
    """

    def __init__(self, message="Please call fit() before detect().", tip=""):
        # Join only the non-empty parts so an empty tip leaves no trailing space.
        self.message = " ".join(part for part in (message, tip) if part)
        super().__init__(self.message)


class NoRangeDefinedError(NotFittedError):
    """Raised when a range-based detector has neither been fitted nor given a range.

    Parameters
    ----------
    message : str
        Hint appended to the standard "not fitted" message.
    """

    def __init__(
        self, message="Or specify min/max range when instantiating detector object."
    ):
        # The text begins with "Or ...", i.e. it continues the base message,
        # so it is passed as the tip instead of replacing the base message.
        super().__init__(tip=message)
18 |
19 |
class InvalidArgumentError(Exception):
    """Raised when an argument does not satisfy a stated requirement.

    The message reads "<argument_name> must be <requirement>." and is also
    available as the ``message`` attribute.
    """

    def __init__(self, argument_name, requirement):
        text = f"{argument_name} must be {requirement}."
        super().__init__(text)
        self.message = text
24 |
25 |
class NotIntegerError(InvalidArgumentError):
    """Raised when an argument that must be an integer is not one."""

    def __init__(self, argument_name):
        super().__init__(argument_name, requirement="an integer")
29 |
30 |
class NonUniqueTimeStampsError(Exception):
    """Raised when several values share the same time stamp."""

    def __init__(self, message="Found multiple values at the same time stamp."):
        super().__init__(message)
        self.message = message
35 |
36 |
class WrongInputSizeError(ValueError):
    """ValueError subclass for input data of an unexpected size."""
39 |
--------------------------------------------------------------------------------
/tsod/detectors.py:
--------------------------------------------------------------------------------
1 | """Simple univariate anomaly detectors"""
2 |
3 | from collections.abc import Sequence
4 | import pandas as pd
5 | import numpy as np
6 |
7 | from .base import Detector
8 |
9 |
class CombinedDetector(Detector, Sequence):
    """Combine detectors.

    It is possible to combine several anomaly detection strategies into a combined detector.
    A sample is reported as an anomaly if at least one member detector flags it.

    Parameters
    ----------
    detectors : iterable of Detector
        Detector instances to combine. Any iterable is accepted; it is copied
        into a list on construction.

    Raises
    ------
    ValueError
        If any item is not a ``Detector`` instance (e.g. a class was passed).

    Examples
    --------
    >>> normal_data = pd.Series(np.random.normal(size=100))
    >>> abnormal_data = pd.Series(np.random.normal(size=100))
    >>> abnormal_data[[2, 6, 15, 57, 60, 73]] = 5

    >>> anomaly_detector = CombinedDetector([RangeDetector(), DiffDetector()])
    >>> anomaly_detector.fit(normal_data)
    >>> detected_anomalies = anomaly_detector.detect(abnormal_data)
    """

    def __init__(self, detectors):
        super().__init__()

        # Materialise once: a one-shot iterable (e.g. a generator) would be
        # exhausted by the validation loop and leave nothing to store.
        detectors = list(detectors)

        for detector in detectors:
            if not isinstance(detector, Detector):
                raise ValueError(
                    f"""{detector} is not a Detector.
                    Did you forget to create an instance, e.g. ConstantValueDetector()?"""
                )

        self._detectors = detectors

    def _fit(self, data):
        """Fit every member detector on the same data."""
        for detector in self._detectors:
            detector.fit(data)
        return self

    def _detect(self, data: pd.Series) -> pd.Series:
        """Return True wherever at least one member detector reports an anomaly."""
        if not self._detectors:
            # Nothing to combine: report "no anomaly" aligned with the input.
            return pd.Series(False, index=data.index)

        all_anomalies = [detector.detect(data) for detector in self._detectors]
        # One column per detector after the transpose; OR across columns.
        data_frame = pd.DataFrame(all_anomalies).T
        return data_frame.any(axis=1)

    def __getitem__(self, index):
        return self._detectors[index]

    def __len__(self):
        return len(self._detectors)
56 |
57 |
class RangeDetector(Detector):
    """
    Detect values outside range.

    Parameters
    ----------
    min_value : float
        Minimum value threshold.
    max_value : float
        Maximum value threshold.
    quantiles : list[2]
        Default quantiles [0, 1]. Same as min and max value.

    Examples
    ---------
    >>> normal_data = pd.Series(np.random.normal(size=100))
    >>> abnormal_data = pd.Series(np.random.normal(size=100))
    >>> abnormal_data[[2, 6, 15, 57, 60, 73]] = 5
    >>> normal_data_with_some_outliers = pd.Series(np.random.normal(size=100))
    >>> normal_data_with_some_outliers[[12, 13, 20, 90]] = 7

    >>> detector = RangeDetector(min_value=0.0, max_value=2.0)
    >>> anomalies = detector.detect(abnormal_data)

    >>> detector = RangeDetector()
    >>> detector.fit(normal_data) # min, max inferred from normal data
    >>> anomalies = detector.detect(abnormal_data)

    >>> detector = RangeDetector(quantiles=[0.001,0.999])
    >>> detector.fit(normal_data_with_some_outliers)
    >>> anomalies = detector.detect(abnormal_data)"""

    def __init__(self, min_value=-np.inf, max_value=np.inf, quantiles=None):
        super().__init__()

        self._min = min_value

        self._max = max_value

        if quantiles is None:
            self._quantiles = [0.0, 1.0]
        else:
            assert 0.0 <= quantiles[0] <= 1.0
            assert 0.0 <= quantiles[1] <= 1.0
            self._quantiles = quantiles

    def _fit(self, data):
        """Set min and max based on data.

        Parameters
        ----------
        data : pd.Series
            Normal time series data.
        """
        super().validate(data)

        # NaN-aware quantiles; min()/max() make the order of the two
        # configured quantiles irrelevant.
        quantiles = np.nanquantile(data, self._quantiles)
        self._min = quantiles.min()
        self._max = quantiles.max()

        assert self._max >= self._min
        return self

    def _detect(self, data: pd.Series) -> pd.Series:
        """Detect anomalies outside range"""

        # A bound of None means that side of the range is not checked.
        if self._max is None:
            return data < self._min

        if self._min is None:
            return data > self._max

        return (data < self._min) | (data > self._max)

    def __str__(self):
        # ``super()`` must be called: ``super.__str__(self)`` addressed the
        # ``super`` type itself and the opening parenthesis was missing.
        return f"{super().__str__()}({self._min}, {self._max})"

    def __repr__(self):
        def _fmt(value):
            # ``_detect`` accepts a None bound, which has no ``.1e`` format.
            return "None" if value is None else f"{value:.1e}"

        return f"{self.__class__.__name__}(min: {_fmt(self._min)}, max: {_fmt(self._max)})"
137 |
138 |
class DiffDetector(Detector):
    """Detect sudden shifts in data. Irrespective of time axis.

    Parameters
    ----------
    max_diff : float
        Maximum change threshold.
    direction: str
        positive, negative or both, default='both'

    Raises
    ------
    ValueError
        If ``direction`` is not one of 'both', 'positive' or 'negative'.

    See also
    --------
    GradientDetector: similar functionality but considers actual time between data points
    """

    def __init__(self, max_diff=np.inf, direction="both"):
        super().__init__()
        self._max_diff = max_diff

        valid_directions = ("both", "positive", "negative")
        if direction in valid_directions:
            self._direction = direction
        else:
            raise ValueError(
                f"Selected direction, '{direction}' is not a valid direction. Valid directions are: {valid_directions}"
            )

    def _fit(self, data):
        """Set the threshold to the largest change seen in the normal data.

        The change is measured the same way ``_detect`` compares it for the
        configured direction, so the fitted data itself is never flagged.
        """
        data_diff = data.diff()

        if self._direction == "both":
            # _detect compares |diff|, so large drops count as well.
            self._max_diff = data_diff.abs().max()
        elif self._direction == "positive":
            self._max_diff = data_diff.max()
        else:
            # _detect compares diff < -max_diff, so use the largest drop.
            self._max_diff = (-data_diff).max()
        return self

    def _detect(self, data: pd.Series) -> pd.Series:
        """Flag samples whose change from the previous sample exceeds the threshold."""
        if self._direction == "both":
            return np.abs(data.diff()) > self._max_diff
        elif self._direction == "positive":
            return data.diff() > self._max_diff
        else:
            return data.diff() < -self._max_diff

    def __str__(self):
        return (
            f"{self.__class__.__name__}({self._max_diff}, direction:{self._direction})"
        )
184 |
185 |
class RollingStandardDeviationDetector(Detector):
    """Detect large variations

    Parameters
    ----------
    window_size: int
        Number of data points to evaluate over, default=10
    max_std: float
        Maximum standard deviation to accept as normal, default=np.inf
    center: bool
        Center rolling window, default=True
    """

    def __init__(self, window_size=10, max_std=np.inf, center=True):
        super().__init__()
        self._window_size = window_size
        self._max_std = max_std
        self._center = center

    def _fit(self, data):
        """Set ``max_std`` to the largest rolling standard deviation in the data."""
        self._max_std = data.rolling(self._window_size).std().max()

        return self

    def _detect(self, data: pd.Series) -> pd.Series:
        """Flag samples whose rolling standard deviation exceeds ``max_std``."""
        anomalies = (
            data.rolling(self._window_size, center=self._center).std() > self._max_std
        )
        # Never flag the first sample. Positional indexing is required here:
        # label-based ``anomalies[0]`` hits every row labelled 0 and would
        # enlarge an empty result.
        if len(anomalies) > 0:
            anomalies.iloc[0] = False
        return anomalies

    def __str__(self):
        return f"{self.__class__.__name__}(window_size:{self._window_size}, max_std:{self._max_std})"
220 |
221 |
class ConstantValueDetector(Detector):
    """
    Detect constant values over a longer period.

    Commonly caused by sensor failures, which get stuck at a constant level.

    Parameters
    ----------
    window_size: int
        Number of consecutive data points that must be constant, default=3
    threshold: float
        Largest max-min spread within a window that still counts as
        constant, default=1e-7
    """

    def __init__(self, window_size: int = 3, threshold: float = 1e-7):
        super().__init__()
        self._threshold = threshold
        self._window_size = window_size

    def _fit(self, data):
        """Nothing to learn from data; returns self unchanged."""
        return self

    def _detect(self, data: pd.Series) -> pd.Series:
        """Flag every point that belongs to a window with (near) constant values.

        A centered window is constant when the spread between its largest
        and smallest value is below ``threshold``. Windows containing NaN
        are never flagged, because the rolling max/min is NaN for them.
        """
        centered = data.rolling(self._window_size, center=True)
        # Built-in rolling max/min give the same result as applying
        # np.nanmax/np.nanmin per window (a window with missing values
        # yields NaN either way), without a Python call per window.
        anomalies = np.abs(centered.max() - centered.min()) < self._threshold

        if len(anomalies) == 0:
            return anomalies

        anomalies.iloc[0] = False  # first element cannot be determined
        anomalies.iloc[-1] = False

        idx = np.where(anomalies)[0]
        n_points = len(anomalies)

        # A flagged center means the whole window is constant, so flag the
        # other members of the window too. A centered window of size w at
        # position i covers [i - w // 2, i + (w - 1) // 2]; for the default
        # w=3 this is the point before and the point after.
        n_before = self._window_size // 2
        n_after = (self._window_size - 1) // 2
        for offset in range(-n_before, n_after + 1):
            if offset == 0:
                continue
            neighbours = idx + offset
            in_range = (neighbours >= 0) & (neighbours < n_points)
            anomalies.iloc[neighbours[in_range]] = True

        return anomalies

    def __str__(self):
        return f"{self.__class__.__name__}({self._window_size}, {self._threshold})"
255 |
256 |
class ConstantGradientDetector(ConstantValueDetector):
    """Detect constant gradients.

    Typically caused by linear interpolation over a long interval.

    Parameters
    ----------
    window_size: int
        Minimum window to consider as anomaly, default 3
    """

    def __init__(self, window_size: int = 3):
        super().__init__(window_size=window_size)

    def _detect(self, data: pd.Series) -> pd.Series:
        """Flag points where the gradient of ``data`` stays constant.

        The constant-value check of the parent class is run on the gradient
        computed with ``periods=1`` and again with ``periods=-1``; a point
        is flagged when either pass flags it.
        """
        flagged_plus = super()._detect(self._gradient(data, periods=1))
        flagged_minus = super()._detect(self._gradient(data, periods=-1))
        return flagged_plus | flagged_minus

    def __str__(self):
        return f"{self.__class__.__name__}({self._window_size})"
280 |
281 |
class GradientDetector(Detector):
    """Detects abrupt changes

    Parameters
    ----------
    max_gradient: float
        Maximum rate of change per second, default np.inf
    direction: str
        positive, negative or both, default='both'

    Raises
    ------
    ValueError
        If ``direction`` is not one of 'both', 'positive' or 'negative'.
    """

    def __init__(self, max_gradient=np.inf, direction="both"):
        super().__init__()
        self._max_gradient = max_gradient

        valid_directions = ("both", "positive", "negative")
        if direction not in valid_directions:
            # Single-line message, same wording as DiffDetector. The former
            # triple-quoted string leaked a newline and the source
            # indentation into the error text.
            raise ValueError(
                f"Selected direction, '{direction}' is not a valid direction. Valid directions are: {valid_directions}"
            )
        self._direction = direction

    def _fit(self, data: pd.Series):
        """Set max gradient based on data."""

        self._max_gradient = np.max(np.abs(self._gradient(data)))
        return self

    def _detect(self, data: pd.Series) -> pd.Series:
        """Flag points whose gradient exceeds ``max_gradient`` in the chosen direction."""
        gradient = self._gradient(data)

        if self._direction == "negative":
            return gradient < -self._max_gradient
        if self._direction == "positive":
            return gradient > self._max_gradient
        return np.abs(gradient) > self._max_gradient

    def __str__(self):
        # Show the threshold scaled by 3600 and labelled "/hr".
        max_grad_hr = self._max_gradient * 3600.0
        return (
            f"{self.__class__.__name__}({max_grad_hr}/hr, direction:{self._direction})"
        )
325 |
--------------------------------------------------------------------------------
/tsod/features.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
4 |
def lag_time_series(time_series: pd.Series, lags):
    """Create lagged time series features.

    Parameters
    ----------
    time_series : pd.Series
    lags : list[int]
        List of lags

    Returns
    -------
    pd.DataFrame
        Lagged time series features, one column per lag. Each column is
        named ``str(lag)`` and shares the index of ``time_series``. When
        ``lags`` is empty the frame has that index and no columns.
    """
    lagged_time_series = {str(lag): time_series.shift(lag) for lag in lags}

    if not lagged_time_series:
        # pd.concat raises on an empty mapping; return an empty feature
        # frame on the same index instead.
        return pd.DataFrame(index=time_series.index)

    return pd.concat(lagged_time_series, axis=1)
24 |
25 |
def create_dataset(X, y, time_steps=1):
    """Build sliding-window samples and the target that follows each window.

    For every start position ``i`` the sample is the ``time_steps`` rows
    ``X.iloc[i : i + time_steps]`` and the target is ``y.iloc[i + time_steps]``.

    Returns
    -------
    tuple of np.ndarray
        The stacked windows and the matching targets, in that order.
    """
    window_starts = range(len(X) - time_steps)

    windows = [X.iloc[start : start + time_steps].values for start in window_starts]
    targets = [y.iloc[start + time_steps] for start in window_starts]

    return np.array(windows), np.array(targets)
33 |
--------------------------------------------------------------------------------
/tsod/hampel.py:
--------------------------------------------------------------------------------
1 | """Hampel detector"""
2 |
3 | import numpy as np
4 | from numba import jit
5 |
6 | from tsod.custom_exceptions import NotIntegerError, InvalidArgumentError
7 | from tsod.detectors import Detector
8 |
9 |
# Scale factor applied to the median absolute deviation (MAD) in the Hampel
# filter below.
# GAUSSIAN_SCALE_FACTOR = k = 1/Phi^(-1)(3/4), where Phi^(-1) is the inverse of
# the standard normal cumulative distribution function.
# Choosing 3/4 as argument makes +-MAD cover 50% of the standard normal
# distribution.

GAUSSIAN_SCALE_FACTOR = 1.4826
14 |
15 |
def _validate_arguments(window_size, threshold):
    """Validate the Hampel detector arguments.

    Raises
    ------
    NotIntegerError
        If ``window_size`` is not an ``int``.
    InvalidArgumentError
        If ``window_size`` is zero or negative, or ``threshold`` is negative.
    """
    if not isinstance(window_size, int):
        raise NotIntegerError("window_size")

    # NOTE(review): the second argument presumably names the property the
    # value must have - confirm against custom_exceptions. The labels were
    # swapped relative to the checks: zero is rejected for window_size
    # (must be positive) but accepted for threshold (must be nonnegative).
    if window_size <= 0:
        raise InvalidArgumentError("window_size", "positive")

    if threshold < 0:
        raise InvalidArgumentError("threshold", "nonnegative")
25 |
26 |
@jit(nopython=True)
def _detect(time_series, window_size, threshold=3, k=GAUSSIAN_SCALE_FACTOR):
    """
    Hampel filter implementation that works on numpy arrays, implemented with numba.

    Parameters
    ----------
    time_series: numpy.ndarray
    window_size: int
        Half-width of the window: the window around element i spans
        [(i - window_size):(i + window_size)] with both ends included, counted in
        number of array elements (as opposed to specify a time span, which is not
        supported by this implementation)
    threshold: float
        The threshold for marking an outlier. A low threshold "narrows" the band within which values are deemed as
        outliers. n_sigmas
    k : float
        Constant scale factor dependent on distribution. Default is normal distribution.

    Returns
    -------
    list of bool
        One flag per element. The first and last ``window_size`` elements
        are never evaluated and stay False.
    """

    is_outlier = [False] * len(time_series)

    for t in range(window_size, (len(time_series) - window_size)):
        # The slice end is exclusive, so add 1 to include the element at
        # t + window_size. Without it the window had an even length and was
        # not centered on t, although the loop bounds already guarantee that
        # t + window_size is a valid index.
        time_series_window = time_series[(t - window_size) : (t + window_size + 1)]
        median_in_window = np.nanmedian(time_series_window)
        mad_in_window = k * np.nanmedian(np.abs(time_series_window - median_in_window))
        absolute_deviation_from_median = np.abs(time_series[t] - median_in_window)
        is_outlier[t] = absolute_deviation_from_median > threshold * mad_in_window

    return is_outlier
61 |
62 |
class HampelDetector(Detector):
    """
    Hampel filter implementation that works on numpy arrays, implemented with numba.

    Parameters
    ----------
    window_size: int
        Half-width of the window around each element, counted in number of array
        elements (as opposed to specify a time span, which is not supported by
        this implementation), default=5
    threshold: float
        The threshold for marking an outlier. A low threshold "narrows" the band within which values are deemed as
        outliers. n_sigmas, default=3.0
    """

    def __init__(self, window_size=5, threshold=3):
        super().__init__()
        _validate_arguments(window_size, threshold)
        self._threshold = threshold
        self._window_size = window_size

    def _detect(self, data):
        """Run the Hampel filter on ``data`` and return a boolean Series.

        The module-level ``_detect`` returns a plain list of flags; wrap it
        in a Series on the index of ``data`` so the result has the same
        form as that of the other detectors.
        """
        # Local import: this module does not import pandas at the top level.
        import pandas as pd

        flags = _detect(data.values, self._window_size, self._threshold)

        return pd.Series(flags, index=data.index, dtype=bool)

    def __str__(self):
        return f"{self.__class__.__name__}({self._window_size}, {self._threshold})"
91 |
--------------------------------------------------------------------------------