├── .gitattributes ├── .github └── workflows │ ├── docs.yml │ ├── python-publish.yml │ └── test.yml ├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── docs ├── Makefile ├── api.rst ├── conf.py ├── design.rst ├── getting_started.rst ├── index.rst └── make.bat ├── images ├── active_learning_app.png ├── anomaly.png └── logo │ ├── tsod.eps │ ├── tsod.png │ └── tsod.svg ├── notebooks ├── Example Water Level.ipynb ├── Getting started.ipynb ├── SMHI_hydrology.ipynb └── cmems.ipynb ├── pyproject.toml ├── tests ├── __init__.py ├── data │ ├── BO_TS_MO_FINO2.nc │ ├── Ballen_20150218-20201222.csv │ ├── combined.joblib │ └── example.csv ├── data_generation.py ├── test_detectors.py └── test_persistence.py └── tsod ├── __init__.py ├── base.py ├── custom_exceptions.py ├── detectors.py ├── features.py └── hampel.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-language=Python -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: 3.12 19 | 20 | - name: Install package 21 | run: | 22 | pip install .[dev] 23 | 24 | - name: Sphinx Build 25 | run: | 26 | cd docs 27 | make html 28 | 29 | - name: Publish to GitHub Pages 30 | uses: peaceiris/actions-gh-pages@v3.6.1 31 | with: 32 | github_token: ${{ secrets.GITHUB_TOKEN }} 33 | publish_dir: docs/_build/html -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a 
Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | workflow_dispatch: 8 | 9 | release: 10 | types: [created] 11 | 12 | jobs: 13 | deploy: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: '3.x' 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install setuptools wheel twine 27 | - name: Build and publish 28 | env: 29 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 30 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 31 | run: | 32 | python setup.py sdist bdist_wheel 33 | twine upload dist/* 34 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Full test 5 | 6 | on: 7 | push: 8 | branches: [ main ] 9 | pull_request: 10 | branches: [ main ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | matrix: 18 | os: [ubuntu-latest, windows-latest] 19 | python-version: ["3.10", "3.13"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - uses: chartboost/ruff-action@v1 # Fail fast if there are any linting errors 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | pip install wheel pytest 
pytest-cov 32 | 33 | - name: Install tsod 34 | run: | 35 | pip install .[test] 36 | - name: Test with pytest 37 | run: | 38 | pytest --cov=tsod tests 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # PyCharm 2 | .idea/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # Sphinx 135 | /doc/_build/ 136 | 137 | data/ -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": [ 3 | "tests" 4 | ], 5 | "python.formatting.provider": "none", 6 | "editor.formatOnSave": true, 7 | "python.testing.unittestEnabled": false, 8 | "python.testing.nosetestsEnabled": false, 9 | "python.testing.pytestEnabled": true, 10 | "restructuredtext.confPath": "${workspaceFolder}\\doc", 11 | "[python]": { 12 | "editor.defaultFormatter": "ms-python.black-formatter" 13 | } 14 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 DHI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without 
limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # tsod: Anomaly Detection for time series data. 4 | 5 | [![Full test](https://github.com/DHI/tsod/actions/workflows/test.yml/badge.svg)](https://github.com/DHI/tsod/actions/workflows/test.yml) 6 | [![PyPI version](https://badge.fury.io/py/tsod.svg)](https://badge.fury.io/py/tsod) 7 | ![Python version](https://img.shields.io/pypi/pyversions/tsod.svg) 8 | 9 | ![univariate](https://raw.githubusercontent.com/DHI/tsod/main/images/anomaly.png) 10 | 11 | Sensors often provide faulty or missing observations. These anomalies must be detected automatically and replaced with more feasible values before feeding the data to numerical simulation engines as boundary conditions or real time decision systems. 12 | 13 | This package aims to provide examples and algorithms for detecting anomalies in time series data specifically tailored to DHI users and the water domain. 
It is simple to install and deploy operationally and is accessible to everyone (open-source). 14 | 15 | ## Getting Started 16 | 17 | * [Documentation](https://dhi.github.io/tsod/getting_started.html) 18 | * [Notebook](https://github.com/DHI/tsod/blob/main/notebooks/Getting%20started.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](http://colab.research.google.com/github/DHI/tsod/blob/main/notebooks/Getting%20started.ipynb) 19 | 20 | 21 | 22 | ## Installation 23 | 24 | `tsod` is a pure Python library and runs on Windows, Linux and Mac. 25 | 26 | From PyPI: 27 | 28 | `pip install tsod` 29 | 30 | Or development version: 31 | 32 | `pip install https://github.com/DHI/tsod/archive/main.zip` 33 | 34 | 38 | 39 | 40 | ## Vision 41 | * A simple and consistent API for anomaly detection of timeseries 42 | * The computational speed will be good for typical timeseries data found in the water domain, to support realtime detection 43 | * It will have a suite of different algorithms ranging from simple rule-based to more advanced based on e.g. neural networks 44 | 45 | ## Definitions 46 | Note that we distinguish between [two types of anomaly detection](https://scikit-learn.org/stable/modules/outlier_detection.html) 47 | 48 | - Outlier detection (unsupervised anomaly detection) 49 | The training data may contain outliers, i.e. observations far from most other observations. Outlier detectors try to concentrate on the observations in the training data that are similar and close together, and ignore observations further away. 50 | 51 | - Novelty detection (semi-supervised anomaly detection) 52 | The training data is considered "normal" and is not polluted by outliers. A new test observation can be categorized as an outlier and is in this context called a novelty. 53 | 54 | 55 | ## Contribute to `tsod` 56 | [Open in Visual Studio Code](https://open.vscode.dev/DHI/tsod) 57 | - Follow PEP8 code style. This is automatically checked during Pull Requests.
58 | 59 | - If citing or re-using other code please make sure their license is also consistent with our policy. 60 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | .. _api: 2 | 3 | API Reference 4 | ================= 5 | 6 | * :class:`tsod.RangeDetector` 7 | * :class:`tsod.ConstantValueDetector` 8 | * :class:`tsod.ConstantGradientDetector` 9 | * :class:`tsod.GradientDetector` 10 | * :class:`tsod.DiffDetector` 11 | * :class:`tsod.CombinedDetector` 12 | 13 | Generic 14 | ------- 15 | .. autoclass:: tsod.RangeDetector 16 | :members: 17 | :undoc-members: 18 | :inherited-members: 19 | 20 | .. autoclass:: tsod.ConstantValueDetector 21 | :members: 22 | :undoc-members: 23 | :inherited-members: 24 | 25 | .. autoclass:: tsod.ConstantGradientDetector 26 | :members: 27 | :undoc-members: 28 | :inherited-members: 29 | 30 | .. autoclass:: tsod.GradientDetector 31 | :members: 32 | :undoc-members: 33 | :inherited-members: 34 | 35 | .. 
autoclass:: tsod.DiffDetector 36 | :members: 37 | :undoc-members: 38 | :inherited-members: 39 | 40 | .. autoclass:: tsod.CombinedDetector 41 | :members: 42 | :undoc-members: 43 | :inherited-members: 44 | 45 | Hampel 46 | ------- 47 | .. autoclass:: tsod.hampel.HampelDetector 48 | :members: 49 | :undoc-members: 50 | :inherited-members: 51 | 52 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = "tsod" 21 | copyright = ( 22 | "2021, Henrik Andersson, Rasmus Halvgaard, Laura Frøhlich, Jesper Mariegaard" 23 | ) 24 | author = "Henrik Andersson, Rasmus Halvgaard, Laura Frøhlich, Jesper Mariegaard" 25 | 26 | # The full version, including alpha/beta/rc tags 27 | release = "" 28 | 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 
35 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon"] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ["_templates"] 39 | 40 | # List of patterns, relative to source directory, that match files and 41 | # directories to ignore when looking for source files. 42 | # This pattern also affects html_static_path and html_extra_path. 43 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 44 | 45 | 46 | # -- Options for HTML output ------------------------------------------------- 47 | 48 | # The theme to use for HTML and HTML Help pages. See the documentation for 49 | # a list of builtin themes. 50 | # 51 | html_theme = "sphinx_book_theme" 52 | 53 | # Add any paths that contain custom static files (such as style sheets) here, 54 | # relative to this directory. They are copied after the builtin static files, 55 | # so a file named "default.css" will overwrite the builtin "default.css". 56 | # html_static_path = ['_static'] 57 | 58 | # autoclass_content = 'both' 59 | -------------------------------------------------------------------------------- /docs/design.rst: -------------------------------------------------------------------------------- 1 | .. _design: 2 | 3 | Design philosophy 4 | ================= 5 | 6 | * Easy to use 7 | * Easy to install 8 | * Easy to get started 9 | * Open Source​ 10 | * Easy to collaborate​ 11 | * Reproducible 12 | * Easy access to new features 13 | 14 | 15 | Easy to use 16 | ----------- 17 | Common operations such as reading a file should only need a few lines of code. 18 | 19 | Make extensive use of existing standard libraries for scientific computing such as numpy, matplotlib and pandas. 20 | 21 | 22 | Easy to install 23 | --------------- 24 | 25 | From PyPI:: 26 | 27 | pip install tsod 28 | 29 | 30 | Easy to get started 31 | ------------------- 32 | By providing many examples to cut/paste from. 
33 | 34 | Examples are available in two forms: 35 | 36 | * `Unit tests `_ 37 | * `Jupyter notebooks `_ 38 | 39 | Open Source 40 | ------------ 41 | 42 | tsod is an open source project licensed under the `MIT license `_. 43 | The software is provided free of charge with the source code available for inspection and modification. 44 | 45 | 46 | Easy to collaborate 47 | -------------------- 48 | 49 | By developing `tsod` on GitHub along with a completely open discussion, we believe that the collaboration between developers and end-users results in a useful library. 50 | 51 | Reproducible 52 | ------------ 53 | 54 | By providing the historical versions of `tsod` on PyPI it is possible to reproduce the behaviour of an older existing system, based on an older version. 55 | 56 | Install specific version:: 57 | 58 | pip install tsod==0.1.2 59 | 60 | 61 | Install development version:: 62 | 63 | pip install https://github.com/DHI/tsod/archive/main.zip 64 | -------------------------------------------------------------------------------- /docs/getting_started.rst: -------------------------------------------------------------------------------- 1 | .. _getting_started: 2 | 3 | Getting started 4 | =============== 5 | 6 | `tsod` is a library for timeseries data. The format of a timeseries is always a :py:class:`~pandas.Series`, in some cases with a :py:class:`~pandas.DatetimeIndex`. 7 | 8 | 1. Get data in the form of a :py:class:`~pandas.Series` (see Data formats below) 9 | 2. Select one or more detectors e.g. :class:`RangeDetector <tsod.RangeDetector>` or :class:`ConstantValueDetector <tsod.ConstantValueDetector>` 10 | 3. Define parameters (e.g. min/max, max rate of change) or... 11 | 4. Fit parameters based on normal data, i.e. without outliers 12 | 5. Detect outliers in any dataset 13 | 14 | ..
image:: https://colab.research.google.com/assets/colab-badge.svg 15 | :target: http://colab.research.google.com/github/DHI/tsod/blob/main/notebooks/Getting%20started.ipynb 16 | 17 | Example 18 | ------- 19 | 20 | >>> import pandas as pd 21 | >>> from tsod import RangeDetector 22 | >>> rd = RangeDetector(max_value=2.0) 23 | >>> data = pd.Series([0.0, 1.0, 3.0]) # 3.0 is out of range i.e. an anomaly 24 | >>> anom = rd.detect(data) 25 | >>> anom 26 | 0 False 27 | 1 False 28 | 2 True 29 | dtype: bool 30 | >>> data[anom] # get anomalous data 31 | 2 3.0 32 | dtype: float64 33 | >>> data[~anom] # get normal data 34 | 0 0.0 35 | 1 1.0 36 | dtype: float64 37 | >>> 38 | 39 | 40 | Saving and loading 41 | ------------------ 42 | .. code-block:: python 43 | 44 | # save a configured detector 45 | cd = CombinedDetector([ConstantValueDetector(), RangeDetector()]) 46 | cd.fit(normal_data) 47 | cd.save("detector.joblib") 48 | 49 | # ... and then later load it from disk 50 | my_detector = tsod.load("detector.joblib") 51 | my_detector.detect(some_data) 52 | 53 | Data formats 54 | ------------ 55 | 56 | Converting data to a :py:class:`~pandas.Series` 57 | 58 | .. code-block:: python 59 | 60 | import pandas as pd 61 | df = pd.read_csv("mydata.csv", parse_dates=True, index_col=0) 62 | my_series = df['water_level'] 63 | 64 | from mikeio import Dfs0 65 | dfs = Dfs0('simple.dfs0') 66 | df = dfs.to_dataframe() 67 | my_series_2 = df['rainfall'] 68 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. tsod documentation master file, created by 2 | sphinx-quickstart on Thu Mar 25 08:11:16 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. 
image:: https://raw.githubusercontent.com/DHI/tsod/main/images/logo/tsod.png 7 | :width: 600 8 | 9 | tsod: Anomaly Detection for time series data. 10 | ============================================= 11 | 12 | 13 | .. image:: https://raw.githubusercontent.com/DHI/tsod/main/images/anomaly.png 14 | 15 | Sensors often provide faulty or missing observations. These anomalies must be detected automatically and replaced with more feasible values before feeding the data to numerical simulation engines as boundary conditions or real time decision systems. 16 | 17 | This package aims to provide examples and algorithms for detecting anomalies in time series data specifically tailored to DHI users and the water domain. It is simple to install and deploy operationally and is accessible to everyone (open-source). 18 | 19 | .. toctree:: 20 | :maxdepth: 2 21 | :caption: Contents: 22 | :hidden: 23 | 24 | getting_started 25 | design 26 | api 27 | 28 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /images/active_learning_app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DHI/tsod/07669cfb4d92daf28e6ee1495f0090ae4046a57e/images/active_learning_app.png -------------------------------------------------------------------------------- /images/anomaly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DHI/tsod/07669cfb4d92daf28e6ee1495f0090ae4046a57e/images/anomaly.png -------------------------------------------------------------------------------- /images/logo/tsod.eps: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DHI/tsod/07669cfb4d92daf28e6ee1495f0090ae4046a57e/images/logo/tsod.eps -------------------------------------------------------------------------------- /images/logo/tsod.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DHI/tsod/07669cfb4d92daf28e6ee1495f0090ae4046a57e/images/logo/tsod.png -------------------------------------------------------------------------------- /images/logo/tsod.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 16 | 24 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /notebooks/Example Water Level.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 
| "source": [ 7 | "# Load water level data from DMI" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "import pandas as pd\n", 18 | "import numpy as np" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stderr", 28 | "output_type": "stream", 29 | "text": [ 30 | "C:\\Users\\JAN\\AppData\\Local\\Temp/ipykernel_21332/396824514.py:3: DeprecationWarning: `set_matplotlib_formats` is deprecated since IPython 7.23, directly use `matplotlib_inline.backend_inline.set_matplotlib_formats()`\n", 31 | " set_matplotlib_formats('png')\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "from IPython.display import set_matplotlib_formats\n", 37 | "set_matplotlib_formats('png')" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "from tsod.detectors import CombinedDetector, RangeDetector, DiffDetector, RollingStandardDeviationDetector\n", 47 | "from tsod.hampel import HampelDetector" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "file_path = os.path.join(\"..\", \"tests\", \"data\", \"Ballen_20150218-20201222.csv\")\n", 57 | "df = pd.read_csv(file_path, index_col=0, parse_dates=True)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 5, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "data = df.water_level" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 6, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "" 78 | ] 79 | }, 80 | "execution_count": 6, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | }, 84 | { 85 | "data": { 86 | "image/png": "\n", 87 | "text/plain": [ 88 | "
" 89 | ] 90 | }, 91 | "metadata": { 92 | "needs_background": "light" 93 | }, 94 | "output_type": "display_data" 95 | } 96 | ], 97 | "source": [ 98 | "data.plot()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "# Detect anomalies outside manually set range" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 7, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "range_anomalies = RangeDetector(-1, 1.3).detect(data)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 8, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "" 126 | ] 127 | }, 128 | "execution_count": 8, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | }, 132 | { 133 | "data": { 134 | "image/png": "\n", 135 | "text/plain": [ 136 | "
" 137 | ] 138 | }, 139 | "metadata": { 140 | "needs_background": "light" 141 | }, 142 | "output_type": "display_data" 143 | } 144 | ], 145 | "source": [ 146 | "detected = data.to_frame()\n", 147 | "detected[\"anomalies\"] = data[range_anomalies.values]\n", 148 | "detected.plot(style=['-', 'o'], figsize=(8,3), title=f\"Anomalies detected: {range_anomalies.sum()}\")" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 9, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "data_clean = data.copy()\n", 158 | "data_clean[range_anomalies.values] = np.nan" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "# Detect anomalies outside automatically set range" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 10, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "N = 1000\n", 175 | "normal_data, test_data = data[:N], data[N:]" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 11, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "anomaly_detector = CombinedDetector([RangeDetector(), DiffDetector()])\n", 185 | "anomaly_detector.fit(normal_data)\n", 186 | "detected_anomalies = anomaly_detector.detect(test_data)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 12, 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "2015-02-25 12:30:00 False\n", 198 | "2015-02-25 12:40:00 False\n", 199 | "2015-02-25 12:50:00 False\n", 200 | "2015-02-25 13:00:00 False\n", 201 | "2015-02-25 13:20:00 False\n", 202 | "dtype: bool" 203 | ] 204 | }, 205 | "execution_count": 12, 206 | "metadata": {}, 207 | "output_type": "execute_result" 208 | } 209 | ], 210 | "source": [ 211 | "detected_anomalies.head()" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "# Detect peaks" 219 | ] 220 | }, 221 | { 222 | 
"cell_type": "code", 223 | "execution_count": 13, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "detector = RollingStandardDeviationDetector(10, 0.1)\n", 228 | "std_anomalies = detector.detect(data)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 14, 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "data": { 238 | "text/plain": [ 239 | "" 240 | ] 241 | }, 242 | "execution_count": 14, 243 | "metadata": {}, 244 | "output_type": "execute_result" 245 | }, 246 | { 247 | "data": { 248 | "image/png": "\n", 249 | "text/plain": [ 250 | "
" 251 | ] 252 | }, 253 | "metadata": { 254 | "needs_background": "light" 255 | }, 256 | "output_type": "display_data" 257 | } 258 | ], 259 | "source": [ 260 | "detected = data.to_frame()\n", 261 | "detected[\"anomalies\"] = data[std_anomalies.values]\n", 262 | "detected.plot(style=['-', 'o'], figsize=(8,3))" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "# Hampel filter" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "The default threshold of the HampelDetector is 3, which means that a sample that deviates by more than three times the rolling window's standard deviation is marked as an anomaly. **Increasing** the threshold marks **fewer** samples as anomalies, **decreasing** the threshold marks **more**." 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 16, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "detector = HampelDetector(window_size=20, threshold=3)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 17, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "anomalies = detector.detect(data)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 18, 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "data": { 304 | "text/plain": [ 305 | "" 306 | ] 307 | }, 308 | "execution_count": 18, 309 | "metadata": {}, 310 | "output_type": "execute_result" 311 | }, 312 | { 313 | "data": { 314 | "image/png": "\n", 315 | "text/plain": [ 316 | "
" 317 | ] 318 | }, 319 | "metadata": { 320 | "needs_background": "light" 321 | }, 322 | "output_type": "display_data" 323 | } 324 | ], 325 | "source": [ 326 | "detected = data.to_frame()\n", 327 | "detected[\"anomalies\"] = data[anomalies]\n", 328 | "detected.plot(style=['-', 'o'], figsize=(8,3), title=f'Anomalies detected: {sum(anomalies)}')" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [] 337 | } 338 | ], 339 | "metadata": { 340 | "kernelspec": { 341 | "display_name": "Python 3 (ipykernel)", 342 | "language": "python", 343 | "name": "python3" 344 | }, 345 | "language_info": { 346 | "codemirror_mode": { 347 | "name": "ipython", 348 | "version": 3 349 | }, 350 | "file_extension": ".py", 351 | "mimetype": "text/x-python", 352 | "name": "python", 353 | "nbconvert_exporter": "python", 354 | "pygments_lexer": "ipython3", 355 | "version": "3.9.6" 356 | } 357 | }, 358 | "nbformat": 4, 359 | "nbformat_minor": 4 360 | } 361 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [tool.hatch.build] 6 | include = [ 7 | "tsod/**/*", 8 | ] 9 | 10 | [project] 11 | name = "tsod" 12 | version = "0.3.dev0" 13 | description = "Time series anomaly detection." 
14 | authors = [{ name = "Henrik Andersson", email = "jan@dhigroup.com" }] 15 | license = { text = "MIT" } 16 | readme = "README.md" 17 | requires-python = ">=3.10" 18 | dependencies = [ 19 | "pandas>=1.0.0", 20 | "joblib", 21 | "numba", 22 | ] 23 | 24 | classifiers = [ 25 | "License :: OSI Approved :: MIT License", 26 | "Development Status :: 2 - Pre-Alpha", 27 | "Intended Audience :: Science/Research", 28 | "Programming Language :: Python", 29 | "Programming Language :: Python :: 3", 30 | "Programming Language :: Python :: 3.10", 31 | "Programming Language :: Python :: 3.11", 32 | "Programming Language :: Python :: 3.12", 33 | "Operating System :: OS Independent", 34 | "Topic :: Scientific/Engineering", 35 | ] 36 | 37 | [project.optional-dependencies] 38 | dev = [ 39 | "ruff", 40 | "pytest>=6", 41 | "pytest-cov>=4", 42 | "sphinx<7,>=4", 43 | "sphinx-book-theme", 44 | ] 45 | 46 | [project.urls] 47 | "Homepage" = "https://github.com/DHI/tsod" 48 | "Bug Tracker" = "https://github.com/DHI/tsod/issues" 49 | 50 | 51 | 52 | [tool.ruff] 53 | lint.ignore = ["E501", "E741"] -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DHI/tsod/07669cfb4d92daf28e6ee1495f0090ae4046a57e/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/BO_TS_MO_FINO2.nc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DHI/tsod/07669cfb4d92daf28e6ee1495f0090ae4046a57e/tests/data/BO_TS_MO_FINO2.nc -------------------------------------------------------------------------------- /tests/data/combined.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DHI/tsod/07669cfb4d92daf28e6ee1495f0090ae4046a57e/tests/data/combined.joblib 
-------------------------------------------------------------------------------- /tests/data/example.csv: -------------------------------------------------------------------------------- 1 | datetime,value 2 | 2000-01-01 00:00,0 3 | 2000-01-01 01:00,1.152781471 4 | 2000-01-01 02:00,1.450555612 5 | 2000-01-01 03:00,0.958642996 6 | 2000-01-01 04:00,1.174894688 7 | 2000-01-01 05:00,1.873848691 8 | 2000-01-01 06:00,1.84439191 9 | 2000-01-01 07:00,1.042026059 10 | 2000-01-01 08:00,0.787750065 11 | 2000-01-01 09:00,0.748739534 12 | 2000-01-01 10:00,0.821560744 13 | 2000-01-01 11:00,1.225795469 14 | 2000-01-01 12:00,0.957734013 15 | 2000-01-01 13:00,3 16 | 2000-01-01 14:00,3 17 | 2000-01-01 15:00,0.41953078 18 | 2000-01-01 16:00,0.612788648 19 | 2000-01-01 17:00,0.960619748 20 | 2000-01-01 18:00,0.922445351 21 | 2000-01-01 19:00,0.317372902 22 | 2000-01-01 20:00,0.683232022 23 | 2000-01-01 21:00,1.59095915 24 | 2000-01-01 22:00,1.284410756 25 | 2000-01-01 23:00,0.506858535 26 | 2000-01-02 00:00,1 27 | 2000-01-02 01:00,1 28 | 2000-01-02 02:00,1 29 | 2000-01-02 03:00,1 30 | 2000-01-02 04:00,1 31 | 2000-01-02 05:00,1 32 | 2000-01-02 06:00,1 33 | 2000-01-02 07:00,1 34 | 2000-01-02 08:00,1 35 | 2000-01-02 09:00,1 36 | 2000-01-02 10:00,1 37 | 2000-01-02 11:00,1 38 | 2000-01-02 12:00,1 39 | 2000-01-02 13:00,1.586141486 40 | 2000-01-02 14:00,1.314504228 41 | 2000-01-02 15:00,1.201552756 42 | 2000-01-02 16:00,1.445814101 43 | 2000-01-02 17:00,1.162387057 44 | 2000-01-02 18:00,1.066841287 45 | 2000-01-02 19:00,1.00248802 46 | 2000-01-02 20:00,0.503450841 47 | 2000-01-02 21:00,0.872922217 48 | 2000-01-02 22:00,1.580114567 49 | 2000-01-02 23:00,1.045514877 50 | 2000-01-03 00:00,1.012183073 51 | 2000-01-03 01:00,1.425633166 52 | 2000-01-03 02:00,1.191899682 53 | 2000-01-03 03:00,1.267666114 54 | 2000-01-03 04:00,1.061485161 55 | 2000-01-03 05:00,0.665546206 56 | 2000-01-03 06:00,0.424668666 57 | 2000-01-03 07:00,2.5 58 | 2000-01-03 08:00,0.4 59 | 2000-01-03 09:00,1.295534815 60 | 
2000-01-03 10:00,1.514240194 61 | 2000-01-03 11:00,1.083260333 62 | 2000-01-03 12:00,1.1584635 63 | 2000-01-03 13:00,1.679222803 64 | 2000-01-03 14:00,1.081591441 65 | 2000-01-03 15:00,0.454411928 66 | 2000-01-03 16:00,0.937110802 67 | 2000-01-03 17:00,1.020471646 68 | 2000-01-03 18:00,1.285019944 69 | 2000-01-03 19:00,1.450649173 70 | 2000-01-03 20:00,0.937287208 71 | 2000-01-03 21:00,0.506385868 72 | 2000-01-03 22:00,0.921499469 73 | 2000-01-03 23:00,1.081260008 74 | 2000-01-04 00:00,1.015525661 75 | 2000-01-04 01:00,1.319109975 76 | 2000-01-04 02:00,1.187151435 77 | 2000-01-04 03:00,1.267982362 78 | 2000-01-04 04:00,0.896385507 79 | 2000-01-04 05:00,1 80 | 2000-01-04 06:00,0.9 81 | 2000-01-04 07:00,0.8 82 | 2000-01-04 08:00,0.7 83 | 2000-01-04 09:00,0.6 84 | 2000-01-04 10:00,0.5 85 | 2000-01-04 11:00,0.4 86 | 2000-01-04 12:00,0.3 87 | 2000-01-04 13:00,0.2 88 | 2000-01-04 14:00,0.1 89 | 2000-01-04 15:00,0 90 | 2000-01-04 16:00,0.768838193 91 | 2000-01-04 17:00,1.156824441 92 | 2000-01-04 18:00,1.399798202 93 | 2000-01-04 19:00,1.112722702 94 | 2000-01-04 20:00,1.221512379 95 | 2000-01-04 21:00,0.859352212 96 | 2000-01-04 22:00,0.247665553 97 | 2000-01-04 23:00,0.784977135 98 | 2000-01-05 00:00,0.634419463 99 | 2000-01-05 01:00,0.239960571 100 | 2000-01-05 02:00,0.422304927 101 | 2000-01-05 03:00,0.606980415 102 | 2000-01-05 03:59,0.776914226 103 | 2000-01-05 04:59,0.96975411 104 | 2000-01-05 05:59,0.774306839 105 | 2000-01-05 06:59,0.967961138 106 | 2000-01-05 07:59,1.145144565 107 | 2000-01-05 08:59,0.464866706 108 | 2000-01-05 09:59,0.13530199 109 | 2000-01-05 10:59,0.738101172 110 | 2000-01-05 11:59,0.832450078 111 | 2000-01-05 12:59,1.082938156 112 | 2000-01-05 13:59,0.948400949 113 | 2000-01-05 14:59,0.65301421 114 | 2000-01-05 15:59,1.526724574 115 | 2000-01-05 16:59,1.842639141 116 | 2000-01-05 17:59,1.869540157 117 | 2000-01-05 18:59,1.697764564 118 | 2000-01-05 19:59,0.784910422 119 | 2000-01-05 20:59,1.040359074 120 | 2000-01-05 21:59,1.802531081 121 | 
2000-01-05 22:59,0.834522078 122 | 2000-01-05 23:59,0.179125049 123 | 2000-01-06 00:59,1.064403211 124 | 2000-01-06 01:59,1.859795992 125 | 2000-01-06 02:59,1.58075584 126 | 2000-01-06 03:59,1.058748553 127 | 2000-01-06 04:59,0.503596224 128 | 2000-01-06 05:59,0.503728592 129 | 2000-01-06 06:59,0.917595552 130 | 2000-01-06 07:59,1.322622102 131 | 2000-01-06 08:59,0.968904598 132 | 2000-01-06 09:59,1.080349742 133 | 2000-01-06 10:59,1.401408437 134 | 2000-01-06 11:59,1.44165518 135 | 2000-01-06 12:59,1.745179237 136 | -------------------------------------------------------------------------------- /tests/data_generation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def create_random_walk_with_outliers( 5 | n_steps, t0=0, outlier_fraction=0.1, outlier_scale=10, seed=42 6 | ): 7 | """ 8 | Generate a random walk time series with random outlier peaks. 9 | 10 | Parameters 11 | ------------ 12 | n_steps : int 13 | Length of the time series to be generated. 14 | t0 : int 15 | Time series initial value. 16 | outlier_fraction : float 17 | Fraction of outliers to be generated in series [0-1]. 18 | outlier_scale : float 19 | Scalar by which to multiply the RW increment to create an outlier. 20 | seed : int 21 | Random seed 22 | 23 | Returns 24 | ------- 25 | random_walk : np.ndarray 26 | The generated random walk time series with outliers. 27 | outlier_indices : np.ndarray 28 | The indices of the introduced outliers. 
29 | """ 30 | assert 0 <= outlier_fraction <= 1 31 | n_outliers = int(outlier_fraction * n_steps) 32 | 33 | # Simulate random walk 34 | np.random.seed(seed) 35 | possible_steps = [-1, 1] 36 | random_steps = np.random.choice(a=possible_steps, size=n_steps) 37 | random_walk = np.append(t0, random_steps[:-1]).cumsum(axis=0) 38 | 39 | # Add outliers 40 | random_walk_with_outliers = random_walk.copy() 41 | outlier_indices = np.random.randint(0, n_steps, n_outliers) 42 | random_walk_with_outliers[outlier_indices] += ( 43 | random_steps[outlier_indices] * outlier_scale 44 | ) 45 | 46 | return random_walk_with_outliers, sorted(outlier_indices), random_walk 47 | -------------------------------------------------------------------------------- /tests/test_detectors.py: -------------------------------------------------------------------------------- 1 | from tsod.base import Detector 2 | import pytest 3 | import numpy as np 4 | import pandas as pd 5 | import os 6 | 7 | from tsod.custom_exceptions import WrongInputDataTypeError 8 | from tsod.detectors import ( 9 | RangeDetector, 10 | DiffDetector, 11 | CombinedDetector, 12 | RollingStandardDeviationDetector, 13 | ConstantValueDetector, 14 | ConstantGradientDetector, 15 | GradientDetector, 16 | ) 17 | 18 | from tsod.features import create_dataset 19 | from tsod.hampel import HampelDetector 20 | 21 | 22 | from tests.data_generation import create_random_walk_with_outliers 23 | 24 | 25 | @pytest.fixture 26 | def data_series(): 27 | n_steps = 100 28 | ( 29 | time_series_with_outliers, 30 | outlier_indices, 31 | random_walk, 32 | ) = create_random_walk_with_outliers(n_steps) 33 | time = pd.date_range(start="2020", periods=n_steps, freq="1h") 34 | return ( 35 | pd.Series(time_series_with_outliers, index=time), 36 | outlier_indices, 37 | pd.Series(random_walk, index=time), 38 | ) 39 | 40 | 41 | @pytest.fixture 42 | def range_data(): 43 | normal_data = np.array([0, np.nan, 1, 0, 2, np.nan, 3.14, 4]) 44 | abnormal_data = np.array([-1.0, 
np.nan, 2.0, np.nan, 1.0, 0.0, 4.1, 10.0]) 45 | expected_anomalies = np.array([True, False, False, False, False, False, True, True]) 46 | assert len(expected_anomalies) == len(abnormal_data) 47 | return normal_data, abnormal_data, expected_anomalies 48 | 49 | 50 | @pytest.fixture 51 | def range_data_series(range_data): 52 | normal_data, abnormal_data, expected_anomalies = range_data 53 | time = pd.date_range(start="2020", periods=len(normal_data), freq="1h") 54 | return ( 55 | pd.Series(normal_data, index=time), 56 | pd.Series(abnormal_data, index=time), 57 | expected_anomalies, 58 | ) 59 | 60 | 61 | @pytest.fixture 62 | def constant_gradient_data_series(range_data): 63 | normal_data = np.array([0, np.nan, 1, 1.1, 1.4, 1.5555, 3.14, 4]) 64 | abnormal_data = np.array([-1, 2.0, 2.1, 2.2, 2.3, 2.4, 4, 10]) 65 | expected_anomalies = np.array([False, True, True, True, True, True, False, False]) 66 | time = pd.date_range(start="2020", periods=len(normal_data), freq="1h") 67 | return ( 68 | pd.Series(normal_data, index=time), 69 | pd.Series(abnormal_data, index=time), 70 | expected_anomalies, 71 | ) 72 | 73 | 74 | @pytest.fixture 75 | def constant_data_series(range_data): 76 | normal_data = np.array([0, np.nan, 1, 1.1, 1.4, 1.5555, 3.14, 4]) 77 | abnormal_data = np.array([-1, np.nan, 1, 1, 1, 1, 4, 10]) 78 | expected_anomalies = np.array([False, False, True, True, True, True, False, False]) 79 | time = pd.date_range(start="2020", periods=len(normal_data), freq="1h") 80 | return ( 81 | pd.Series(normal_data, index=time), 82 | pd.Series(abnormal_data, index=time), 83 | expected_anomalies, 84 | ) 85 | 86 | 87 | def test_base_detector_exceptions(range_data, range_data_series): 88 | data, _, _ = range_data 89 | data_series, _, _ = range_data_series 90 | 91 | detector = RangeDetector() 92 | pytest.raises(WrongInputDataTypeError, detector.fit, data) 93 | 94 | 95 | def test_range_detector(range_data_series): 96 | data, _, _ = range_data_series 97 | 98 | detector = 
RangeDetector(0, 2) 99 | anomalies = detector.detect(data) 100 | expected_anomalies = [False, False, False, False, False, False, True, True] 101 | assert len(anomalies) == len(data) 102 | assert sum(anomalies) == 2 103 | assert all(expected_anomalies == anomalies) 104 | 105 | 106 | def test_range_detector_autoset(range_data_series): 107 | data, _, _ = range_data_series 108 | 109 | anomalies = RangeDetector(min_value=3).detect(data) 110 | assert sum(anomalies) == 4 111 | 112 | anomalies = RangeDetector(max_value=3).detect(data) 113 | assert sum(anomalies) == 2 114 | 115 | 116 | def test_combined_fit(range_data_series): 117 | normal_data, abnormal_data, labels = range_data_series 118 | cd = CombinedDetector([ConstantValueDetector(), RangeDetector()]) 119 | cd.fit(normal_data) 120 | 121 | anomalies = cd.detect(abnormal_data) 122 | assert all(anomalies == labels) 123 | 124 | 125 | def test_combined_wrong_type(): 126 | with pytest.raises(ValueError): 127 | CombinedDetector([ConstantValueDetector, RangeDetector()]) # 128 | 129 | 130 | def test_combined_access_items(): 131 | 132 | cd = CombinedDetector([ConstantValueDetector(), RangeDetector()]) 133 | 134 | assert isinstance(cd[0], Detector) 135 | assert isinstance(cd[0], ConstantValueDetector) 136 | assert isinstance(cd[1], RangeDetector) 137 | assert isinstance(cd[-1], RangeDetector) 138 | 139 | 140 | def test_range_detector_quantile(): 141 | np.random.seed(42) 142 | train = np.random.normal(size=1000) 143 | test = np.random.normal(size=1000) 144 | 145 | train[42] = -6.5 146 | train[560] = 10.5 147 | 148 | test[142] = -4.5 149 | test[960] = 5.5 150 | 151 | normal_data_incl_two_outliers = pd.Series(train) 152 | test_data = pd.Series(test) 153 | 154 | # all test data is within range of train data, no anomalies detected 155 | nqdetector = RangeDetector().fit(normal_data_incl_two_outliers) 156 | detected_anomalies = nqdetector.detect(test_data) 157 | assert sum(detected_anomalies) == 0 158 | 159 | # exclude extreme values 
160 | detector = RangeDetector(quantiles=[0.001, 0.999]).fit( 161 | normal_data_incl_two_outliers 162 | ) 163 | detected_anomalies = detector.detect(test_data) 164 | assert sum(detected_anomalies) == 2 165 | assert detector._min > normal_data_incl_two_outliers.min() 166 | assert detector._max < normal_data_incl_two_outliers.max() 167 | 168 | 169 | def test_diff_detector_autoset(range_data_series): 170 | normal_data, abnormal_data, expected_anomalies = range_data_series 171 | 172 | detector = DiffDetector().fit(normal_data) 173 | detected_anomalies = detector.detect(abnormal_data) 174 | assert sum(detected_anomalies) == 2 175 | 176 | 177 | def test_combined_detector(): 178 | path_to_tests_super_folder = os.path.abspath(__file__).split("tests")[0] 179 | df = pd.read_csv( 180 | os.path.join(path_to_tests_super_folder, "tests", "data", "example.csv"), 181 | parse_dates=True, 182 | index_col=0, 183 | ) 184 | combined = CombinedDetector( 185 | [ 186 | ConstantValueDetector(), 187 | RangeDetector(max_value=2.0), 188 | ] 189 | ) 190 | 191 | series = df.value 192 | res = combined.detect(series) 193 | 194 | assert isinstance(res, pd.Series) 195 | 196 | 197 | def test_rollingstddev_detector(): 198 | 199 | np.random.seed(42) 200 | normal_data = pd.Series(np.random.normal(scale=1.0, size=1000)) + 10.0 * np.sin( 201 | np.linspace(0, 10, num=1000) 202 | ) 203 | abnormal_data = pd.Series(np.random.normal(scale=2.0, size=100)) 204 | 205 | all_data = pd.concat([normal_data, abnormal_data]) 206 | 207 | detector = RollingStandardDeviationDetector() 208 | anomalies = detector.detect(normal_data) 209 | assert sum(anomalies) == 0 210 | 211 | detector.fit(normal_data) 212 | anomalies = detector.detect(normal_data) 213 | assert sum(anomalies) == 0 214 | 215 | anomalies = detector.detect(all_data) 216 | assert sum(anomalies) > 0 217 | 218 | # Manual specification 219 | detector = RollingStandardDeviationDetector(max_std=2.0) 220 | anomalies = detector.detect(normal_data) 221 | assert 
sum(anomalies) == 0 222 | 223 | anomalies = detector.detect(all_data) 224 | assert sum(anomalies) > 0 225 | 226 | 227 | def test_hampel_detector(data_series): 228 | data_with_anomalies, expected_anomalies_indices, _ = data_series 229 | detector = HampelDetector() 230 | anomalies = detector.detect(data_with_anomalies) 231 | anomalies_indices = np.array(np.where(anomalies)).flatten() 232 | # Validate if the found anomalies are also in the expected anomaly set 233 | # NB Not necessarily all of them 234 | assert all(i in expected_anomalies_indices for i in anomalies_indices) 235 | 236 | 237 | 238 | def test_constant_value_detector(constant_data_series): 239 | good_data, abnormal_data, _ = constant_data_series 240 | 241 | detector = ConstantValueDetector(2, 0.0001) 242 | anomalies = detector.detect(good_data) 243 | 244 | assert len(anomalies) == len(good_data) 245 | assert sum(anomalies) == 0 246 | 247 | detector = ConstantValueDetector(3, 0.0001) 248 | anomalies = detector.detect(abnormal_data) 249 | 250 | assert len(anomalies) == len(abnormal_data) 251 | assert sum(anomalies) == 4 252 | 253 | 254 | def test_constant_gradient_detector(constant_gradient_data_series): 255 | good_data, abnormal_data, _ = constant_gradient_data_series 256 | 257 | detector = ConstantGradientDetector(3) 258 | anomalies = detector.detect(good_data) 259 | 260 | assert len(anomalies) == len(good_data) 261 | assert sum(anomalies) == 0 262 | 263 | detector = ConstantGradientDetector(3) 264 | anomalies = detector.detect(abnormal_data) 265 | 266 | assert len(anomalies) == len(abnormal_data) 267 | assert sum(anomalies) == 5 268 | 269 | 270 | def test_gradient_detector_constant_gradient(constant_gradient_data_series): 271 | good_data, _, _ = constant_gradient_data_series 272 | 273 | detector = GradientDetector(1.0) 274 | anomalies = detector.detect(good_data) 275 | 276 | assert len(anomalies) == len(good_data) 277 | assert sum(anomalies) == 0 278 | 279 | 280 | def 
test_gradient_detector_sudden_jump(): 281 | 282 | normal_data = np.array( 283 | [ 284 | -0.5, 285 | -0.6, 286 | 0.6, 287 | 0.6, 288 | 0.1, 289 | 0.6, 290 | 0.4, 291 | 0.8, 292 | 0.7, 293 | 1.5, 294 | 1.6, 295 | 1.1, 296 | 0.3, 297 | 2.1, 298 | 0.7, 299 | 0.3, 300 | -1.7, 301 | -0.3, 302 | 0.0, 303 | -1.0, 304 | ] 305 | ) 306 | abnormal_data = np.array( 307 | [ 308 | -0.5, 309 | -1.5, 310 | 1.5, 311 | 0.6, 312 | 0.1, 313 | 0.6, 314 | 0.4, 315 | 0.8, 316 | 0.7, 317 | 1.5, 318 | 1.6, 319 | 1.1, 320 | 0.3, 321 | 2.1, 322 | 0.7, 323 | 0.3, 324 | -1.7, 325 | -0.3, 326 | 0.0, 327 | -1.0, 328 | ] 329 | ) 330 | 331 | expected_anomalies = np.repeat(False, len(normal_data)) 332 | expected_anomalies[2] = True 333 | time = pd.date_range(start="2020", periods=len(normal_data), freq="1h") 334 | 335 | normal_data = pd.Series(normal_data, index=time) 336 | abnormal_data = pd.Series(abnormal_data, index=time) 337 | 338 | detector = GradientDetector() 339 | 340 | anomalies = detector.detect(normal_data) 341 | assert sum(anomalies) == 0 342 | 343 | # Default is to accept any gradient 344 | anomalies = detector.detect(abnormal_data) 345 | assert sum(anomalies) == 0 346 | 347 | # Max gradient 2.0/h 348 | detector.fit(normal_data) 349 | anomalies = detector.detect(abnormal_data) 350 | 351 | assert sum(anomalies) == 1 352 | 353 | 354 | def test_create_dataset(data_series): 355 | data_with_anomalies, _, _ = data_series 356 | data_with_anomalies.name = "y" 357 | data = data_with_anomalies.to_frame() 358 | time_steps = 2 359 | predictors, y = create_dataset(data[["y"]], data.y, time_steps) 360 | assert len(y) == len(data) - time_steps 361 | assert predictors.shape[0] == len(data) - time_steps 362 | assert predictors.shape[1] == time_steps 363 | -------------------------------------------------------------------------------- /tests/test_persistence.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tsod 3 | from tsod import 
RangeDetector, ConstantValueDetector, CombinedDetector 4 | 5 | 6 | def test_save_and_load(tmp_path): 7 | 8 | combined = CombinedDetector( 9 | [ 10 | ConstantValueDetector(), 11 | RangeDetector(max_value=2.0), 12 | ] 13 | ) 14 | 15 | path = tmp_path / "combined.joblib" 16 | combined.save(path) 17 | 18 | loaded = tsod.load(path) 19 | 20 | assert isinstance(loaded, CombinedDetector) 21 | 22 | 23 | def test_load(): 24 | path_to_tests_super_folder = os.path.abspath(__file__).split("tests")[0] 25 | filename = os.path.join( 26 | path_to_tests_super_folder, "tests", "data", "combined.joblib" 27 | ) 28 | 29 | loaded = tsod.load(filename) 30 | 31 | assert isinstance(loaded, CombinedDetector) 32 | 33 | 34 | def test_save_and_load_filename(tmpdir): 35 | 36 | combined = CombinedDetector( 37 | [ 38 | ConstantValueDetector(), 39 | RangeDetector(max_value=2.0), 40 | ] 41 | ) 42 | 43 | filename = os.path.join(tmpdir, "combined.joblib") 44 | combined.save(filename) 45 | 46 | loaded = tsod.load(filename) 47 | 48 | assert isinstance(loaded, CombinedDetector) 49 | -------------------------------------------------------------------------------- /tsod/__init__.py: -------------------------------------------------------------------------------- 1 | from .detectors import ( 2 | RangeDetector, 3 | DiffDetector, 4 | ConstantGradientDetector, 5 | GradientDetector, 6 | ConstantValueDetector, 7 | CombinedDetector, 8 | RollingStandardDeviationDetector, 9 | ) 10 | 11 | 12 | from .base import load 13 | 14 | __version__ = "0.2.0" 15 | 16 | __all__ = [ 17 | "RangeDetector", 18 | "DiffDetector", 19 | "ConstantGradientDetector", 20 | "GradientDetector", 21 | "ConstantValueDetector", 22 | "CombinedDetector", 23 | "RollingStandardDeviationDetector", 24 | "load", 25 | ] 26 | -------------------------------------------------------------------------------- /tsod/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing 
import Union 3 | 4 | from pathlib import Path 5 | import joblib 6 | 7 | import pandas as pd 8 | 9 | 10 | from .custom_exceptions import WrongInputDataTypeError 11 | 12 | 13 | def load(path: Union[str, Path]): 14 | """Load a saved model from disk saved with `Detector.save` 15 | 16 | Parameters 17 | ========== 18 | path: str or Path 19 | file-like object to load detector from 20 | """ 21 | 22 | return joblib.load(path) 23 | 24 | 25 | class Detector(ABC): 26 | """Abstract base class for all detectors""" 27 | 28 | def __init__(self): 29 | pass 30 | 31 | def fit(self, data: pd.Series): 32 | """Set detector parameters based on data. 33 | 34 | Parameters 35 | ---------- 36 | data: pd.Series 37 | Normal time series data. 38 | """ 39 | data = self.validate(data) 40 | self._fit(data) 41 | return self 42 | 43 | def _fit(self, data: pd.Series): 44 | # Default implementation is a NoOp 45 | return self 46 | 47 | def detect(self, data: pd.Series) -> pd.Series: 48 | """Detect anomalies 49 | 50 | Parameters 51 | ---------- 52 | data: pd.Series 53 | Time series data with possible anomalies 54 | 55 | Returns 56 | ------- 57 | pd.Series 58 | Time series with bools, True == anomaly 59 | """ 60 | data = self.validate(data) 61 | 62 | pred = self._detect(data) 63 | return self._postprocess(pred) 64 | 65 | def _postprocess(self, pred: pd.Series) -> pd.Series: 66 | # TODO implement 67 | return pred 68 | 69 | @abstractmethod 70 | def _detect(self, data: pd.Series) -> pd.Series: 71 | """Detect anomalies""" 72 | pass 73 | 74 | def validate( 75 | self, data: Union[pd.Series, pd.DataFrame] 76 | ) -> Union[pd.Series, pd.DataFrame]: 77 | """Check that input data is in correct format and possibly adjust""" 78 | if not (isinstance(data, pd.Series) or isinstance(data, pd.DataFrame)): 79 | raise WrongInputDataTypeError() 80 | return data 81 | 82 | def _gradient( 83 | self, data: Union[pd.Series, pd.DataFrame], periods: int = 1 84 | ) -> pd.Series: 85 | dt = 
data.index.to_series().diff().dt.total_seconds() 86 | if dt.min() < 1e-15: 87 | raise ValueError("Index must be monotonically increasing") 88 | 89 | gradient = data.diff(periods=periods) / dt 90 | return gradient 91 | 92 | def __str__(self): 93 | return f"{self.__class__.__name__}" 94 | 95 | def save(self, path: Union[str, Path]) -> None: 96 | """Save a detector for later use 97 | 98 | Parameters 99 | ========== 100 | path: str or Path 101 | file-like object to load detector from 102 | """ 103 | 104 | joblib.dump(self, path) 105 | -------------------------------------------------------------------------------- /tsod/custom_exceptions.py: -------------------------------------------------------------------------------- 1 | class WrongInputDataTypeError(Exception): 2 | def __init__(self, message="Input data must be a pandas.Series."): 3 | self.message = message 4 | super().__init__(self.message) 5 | 6 | 7 | class NotFittedError(Exception): 8 | def __init__(self, message="Please call fit() before detect().", tip=""): 9 | self.message = " ".join([message, tip]) 10 | super().__init__(self.message) 11 | 12 | 13 | class NoRangeDefinedError(NotFittedError): 14 | def __init__( 15 | self, message="Or specify min/max range when instantiating detector object." 16 | ): 17 | super().__init__(message) 18 | 19 | 20 | class InvalidArgumentError(Exception): 21 | def __init__(self, argument_name, requirement): 22 | self.message = f"{argument_name} must be {requirement}." 
23 | super().__init__(self.message) 24 | 25 | 26 | class NotIntegerError(InvalidArgumentError): 27 | def __init__(self, argument_name): 28 | super().__init__(argument_name, "an integer") 29 | 30 | 31 | class NonUniqueTimeStampsError(Exception): 32 | def __init__(self, message="Found multiple values at the same time stamp."): 33 | self.message = message 34 | super().__init__(self.message) 35 | 36 | 37 | class WrongInputSizeError(ValueError): 38 | pass 39 | -------------------------------------------------------------------------------- /tsod/detectors.py: -------------------------------------------------------------------------------- 1 | """Simple univariate anomaly detectors""" 2 | 3 | from collections.abc import Sequence 4 | import pandas as pd 5 | import numpy as np 6 | 7 | from .base import Detector 8 | 9 | 10 | class CombinedDetector(Detector, Sequence): 11 | """Combine detectors. 12 | 13 | It is possible to combine several anomaly detection strategies into a combined detector. 14 | 15 | Examples 16 | -------- 17 | >>> normal_data = pd.Series(np.random.normal(size=100)) 18 | >>> abnormal_data = pd.Series(np.random.normal(size=100)) 19 | >>> abnormal_data[[2, 6, 15, 57, 60, 73]] = 5 20 | 21 | >>> anomaly_detector = CombinedDetector([RangeDetector(), DiffDetector()]) 22 | >>> anomaly_detector.fit(normal_data) 23 | >>> detected_anomalies = anomaly_detector.detect(abnormal_data) 24 | """ 25 | 26 | def __init__(self, detectors): 27 | super().__init__() 28 | 29 | for detector in detectors: 30 | if not isinstance(detector, Detector): 31 | raise ValueError( 32 | f"""{detector} is not a Detector. 33 | Did you forget to create an instance, e.g. 
ConstantValueDetector()?""" 34 | ) 35 | 36 | self._detectors = detectors 37 | 38 | def _fit(self, data): 39 | for detector in self._detectors: 40 | detector.fit(data) 41 | return self 42 | 43 | def _detect(self, data: pd.Series) -> pd.Series: 44 | all_anomalies = [] 45 | for detector in self._detectors: 46 | anom = detector.detect(data) 47 | all_anomalies.append(anom) 48 | data_frame = pd.DataFrame(all_anomalies).T 49 | return data_frame.any(axis=1) 50 | 51 | def __getitem__(self, index): 52 | return self._detectors[index] 53 | 54 | def __len__(self): 55 | return len(self._detectors) 56 | 57 | 58 | class RangeDetector(Detector): 59 | """ 60 | Detect values outside range. 61 | 62 | Parameters 63 | ---------- 64 | min_value : float 65 | Minimum value threshold. 66 | max_value : float 67 | Maximum value threshold. 68 | quantiles : list[2] 69 | Default quantiles [0, 1]. Same as min and max value. 70 | 71 | Examples 72 | --------- 73 | >>> normal_data = pd.Series(np.random.normal(size=100)) 74 | >>> abnormal_data = pd.Series(np.random.normal(size=100)) 75 | >>> abnormal_data[[2, 6, 15, 57, 60, 73]] = 5 76 | >>> normal_data_with_some_outliers = pd.Series(np.random.normal(size=100)) 77 | >>> normal_data_with_some_outliers[[12, 13, 20, 90]] = 7 78 | 79 | >>> detector = RangeDetector(min_value=0.0, max_value=2.0) 80 | >>> anomalies = detector.detect(abnormal_data) 81 | 82 | >>> detector = RangeDetector() 83 | >>> detector.fit(normal_data) # min, max inferred from normal data 84 | >>> anomalies = detector.detect(abnormal_data) 85 | 86 | >>> detector = RangeDetector(quantiles=[0.001,0.999]) 87 | >>> detector.fit(normal_data_with_some_outliers) 88 | >>> anomalies = detector.detect(abnormal_data)""" 89 | 90 | def __init__(self, min_value=-np.inf, max_value=np.inf, quantiles=None): 91 | super().__init__() 92 | 93 | self._min = min_value 94 | 95 | self._max = max_value 96 | 97 | if quantiles is None: 98 | self._quantiles = [0.0, 1.0] 99 | else: 100 | assert 0.0 <= quantiles[0] <= 
1.0 101 | assert 0.0 <= quantiles[1] <= 1.0 102 | self._quantiles = quantiles 103 | 104 | def _fit(self, data): 105 | """Set min and max based on data. 106 | 107 | Parameters 108 | ---------- 109 | data : pd.Series 110 | Normal time series data. 111 | """ 112 | super().validate(data) 113 | 114 | quantiles = np.nanquantile(data, self._quantiles) 115 | self._min = quantiles.min() 116 | self._max = quantiles.max() 117 | 118 | assert self._max >= self._min 119 | return self 120 | 121 | def _detect(self, data: pd.Series) -> pd.Series: 122 | """Detect anomalies outside range""" 123 | 124 | if self._max is None: 125 | return data < self._min 126 | 127 | if self._min is None: 128 | return data > self._max 129 | 130 | return (data < self._min) | (data > self._max) 131 | 132 | def __str__(self): 133 | return f"{super.__str__(self)}{self._min}, {self._max})" 134 | 135 | def __repr__(self): 136 | return f"{self.__class__.__name__}(min: {self._min:.1e}, max: {self._max:.1e})" 137 | 138 | 139 | class DiffDetector(Detector): 140 | """Detect sudden shifts in data. Irrespective of time axis. 141 | 142 | Parameters 143 | ---------- 144 | max_diff : float 145 | Maximum change threshold. 146 | direction: str 147 | positive, negative or both, default='both' 148 | 149 | See also 150 | -------- 151 | GradientDetector: similar functionality but considers actual time between data points 152 | """ 153 | 154 | def __init__(self, max_diff=np.inf, direction="both"): 155 | super().__init__() 156 | self._max_diff = max_diff 157 | 158 | valid_directions = ("both", "positive", "negative") 159 | if direction in valid_directions: 160 | self._direction = direction 161 | else: 162 | raise ValueError( 163 | f"Selected direction, '{direction}' is not a valid direction. 
Valid directions are: {valid_directions}" 164 | ) 165 | 166 | def _fit(self, data): 167 | data_diff = data.diff() 168 | 169 | self._max_diff = data_diff.max() 170 | return self 171 | 172 | def _detect(self, data: pd.Series) -> pd.Series: 173 | if self._direction == "both": 174 | return np.abs(data.diff()) > self._max_diff 175 | elif self._direction == "positive": 176 | return data.diff() > self._max_diff 177 | else: 178 | return data.diff() < -self._max_diff 179 | 180 | def __str__(self): 181 | return ( 182 | f"{self.__class__.__name__}({self._max_diff}, direction:{self._direction})" 183 | ) 184 | 185 | 186 | class RollingStandardDeviationDetector(Detector): 187 | """Detect large variations 188 | 189 | 190 | ---------- 191 | window_size: int 192 | Number of data points to evaluate over, default=10 193 | max_std: float 194 | Maximum standard deviation to accept as normal, default=np.inf 195 | center: bool 196 | Center rolling window, default=True 197 | """ 198 | 199 | def __init__(self, window_size=10, max_std=np.inf, center=True): 200 | super().__init__() 201 | self._window_size = window_size 202 | self._max_std = max_std 203 | self._center = center 204 | 205 | def _fit(self, data): 206 | self._max_std = data.rolling(self._window_size).std().max() 207 | 208 | return self 209 | 210 | def _detect(self, data: pd.Series) -> pd.Series: 211 | anomalies = ( 212 | data.rolling(self._window_size, center=self._center).std() > self._max_std 213 | ) 214 | # anomalies = anomalies.astype(int).diff() > 0 # only take positive edges 215 | anomalies[0] = False # first element cannot be determined by diff 216 | return anomalies 217 | 218 | def __str__(self): 219 | return f"{self.__class__.__name__}(window_size:{self._window_size}, max_std:{self._max_std})" 220 | 221 | 222 | class ConstantValueDetector(Detector): 223 | """ 224 | Detect constant values over a longer period. 225 | 226 | Commonly caused by sensor failures, which get stuck at a constant level. 
227 | """ 228 | 229 | def __init__(self, window_size: int = 3, threshold: float = 1e-7): 230 | super().__init__() 231 | self._threshold = threshold 232 | self._window_size = window_size 233 | 234 | def _fit(self, data): 235 | return self 236 | 237 | def _detect(self, data: pd.Series) -> pd.Series: 238 | rollmax = data.rolling(self._window_size, center=True).apply(np.nanmax) 239 | rollmin = data.rolling(self._window_size, center=True).apply(np.nanmin) 240 | anomalies = np.abs(rollmax - rollmin) < self._threshold 241 | anomalies.iloc[0] = False # first element cannot be determined 242 | anomalies.iloc[-1] = False 243 | idx = np.where(anomalies)[0] 244 | if idx is not None: 245 | # assuming window size = 3 246 | # remove also points before and after each detected anomaly 247 | anomalies.iloc[idx[idx > 0] - 1] = True 248 | maxidx = len(anomalies) - 1 249 | anomalies.iloc[idx[idx < maxidx] + 1] = True 250 | 251 | return anomalies 252 | 253 | def __str__(self): 254 | return f"{self.__class__.__name__}({self._window_size}, {self._threshold})" 255 | 256 | 257 | class ConstantGradientDetector(ConstantValueDetector): 258 | """Detect constant gradients. 259 | 260 | Typically caused by linear interpolation over a long interval. 
261 | 262 | Parameters 263 | ========== 264 | window_size: int 265 | Minium window to consider as anomaly, default 3 266 | """ 267 | 268 | def __init__(self, window_size: int = 3): 269 | super().__init__(window_size=window_size) 270 | 271 | def _detect(self, data: pd.Series) -> pd.Series: 272 | gradient = self._gradient(data, periods=1) 273 | s1 = super()._detect(gradient) 274 | gradient = self._gradient(data, periods=-1) 275 | s2 = super()._detect(gradient) 276 | return s1 | s2 277 | 278 | def __str__(self): 279 | return f"{self.__class__.__name__}({self._window_size})" 280 | 281 | 282 | class GradientDetector(Detector): 283 | """Detects abrupt changes 284 | 285 | Parameters 286 | ========== 287 | max_gradient: float 288 | Maximum rate of change per second, default np.inf 289 | direction: str 290 | positive, negative or both, default='both' 291 | """ 292 | 293 | def __init__(self, max_gradient=np.inf, direction="both"): 294 | super().__init__() 295 | self._max_gradient = max_gradient 296 | valid_directions = ("both", "positive", "negative") 297 | if direction in valid_directions: 298 | self._direction = direction 299 | else: 300 | raise ValueError( 301 | f"""Selected direction, '{direction}' is not a valid direction. 
302 | Valid directions are: {valid_directions}""" 303 | ) 304 | 305 | def _fit(self, data: pd.Series): 306 | """Set max gradient based on data.""" 307 | 308 | self._max_gradient = np.max(np.abs(self._gradient(data))) 309 | return self 310 | 311 | def _detect(self, data: pd.Series) -> pd.Series: 312 | gradient = self._gradient(data) 313 | if self._direction == "negative": 314 | return gradient < -self._max_gradient 315 | elif self._direction == "positive": 316 | return gradient > self._max_gradient 317 | else: 318 | return np.abs(gradient) > self._max_gradient 319 | 320 | def __str__(self): 321 | max_grad_hr = self._max_gradient * 3600.0 322 | return ( 323 | f"{self.__class__.__name__}({max_grad_hr}/hr, direction:{self._direction})" 324 | ) 325 | -------------------------------------------------------------------------------- /tsod/features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | def lag_time_series(time_series: pd.Series, lags): 6 | """Create lagged time series features. 7 | 8 | Parameters 9 | ---------- 10 | time_series : pd.Series 11 | lags : list[int] 12 | List of lags 13 | 14 | Returns 15 | ------- 16 | pd.DataFrame 17 | Lagged time series features. 
18 | """ 19 | lagged_time_series = {} 20 | for lag in lags: 21 | lagged_time_series[str(lag)] = time_series.shift(lag) 22 | 23 | return pd.concat(lagged_time_series, axis=1) 24 | 25 | 26 | def create_dataset(X, y, time_steps=1): 27 | Xs, ys = [], [] 28 | for i in range(len(X) - time_steps): 29 | v = X.iloc[i : (i + time_steps)].values 30 | Xs.append(v) 31 | ys.append(y.iloc[i + time_steps]) 32 | return np.array(Xs), np.array(ys) 33 | -------------------------------------------------------------------------------- /tsod/hampel.py: -------------------------------------------------------------------------------- 1 | """Hampel detector""" 2 | 3 | import numpy as np 4 | from numba import jit 5 | 6 | from tsod.custom_exceptions import NotIntegerError, InvalidArgumentError 7 | from tsod.detectors import Detector 8 | 9 | 10 | # GAUSSIAN_SCALE_FACTOR = k = 1/Phi^(-1)(3/4) 11 | # Choosing 3/4 as argument makes +-MAD cover 50% of the standard normal cumulative distribution function. 12 | 13 | GAUSSIAN_SCALE_FACTOR = 1.4826 14 | 15 | 16 | def _validate_arguments(window_size, threshold): 17 | if not isinstance(window_size, int): 18 | raise NotIntegerError("window_size") 19 | else: 20 | if window_size <= 0: 21 | raise InvalidArgumentError("window_size", "nonnegative") 22 | 23 | if threshold < 0: 24 | raise InvalidArgumentError("threshold", "positive") 25 | 26 | 27 | @jit(nopython=True) 28 | def _detect(time_series, window_size, threshold=3, k=GAUSSIAN_SCALE_FACTOR): 29 | """ 30 | Hampel filter implementation that works on numpy arrays, implemented with numba. 31 | 32 | Parameters 33 | ---------- 34 | time_series: numpy.ndarray 35 | window_size: int 36 | The window range is from [(i - window_size):(i + window_size)], so window_size is half of the 37 | window, counted in number of array elements (as opposed to specify a time span, which is not 38 | supported by this implementation) 39 | threshold: float 40 | The threshold for marking an outlier. 
A low threshold "narrows" the band within which values are deemed as 41 | outliers. n_sigmas 42 | k : float 43 | Constant scale factor dependent on distribution. Default is normal distribution. 44 | """ 45 | 46 | # time_series_clean = time_series.copy() 47 | # outlier_indices = [] 48 | is_outlier = [False] * len(time_series) 49 | 50 | for t in range(window_size, (len(time_series) - window_size)): 51 | time_series_window = time_series[(t - window_size) : (t + window_size)] 52 | median_in_window = np.nanmedian(time_series_window) 53 | mad_in_window = k * np.nanmedian(np.abs(time_series_window - median_in_window)) 54 | absolute_deviation_from_median = np.abs(time_series[t] - median_in_window) 55 | is_outlier[t] = absolute_deviation_from_median > threshold * mad_in_window 56 | # if is_outlier[t]: 57 | # outlier_indices.append(t) 58 | # time_series_clean[t] = median_in_window 59 | 60 | return is_outlier 61 | 62 | 63 | class HampelDetector(Detector): 64 | """ 65 | Hampel filter implementation that works on numpy arrays, implemented with numba. 66 | 67 | Parameters 68 | ---------- 69 | window_size: int 70 | The window range is from [(i - window_size):(i + window_size)], so window_size is half of the 71 | window, counted in number of array elements (as opposed to specify a time span, which is not 72 | supported by this implementation) 73 | threshold: float 74 | The threshold for marking an outlier. A low threshold "narrows" the band within which values are deemed as 75 | outliers. 
n_sigmas, default=3.0 76 | """ 77 | 78 | def __init__(self, window_size=5, threshold=3): 79 | super().__init__() 80 | _validate_arguments(window_size, threshold) 81 | self._threshold = threshold 82 | self._window_size = window_size 83 | 84 | def _detect(self, data): 85 | anomalies = _detect(data.values, self._window_size, self._threshold) 86 | 87 | return anomalies 88 | 89 | def __str__(self): 90 | return f"{self.__class__.__name__}({self._window_size}, {self._threshold})" 91 | --------------------------------------------------------------------------------