├── .coveragerc ├── .gitattributes ├── .github └── workflows │ └── pythonpublish.yml ├── .gitignore ├── .isort.cfg ├── .readthedocs.yml ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── .gitignore ├── Makefile ├── README.md ├── _static │ ├── arundo_logo.png │ └── arundo_logo_black.png ├── api │ ├── aggregators.rst │ ├── data.rst │ ├── detectors.rst │ ├── metrics.rst │ ├── modules.rst │ ├── pipe.rst │ ├── transformers.rst │ └── visualization.rst ├── conf.py ├── developer.rst ├── examples.rst ├── images │ ├── cyclic.png │ ├── level_shift.png │ ├── level_shift_double_rolling.png │ ├── local_spike.png │ ├── missing_data.png │ ├── non_zeros_count.png │ ├── quickstart0.png │ ├── quickstart1.png │ ├── quickstart2.png │ ├── quickstart3.png │ ├── restart.png │ ├── seasonal.png │ ├── spike.png │ ├── split_1.png │ ├── split_2.png │ ├── split_3.png │ ├── split_4.png │ └── volatility_shift_double_rolling.png ├── index.rst ├── inheritance.rst ├── install.rst ├── notebooks │ ├── data │ │ ├── autoregression.csv │ │ ├── cpu.csv │ │ ├── gaussian2d.csv │ │ ├── generator.csv │ │ ├── invalid_series.csv │ │ ├── pressure.csv │ │ ├── price_long.csv │ │ ├── price_short.csv │ │ ├── pricing.csv │ │ ├── quickstart │ │ │ ├── known_anomalies.csv │ │ │ ├── testing.csv │ │ │ └── training.csv │ │ ├── seasonal+trend.csv │ │ ├── seasonal.csv │ │ ├── seismic.csv │ │ ├── sin.csv │ │ └── temperature.csv │ ├── demo.ipynb │ └── quickstart.ipynb ├── quickstart.rst ├── releasehistory.rst ├── requirements-docs.txt └── userguide.rst ├── mypy.ini ├── pyproject.toml ├── setup.cfg ├── setup.py ├── src └── adtk │ ├── __init__.py │ ├── _aggregator_base.py │ ├── _base.py │ ├── _detector_base.py │ ├── _transformer_base.py │ ├── _utils.py │ ├── aggregator │ ├── __init__.py │ └── _aggregator.py │ ├── data │ ├── __init__.py │ └── _data.py │ ├── detector │ ├── __init__.py │ ├── _detector_1d.py │ └── _detector_hd.py │ ├── metrics │ ├── __init__.py │ └── _metrics.py │ ├── pipe │ ├── __init__.py │ └── _pipe.py │ ├── transformer │ ├── __init__.py │ ├── _transformer_1d.py │ └── _transformer_hd.py │ └── visualization │ ├── __init__.py │ └── _visualization.py ├── tests ├── test_aggregators.py ├── test_attribute.py ├── test_data_validation.py ├── test_detector1d.py ├── test_detectorhd.py ├── test_expand_events.py ├── test_few_shot_fit.py ├── test_few_shot_predict.py ├── test_inconsistent_train_test.py ├── test_label_list_convert.py ├── test_metric.py ├── test_pipe.py ├── test_print_subclasses.py ├── test_series_name.py ├── test_train_test_split.py ├── test_transformer1d.py ├── test_transformerhd.py └── test_visualization.ipynb └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | omit = src/adtk/visualization/* 3 | show_missing = True 4 | exclude_lines = 5 | pragma: no cover 6 | raise 7 | warnings.warn 8 | pass 9 | @property 10 | @overload 11 | def plot_flowchart -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | tests/*.ipynb linguist-documentation -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v1 12 | - name: 
Set up Python 13 | uses: actions/setup-python@v1 14 | with: 15 | python-version: '3.x' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: __token__ 23 | TWINE_PASSWORD: ${{ secrets.ARUNDO_PYPI_TOKEN }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 90 | # install all needed dependencies. 
91 | #Pipfile.lock 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | multi_line_output=3 3 | include_trailing_comma=True 4 | force_grid_wrap=0 5 | use_parentheses=True 6 | line_length=79 -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sphinx: 4 | configuration: docs/conf.py 5 | 6 | python: 7 | version: 3.7 8 | install: 9 | - requirements: docs/requirements-docs.txt 10 | - method: pip 11 | path: . 12 | extra_requirements: 13 | - doc 14 | system_packages: true 15 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: python 3 | python: 4 | - "3.5.2" 5 | - "3.6" 6 | - "3.7" 7 | - "3.8" 8 | 9 | install: pip install tox-travis 10 | 11 | script: 12 | - tox 13 | 14 | after_success: 15 | - pip install -e .[test,dev] 16 | - pytest --cov=adtk --cov-config=.coveragerc 17 | - coveralls 18 | - black --check ./src/adtk 19 | - black --check ./tests 20 | - isort --check-only -rc ./src/adtk 21 | - isort --check-only -rc ./tests 22 | 23 | branches: 24 | only: 25 | - master 26 | - develop 27 | 28 | notifications: 29 | email: false 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Anomaly Detection Toolkit (ADTK) 2 | 3 | [![Build Status](https://travis-ci.com/arundo/adtk.svg?branch=master)](https://travis-ci.com/arundo/adtk) 4 | [![Documentation Status](https://readthedocs.org/projects/adtk/badge/?version=stable)](https://adtk.readthedocs.io/en/stable) 5 | [![Coverage Status](https://coveralls.io/repos/github/arundo/adtk/badge.svg?branch=master&service=github)](https://coveralls.io/github/arundo/adtk?branch=master) 6 | [![PyPI](https://img.shields.io/pypi/v/adtk)](https://pypi.org/project/adtk/) 7 | [![Downloads](https://pepy.tech/badge/adtk)](https://pepy.tech/project/adtk) 8 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 9 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/arundo/adtk/master?filepath=docs%2Fnotebooks%2Fdemo.ipynb) 10 | 11 | Anomaly Detection Toolkit (ADTK) is a Python package for unsupervised / 12 | rule-based time series anomaly detection. 13 | 14 | As the nature of anomaly varies over different cases, a model may not work 15 | universally for all anomaly detection problems. 
Choosing and combining 16 | detection algorithms (detectors), feature engineering methods (transformers), 17 | and ensemble methods (aggregators) properly is the key to build an effective 18 | anomaly detection model. 19 | 20 | This package offers a set of common detectors, transformers and aggregators 21 | with unified APIs, as well as pipe classes that connect them together into 22 | models. It also provides some functions to process and visualize time series 23 | and anomaly events. 24 | 25 | See https://adtk.readthedocs.io for complete documentation. 26 | 27 | ## Installation 28 | 29 | Prerequisites: Python 3.5 or later. 30 | 31 | It is recommended to install the most recent **stable** release of ADTK from PyPI. 32 | 33 | ```shell 34 | pip install adtk 35 | ``` 36 | 37 | Alternatively, you could install from source code. This will give you the **latest**, but unstable, version of ADTK. 38 | 39 | ```shell 40 | git clone https://github.com/arundo/adtk.git 41 | cd adtk/ 42 | git checkout develop 43 | pip install ./ 44 | ``` 45 | 46 | ## Examples 47 | 48 | Please see [Quick Start](https://adtk.readthedocs.io/en/stable/quickstart.html) for a simple example. 49 | 50 | For more detailed examples of each module of ADTK, please refer to 51 | [Examples](https://adtk.readthedocs.io/en/stable/examples.html) 52 | section in the documentation or [an interactive demo notebook](https://mybinder.org/v2/gh/arundo/adtk/master?filepath=docs%2Fnotebooks%2Fdemo.ipynb). 53 | 54 | ## Contributing 55 | 56 | Pull requests are welcome. For major changes, please open an issue first to 57 | discuss what you would like to change. 58 | 59 | Please make sure to update unit tests as appropriate. 60 | 61 | Please see [Contributing](https://adtk.readthedocs.io/en/stable/developer.html) for more details. 62 | 63 | 64 | ## License 65 | 66 | ADTK is licensed under the Mozilla Public License 2.0 (MPL 2.0). See the 67 | [LICENSE](LICENSE) file for details. 68 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = ADTK 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # ADTK Documentation 2 | 3 | 1. Install necessary sphinx packages if they are not installed yet. 4 | ``` bash 5 | $ pip install -r requirements-docs.txt 6 | ``` 7 | 8 | 2. Build documentation. 9 | ```bash 10 | $ make html 11 | ``` 12 | 13 | 3. Now you may open documentation by opening `_build/html/index.html` in your browser. 
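Optionally, you can serve the built documentation over HTTP instead of opening the file directly. A minimal sketch, assuming Python 3 is available on your PATH (it uses only the standard-library `http.server` module):
```bash
$ cd _build/html
$ python -m http.server 8000
```
Then point your browser at `http://localhost:8000`.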
14 | -------------------------------------------------------------------------------- /docs/_static/arundo_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/_static/arundo_logo.png -------------------------------------------------------------------------------- /docs/_static/arundo_logo_black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/_static/arundo_logo_black.png -------------------------------------------------------------------------------- /docs/api/aggregators.rst: -------------------------------------------------------------------------------- 1 | Aggregators 2 | =========== 3 | .. automodule:: adtk.aggregator 4 | :members: 5 | :inherited-members: 6 | -------------------------------------------------------------------------------- /docs/api/data.rst: -------------------------------------------------------------------------------- 1 | Data 2 | =========== 3 | 4 | .. automodule:: adtk.data 5 | :members: 6 | :inherited-members: 7 | -------------------------------------------------------------------------------- /docs/api/detectors.rst: -------------------------------------------------------------------------------- 1 | Detectors 2 | ========= 3 | .. automodule:: adtk.detector 4 | :members: 5 | :inherited-members: 6 | -------------------------------------------------------------------------------- /docs/api/metrics.rst: -------------------------------------------------------------------------------- 1 | Metrics 2 | =========== 3 | 4 | .. automodule:: adtk.metrics 5 | :members: 6 | :inherited-members: 7 | -------------------------------------------------------------------------------- /docs/api/modules.rst: -------------------------------------------------------------------------------- 1 | Modules 2 | ====================================================================== 3 | 4 | .. toctree:: 5 | :maxdepth: 3 6 | 7 | detectors 8 | transformers 9 | aggregators 10 | pipe 11 | data 12 | metrics 13 | visualization -------------------------------------------------------------------------------- /docs/api/pipe.rst: -------------------------------------------------------------------------------- 1 | Pipeline and Pipenet 2 | ==================== 3 | 4 | .. automodule:: adtk.pipe 5 | :members: 6 | :inherited-members: 7 | -------------------------------------------------------------------------------- /docs/api/transformers.rst: -------------------------------------------------------------------------------- 1 | Transformers 2 | ============ 3 | .. automodule:: adtk.transformer 4 | :members: 5 | :inherited-members: 6 | -------------------------------------------------------------------------------- /docs/api/visualization.rst: -------------------------------------------------------------------------------- 1 | Visualization 2 | ============= 3 | 4 | .. automodule:: adtk.visualization 5 | :members: 6 | :inherited-members: 7 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # ADTK documentation build configuration file, created by 5 | # sphinx-quickstart on Wed May 2 11:26:20 2018. 
6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | 23 | import sphinx_rtd_theme 24 | 25 | sys.path.insert(0, os.path.abspath("..")) 26 | 27 | 28 | # -- General configuration ------------------------------------------------ 29 | 30 | # If your documentation needs a minimal Sphinx version, state it here. 31 | # 32 | # needs_sphinx = '1.0' 33 | 34 | # Add any Sphinx extension module names here, as strings. They can be 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 36 | # ones. 37 | extensions = [ 38 | "sphinx.ext.autodoc", 39 | "sphinx.ext.todo", 40 | "sphinx.ext.mathjax", 41 | "sphinx.ext.viewcode", 42 | "sphinx.ext.napoleon", 43 | "nbsphinx", 44 | "sphinx.ext.autodoc.typehints", 45 | ] 46 | 47 | autodoc_typehints = "description" 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ["_templates"] 51 | 52 | # The suffix(es) of source filenames. 53 | # You can specify multiple suffix as a list of string: 54 | # 55 | # source_suffix = ['.rst', '.md'] 56 | source_suffix = ".rst" 57 | 58 | # The master toctree document. 59 | master_doc = "index" 60 | 61 | # General information about the project. 62 | project = "ADTK" 63 | copyright = "2019-2020, Arundo Analytics, Inc." 64 | author = "Arundo Analytics, Inc" 65 | 66 | # The version info for the project you're documenting, acts as replacement for 67 | # |version| and |release|, also used in various other places throughout the 68 | # built documents. 69 | # 70 | # The short X.Y version. 71 | version = "0.6" 72 | # The full version, including alpha/beta/rc tags. 73 | release = "0.6.2" 74 | 75 | # The language for content autogenerated by Sphinx. Refer to documentation 76 | # for a list of supported languages. 77 | # 78 | # This is also used if you do content translation via gettext catalogs. 79 | # Usually you set "language" from the command line for these cases. 80 | language = None 81 | 82 | # List of patterns, relative to source directory, that match files and 83 | # directories to ignore when looking for source files. 84 | # This patterns also effect to html_static_path and html_extra_path 85 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints"] 86 | 87 | # The name of the Pygments (syntax highlighting) style to use. 88 | pygments_style = "sphinx" 89 | 90 | # If true, `todo` and `todoList` produce output, else they produce nothing. 91 | todo_include_todos = True 92 | 93 | 94 | # -- Options for HTML output ---------------------------------------------- 95 | 96 | # The theme to use for HTML and HTML Help pages. See the documentation for 97 | # a list of builtin themes. 98 | # 99 | # html_theme = 'sphinxdoc' 100 | html_theme = "sphinx_rtd_theme" 101 | 102 | # Theme options are theme-specific and customize the look and feel of a theme 103 | # further. For a list of options available for each theme, see the 104 | # documentation. 
105 | # 106 | html_theme_options = {"logo_only": True} 107 | 108 | html_logo = "_static/arundo_logo_black.png" 109 | 110 | # Add any paths that contain custom static files (such as style sheets) here, 111 | # relative to this directory. They are copied after the builtin static files, 112 | # so a file named "default.css" will overwrite the builtin "default.css". 113 | html_static_path = ["_static"] 114 | 115 | # Custom sidebar templates, must be a dictionary that maps document names 116 | # to template names. 117 | # 118 | # This is required for the alabaster theme 119 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 120 | # html_sidebars = { 121 | # '**': [ 122 | # 'about.html', 123 | # 'navigation.html', 124 | # 'relations.html', # needs 'show_related': True theme option to display 125 | # 'searchbox.html', 126 | # 'donate.html', 127 | # ] 128 | # } 129 | 130 | 131 | # -- Options for HTMLHelp output ------------------------------------------ 132 | 133 | # Output file base name for HTML help builder. 134 | htmlhelp_basename = "ADTKdoc" 135 | 136 | 137 | # -- Options for LaTeX output --------------------------------------------- 138 | 139 | latex_elements = { 140 | # The paper size ('letterpaper' or 'a4paper'). 141 | # 142 | # 'papersize': 'letterpaper', 143 | # The font size ('10pt', '11pt' or '12pt'). 144 | # 145 | # 'pointsize': '10pt', 146 | # Additional stuff for the LaTeX preamble. 147 | # 148 | # 'preamble': '', 149 | # Latex figure (float) alignment 150 | # 151 | # 'figure_align': 'htbp', 152 | } 153 | 154 | # Grouping the document tree into LaTeX files. List of tuples 155 | # (source start file, target name, title, 156 | # author, documentclass [howto, manual, or own class]). 157 | latex_documents = [ 158 | ( 159 | master_doc, 160 | "ADTK.tex", 161 | "ADTK Documentation", 162 | "Arundo Analytics", 163 | "manual", 164 | ) 165 | ] 166 | 167 | 168 | # -- Options for manual page output --------------------------------------- 169 | 170 | # One entry per manual page. List of tuples 171 | # (source start file, name, description, authors, manual section). 172 | man_pages = [(master_doc, "ADTK", "ADTK Documentation", [author], 1)] 173 | 174 | 175 | # -- Options for Texinfo output ------------------------------------------- 176 | 177 | # Grouping the document tree into Texinfo files. List of tuples 178 | # (source start file, target name, title, author, 179 | # dir menu entry, description, category) 180 | texinfo_documents = [ 181 | ( 182 | master_doc, 183 | "ADTK", 184 | "ADTK Documentation", 185 | author, 186 | "ADTK", 187 | "One line description of project.", 188 | "Miscellaneous", 189 | ) 190 | ] 191 | 192 | 193 | autodoc_member_order = "bysource" 194 | -------------------------------------------------------------------------------- /docs/developer.rst: -------------------------------------------------------------------------------- 1 | .. _developer: 2 | 3 | ************ 4 | Contributing 5 | ************ 6 | 7 | - `I have a question/suggestion`_ 8 | - `I found a bug`_ 9 | - `I want to develop a new detector/transformer/aggregator`_ 10 | - `The inheritance relationship between model classes is confusing`_ 11 | - `Formatter and linter`_ 12 | - `Unit test`_ 13 | - `Documentation`_ 14 | - `My pull request is ready`_ 15 | - `How are branches and releases managed?`_ 16 | 17 | ---------- 18 | 19 | I have a question/suggestion 20 | ============================ 21 | Please open a new issue. For questions, please use label **question**. 
For suggestions, please use label **enhancement**. 22 | 23 | I found a bug 24 | ============= 25 | Please check first whether the bug has already been reported in `Issues `_ or `Pull requests `_. 26 | 27 | If not, please open a new issue with label **bug**. We do not enforce an issue template for now, but we recommend that a bug issue include a description of the bug, the configuration of your Python environment, and code that reproduces the bug. 28 | 29 | If you already know how the problem could be fixed, you are more than welcome to open a pull request with label **bug** and fix it. Again, we do not enforce a PR template for now, but we recommend you follow best practices. A unit test is required to cover the found bug. Rules on merging a PR are in `My pull request is ready`_. 30 | 31 | 32 | I want to develop a new detector/transformer/aggregator 33 | ======================================================= 34 | Adding a new detector/transformer/aggregator is usually a task requiring a significant time commitment. Therefore, we want to discuss the necessity of the proposed new component with you first. Please open a new issue with label **enhancement**. Please do NOT open a PR until the implementation plan has been discussed thoroughly. 35 | 36 | 37 | The inheritance relationship between model classes is confusing 38 | =============================================================== 39 | Yes, it is somewhat confusing, but we think it is logical and minimizes duplication of reusable code. 40 | You may see :ref:`inheritance` for the full relationship. 41 | 42 | Formatter and linter 43 | ==================== 44 | `Black `_ v19.3b0 is the required formatter of ADTK. 45 | We require a maximal line length of **79** characters in ADTK, which differs from the default value in Black. 46 | A configuration file `pyproject.toml` is included with this setting. 47 | 48 | `isort `_ v4.3.21 is also required to sort imports in ADTK. 49 | A Black-compatible configuration is included in `.isort.cfg`. 50 | 51 | You may install the required versions of `Black` and `isort` along with ADTK using extra **dev**. 52 | 53 | .. code-block:: console 54 | 55 | $ pip install adtk[dev] 56 | 57 | We recommend `Pylint `_ and/or `flake8 `_ as the Python linter. 58 | 59 | Unit test 60 | ========= 61 | `pytest `_ is the required unit test framework of ADTK. 62 | Unit test coverage is checked by `Coverage.py `_ and the pytest plugin `pytest-cov `_. 63 | We use `tox `_ to automate tests in different Python environments. 64 | 65 | You may install all these dependencies along with ADTK using extra **test**. 66 | 67 | .. code-block:: console 68 | 69 | $ pip install adtk[test] 70 | 71 | Documentation 72 | ============= 73 | The documentation is generated with `Sphinx `_. 74 | You may install all necessary packages for compiling documentation along with ADTK using extra **doc**. 75 | 76 | .. code-block:: console 77 | 78 | $ pip install adtk[doc] 79 | 80 | My pull request is ready 81 | ======================== 82 | Here are some general guidelines for pull requests: 83 | 84 | - Before your pull request is ready for review, please keep a **WIP** label. 85 | - Your pull request must be reviewed by at least one reviewer AND pass all tests before it can be merged. 86 | - Remember to create unit tests for anything you add or modify.
87 | - Select the base branch to merge into (for more information about the definition of branches, please see `How are branches and releases managed?`_): 88 | 89 | - If your pull request does not change the API, please select branch **master**. 90 | - If your pull request changes the API, please select branch **develop**. 91 | 92 | - Only repository administrators can merge into branches `master` and `develop`. `Squash and merge `_ is always required. 93 | - Don't worry about updating the version number and changelog. The administrator who merges your pull request will take care of them before merging. 94 | 95 | 96 | How are branches and releases managed? 97 | ====================================== 98 | This is a guideline for managing branches and releases of ADTK. 99 | 100 | - The versioning of ADTK follows `SemVer `_. 101 | - ADTK is currently in major version zero (0.Y.Z), which indicates that the public API is unstable. 102 | - ADTK only supports one stable version. If the most recent release is 0.Y.Z, the previous versions (0.y.z | y < Y) are **NOT** supported. 103 | - Release versions 104 | 105 | - An increment of minor version Y (0.[Y+1].Z) introduces modifications that change the API, for example adding new features to existing models, adding new models, etc. 106 | - An increment of patch version Z (0.Y.[Z+1]) introduces modifications that do not change the API, for example bug fixes, minor changes to documentation, etc. 107 | - A new version is released when a set of modifications has accumulated, depending on the importance of the new functionalities and the urgency of the bug fixes. 108 | - A release is published to `PyPI `_ and `GitHub `_. 109 | - The `stable documentation `_ corresponds to the most recent release. 110 | 111 | - Pre-release versions 112 | 113 | - Every time a pull request is merged into branch **master** or **develop**, a new pre-release version is defined. 114 | - A pull request that changes the public API is versioned as (0.[Y+1].0-dev.N+pr.M), where N is a monotonically increasing index and M is the index of the pull request. 115 | 116 | .. admonition:: Example 117 | 118 | Assume the latest release is version **0.1.2**. A new parameter is added to an existing function in pull request **#37**. The new functionality will eventually be included in release version 0.2.0. Merging this pull request to the branch **develop** is versioned as **0.2.0-dev.1+pr.37**. 119 | 120 | Assume a new function is then created in pull request **#39**. The function is also expected to be released in version 0.2.0. Merging this pull request to the branch **develop** is versioned as **0.2.0-dev.2+pr.39**. 121 | 122 | - A pull request that does not change the API is versioned as (0.Y.[Z+1]-dev.N+pr.M), where N is a monotonically increasing index and M is the index of the pull request. 123 | 124 | .. admonition:: Example 125 | 126 | Assume the latest release is version **0.1.2**. A bug is fixed in pull request **#38**. The fix will eventually be included in release version 0.1.3. Merging this pull request to the branch **master** is versioned as **0.1.3-dev.1+pr.38**. 127 | 128 | Assume a typo in the documentation is then fixed in pull request **#41**. This fix is also expected to be released in version 0.1.3. Merging this pull request to the branch **master** is versioned as **0.1.3-dev.2+pr.41**. 129 | 130 | ..
attention:: 131 | If the modification should also be included in the next "major" release (0.[Y+1].0), a separate pull request to merge the modifications into branch **develop** should be opened. 132 | 133 | - The `latest documentation `_ corresponds to the most recent pre-release in branch **develop**. 134 | 135 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | .. _examples: 2 | 3 | ******** 4 | Examples 5 | ******** 6 | 7 | **Launch an interactive demo notebook in Binder ⇒** |binder| 8 | 9 | .. |binder| image:: https://mybinder.org/badge_logo.svg 10 | :target: https://mybinder.org/v2/gh/arundo/adtk/master?filepath=docs%2Fnotebooks%2Fdemo.ipynb 11 | 12 | .. toctree:: 13 | :maxdepth: 2 14 | 15 | notebooks/demo.ipynb 16 | -------------------------------------------------------------------------------- /docs/images/cyclic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/cyclic.png -------------------------------------------------------------------------------- /docs/images/level_shift.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/level_shift.png -------------------------------------------------------------------------------- /docs/images/level_shift_double_rolling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/level_shift_double_rolling.png -------------------------------------------------------------------------------- /docs/images/local_spike.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/local_spike.png -------------------------------------------------------------------------------- /docs/images/missing_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/missing_data.png -------------------------------------------------------------------------------- /docs/images/non_zeros_count.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/non_zeros_count.png -------------------------------------------------------------------------------- /docs/images/quickstart0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/quickstart0.png -------------------------------------------------------------------------------- /docs/images/quickstart1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/quickstart1.png -------------------------------------------------------------------------------- /docs/images/quickstart2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/quickstart2.png -------------------------------------------------------------------------------- /docs/images/quickstart3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/quickstart3.png -------------------------------------------------------------------------------- /docs/images/restart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/restart.png -------------------------------------------------------------------------------- /docs/images/seasonal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/seasonal.png -------------------------------------------------------------------------------- /docs/images/spike.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/spike.png -------------------------------------------------------------------------------- /docs/images/split_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/split_1.png -------------------------------------------------------------------------------- /docs/images/split_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/split_2.png -------------------------------------------------------------------------------- /docs/images/split_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/split_3.png -------------------------------------------------------------------------------- /docs/images/split_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/split_4.png -------------------------------------------------------------------------------- /docs/images/volatility_shift_double_rolling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/volatility_shift_double_rolling.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ================================ 2 | Anomaly Detection Toolkit (ADTK) 3 | ================================ 4 | 5 | Anomaly Detection Toolkit (ADTK) is a Python package for unsupervised / 6 | rule-based time series anomaly detection. 7 | 8 | As the nature of anomaly varies over different cases, a model may not work 9 | universally for all anomaly detection problems. 
Choosing and combining 10 | detection algorithms (detectors), feature engineering methods (transformers), 11 | and ensemble methods (aggregators) properly is the key to build an effective 12 | anomaly detection model. 13 | 14 | This package offers a set of common detectors, transformers and aggregators 15 | with unified APIs, as well as pipe classes that connect them together into a 16 | model. It also provides some functions to process and visualize time series and 17 | anomaly events. 18 | 19 | .. include:: 20 | install.rst 21 | 22 | .. include:: 23 | quickstart.rst 24 | 25 | .. toctree:: 26 | :caption: Table of Contents 27 | :maxdepth: 1 28 | 29 | install 30 | quickstart 31 | userguide 32 | examples 33 | api/modules 34 | developer 35 | releasehistory 36 | 37 | 38 | Indices and tables 39 | ================== 40 | 41 | * :ref:`genindex` 42 | * :ref:`modindex` 43 | * :ref:`search` 44 | -------------------------------------------------------------------------------- /docs/inheritance.rst: -------------------------------------------------------------------------------- 1 | .. _inheritance: 2 | 3 | Model Classes Inheritance Diagram 4 | ================================== 5 | 6 | .. code-block:: console 7 | 8 | _Model 9 | |-- _NonTrainableModel 10 | | |-- _NonTrainableUnivariateModel 11 | | | |-- _NonTrainableUnivariateDetector 12 | | | | └-- ThresholdAD 13 | | | | 14 | | | └-- _NonTrainableUnivariateTransformer 15 | | | |-- RollingAggregate 16 | | | |-- DoubleRollingAggregate 17 | | | |-- Retrospect 18 | | | └-- StandardScale 19 | | | 20 | | └-- _NonTrainableMultivariateModel 21 | | └-- _NonTrainableMultivariateTransformer 22 | | └-- SumAll 23 | | 24 | |-- _TrainableModel 25 | | |-- _TrainableUnivariateModel 26 | | | |-- _TrainableUnivariateDetector 27 | | | | |-- QuantileAD 28 | | | | |-- InterQuartileRangeAD 29 | | | | |-- GeneralizedESDTestAD 30 | | | | |-- PersistAD 31 | | | | |-- LevelShiftAD 32 | | | | |-- VolatilityShiftAD 33 | | | | |-- SeasonalAD 34 | | | | |-- AutoregressionAD 35 | | | | └-- CustomizedDetector1D 36 | | | | 37 | | | └-- _TrainableUnivariateTransformer 38 | | | |-- ClassicSeasonalDecomposition 39 | | | └-- CustomizedTransformer1D 40 | | | 41 | | └-- _TrainableMultivariateModel 42 | | |-- _TrainableMultivariateDetector 43 | | | |-- MinClusterDetector 44 | | | |-- OutlierDetector 45 | | | |-- RegressionAD 46 | | | |-- PcaAD 47 | | | └-- CustomizedDetectorHD 48 | | | 49 | | └-- _TrainableMultivariateTransformer 50 | | |-- RegressionResidual 51 | | |-- PcaProjection 52 | | |-- PcaReconstruction 53 | | |-- PcaReconstructionError 54 | | └-- CustomizedTransformerHD 55 | | 56 | └-- _Aggregator 57 | |-- AndAggregator 58 | |-- OrAggregator 59 | └-- CustomizedAggregator 60 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | ************ 2 | Installation 3 | ************ 4 | 5 | Prerequisites: Python 3.5 or later. 6 | 7 | It is recommended to install the most recent **stable** release of ADTK from PyPI. 8 | 9 | .. code-block:: console 10 | 11 | $ pip install adtk 12 | 13 | 14 | Alternatively, you could install from source code. This will give you the **latest**, but unstable, version of ADTK. 15 | 16 | .. 
code-block:: console 17 | 18 | $ git clone https://github.com/arundo/adtk.git 19 | $ cd adtk/ 20 | $ git checkout develop 21 | $ pip install ./ 22 | -------------------------------------------------------------------------------- /docs/notebooks/data/invalid_series.csv: -------------------------------------------------------------------------------- 1 | time,value,category 2 | 2017-01-02,2,"even" 3 | 2017-01-01,1,"odd" 4 | 2017-01-03,3,"odd" 5 | 2017-01-03,3.5,"odd" 6 | 2017-01-06,6,"even" 7 | 2017-01-04,4,"even" 8 | 2017-01-05,5,"odd" 9 | 2017-01-07,7,"odd" 10 | 2017-01-04,4.5,"even" 11 | 2017-01-08,8,"even" -------------------------------------------------------------------------------- /docs/notebooks/data/pressure.csv: -------------------------------------------------------------------------------- 1 | Time,Pressure (psi) 2 | 2017-05-02 17:08:37,15.239709722009712 3 | 2017-05-02 17:08:38,15.36847291056765 4 | 2017-05-02 17:08:39,15.090272828735273 5 | 2017-05-02 17:08:40,15.088763583688477 6 | 2017-05-02 17:08:41,15.200615328290896 7 | 2017-05-02 17:08:42,15.070986867017195 8 | 2017-05-02 17:08:43,15.11878376808163 9 | 2017-05-02 17:08:44,15.138324471477922 10 | 2017-05-02 17:08:45,15.13909700688064 11 | 2017-05-02 17:08:46,15.107665802212917 12 | 2017-05-02 17:08:47,15.18561837029227 13 | 2017-05-02 17:08:48,15.402186917322068 14 | 2017-05-02 17:08:49,15.228457593028624 15 | 2017-05-02 17:08:50,15.413565577090209 16 | 2017-05-02 17:08:51,15.454723147225732 17 | 2017-05-02 17:08:52,15.534855991090339 18 | 2017-05-02 17:08:53,15.330246978165752 19 | 2017-05-02 17:08:54,15.354574067292686 20 | 2017-05-02 17:08:55,15.621817652985236 21 | 2017-05-02 17:08:56,15.427433188894016 22 | 2017-05-02 17:08:57,15.071406227898967 23 | 2017-05-02 17:08:58,14.75756759472466 24 | 2017-05-02 17:08:59,14.805465050015666 25 | 2017-05-02 17:09:00,15.234802426201176 26 | 2017-05-02 17:09:01,15.437087232441085 27 | 2017-05-02 17:09:02,15.738189567539767 28 | 2017-05-02 17:09:03,15.756038224680434 29 | 2017-05-02 17:09:04,16.007702928313606 30 | 2017-05-02 17:09:05,15.95887337079757 31 | 2017-05-02 17:09:06,16.069261245886455 32 | 2017-05-02 17:09:07,16.021139303536756 33 | 2017-05-02 17:09:08,15.92225371806386 34 | 2017-05-02 17:09:09,15.946143618259113 35 | 2017-05-02 17:09:10,15.86042941968364 36 | 2017-05-02 17:09:11,16.095978306292412 37 | 2017-05-02 17:09:12,16.131084233551512 38 | 2017-05-02 17:09:13,16.20312929013024 39 | 2017-05-02 17:09:14,16.14234934574565 40 | 2017-05-02 17:09:15,16.368537142125092 41 | 2017-05-02 17:09:16,16.236769355879016 42 | 2017-05-02 17:09:17,16.355623009859208 43 | 2017-05-02 17:09:18,16.292459705562237 44 | 2017-05-02 17:09:19,16.123344662103197 45 | 2017-05-02 17:09:20,16.035255087851237 46 | 2017-05-02 17:09:21,15.89041408920272 47 | 2017-05-02 17:09:22,15.852124656492714 48 | 2017-05-02 17:09:23,15.791073261142838 49 | 2017-05-02 17:09:24,15.847269511571456 50 | 2017-05-02 17:09:25,15.949007242403294 51 | 2017-05-02 17:09:26,15.922468922219998 52 | 2017-05-02 17:09:27,15.917805491742506 53 | 2017-05-02 17:09:28,15.969847691790006 54 | 2017-05-02 17:09:29,15.872659667049081 55 | 2017-05-02 17:09:30,16.000141314684107 56 | 2017-05-02 17:09:31,16.151757043201414 57 | 2017-05-02 17:09:32,16.18840527910839 58 | 2017-05-02 17:09:33,16.619495372035075 59 | 2017-05-02 17:09:34,16.78465462093398 60 | 2017-05-02 17:09:35,16.764443539809445 61 | 2017-05-02 17:09:36,16.69924449038038 62 | 2017-05-02 17:09:37,16.6574474731717 63 | 2017-05-02 17:09:38,16.567127171281328 64 | 
2017-05-02 17:09:39,16.770328971856458 65 | 2017-05-02 17:09:40,16.644710510043257 66 | 2017-05-02 17:09:41,16.63010704562598 67 | 2017-05-02 17:09:42,16.53482415109097 68 | 2017-05-02 17:09:43,16.723156040156468 69 | 2017-05-02 17:09:44,16.467790226245164 70 | 2017-05-02 17:09:45,16.402533798552703 71 | 2017-05-02 17:09:46,16.38058856311423 72 | 2017-05-02 17:09:47,16.438078511145683 73 | 2017-05-02 17:09:48,16.52104939294012 74 | 2017-05-02 17:09:49,16.482203298493655 75 | 2017-05-02 17:09:50,16.66025427986016 76 | 2017-05-02 17:09:51,16.71691566880221 77 | 2017-05-02 17:09:52,17.161137615270192 78 | 2017-05-02 17:09:53,16.8896123908346 79 | 2017-05-02 17:09:54,17.001331766141796 80 | 2017-05-02 17:09:55,16.81318895359905 81 | 2017-05-02 17:09:56, 82 | 2017-05-02 17:09:57,17.026884123119356 83 | 2017-05-02 17:09:58, 84 | 2017-05-02 17:09:59, 85 | 2017-05-02 17:10:00,16.96673195701269 86 | 2017-05-02 17:10:01,16.95487815012832 87 | 2017-05-02 17:10:02,16.57113130728771 88 | 2017-05-02 17:10:03,16.56234092959741 89 | 2017-05-02 17:10:04, 90 | 2017-05-02 17:10:05, 91 | 2017-05-02 17:10:06, 92 | 2017-05-02 17:10:07,16.51264167119444 93 | 2017-05-02 17:10:08, 94 | 2017-05-02 17:10:09,16.545518734116744 95 | 2017-05-02 17:10:10,16.40980432766589 96 | 2017-05-02 17:10:11,16.537969863590902 97 | 2017-05-02 17:10:12, 98 | 2017-05-02 17:10:13,16.616489456883517 99 | 2017-05-02 17:10:14, 100 | 2017-05-02 17:10:15,16.78149160354748 101 | 2017-05-02 17:10:16, 102 | 2017-05-02 17:10:17,16.45079172221274 103 | 2017-05-02 17:10:18,16.107414886400854 104 | 2017-05-02 17:10:19,15.941274479727273 105 | 2017-05-02 17:10:20,16.025838257843454 106 | 2017-05-02 17:10:21,15.999849622088048 107 | 2017-05-02 17:10:22,15.927817461336595 108 | 2017-05-02 17:10:23,15.874534999485718 109 | 2017-05-02 17:10:24,16.027227933248774 110 | 2017-05-02 17:10:25,16.154470235037703 111 | 2017-05-02 17:10:26,16.012747549449983 112 | 2017-05-02 17:10:27,16.065482193047696 113 | 2017-05-02 17:10:28,15.980728348649485 114 | 2017-05-02 17:10:29,16.413550565480858 115 | 2017-05-02 17:10:30,16.280452942679567 116 | 2017-05-02 17:10:31,16.224138092517475 117 | 2017-05-02 17:10:32,16.161333019493096 118 | 2017-05-02 17:10:33,16.082300334261678 119 | 2017-05-02 17:10:34,16.107701643749454 120 | 2017-05-02 17:10:35,16.156855473719947 121 | 2017-05-02 17:10:36,15.865483382752707 122 | 2017-05-02 17:10:37,15.762312582993582 123 | 2017-05-02 17:10:38,15.524608075496365 124 | 2017-05-02 17:10:39,15.747147455527768 125 | 2017-05-02 17:10:40,16.1909506844219 126 | 2017-05-02 17:10:41,16.43995757371415 127 | 2017-05-02 17:10:42,16.50228581418069 128 | 2017-05-02 17:10:43,16.686357259074 129 | 2017-05-02 17:10:44,16.716386132610687 130 | 2017-05-02 17:10:45,17.01462517323702 131 | 2017-05-02 17:10:46,17.134857416482305 132 | 2017-05-02 17:10:47,17.093462850011885 133 | 2017-05-02 17:10:48,16.8901231481046 134 | 2017-05-02 17:10:49,16.7749182722062 135 | 2017-05-02 17:10:50,16.831413458097444 136 | 2017-05-02 17:10:51,16.610740748317834 137 | 2017-05-02 17:10:52,16.570813961056842 138 | 2017-05-02 17:10:53,16.812024999376426 139 | 2017-05-02 17:10:54,16.81730127894263 140 | 2017-05-02 17:10:55,17.17470672260042 141 | 2017-05-02 17:10:56,17.435222778269644 142 | 2017-05-02 17:10:57,17.38344426701696 143 | 2017-05-02 17:10:58,17.13874370975417 144 | 2017-05-02 17:10:59,17.130095399877185 145 | 2017-05-02 17:11:00,17.04354495851162 146 | 2017-05-02 17:11:01,17.111547365373095 147 | 2017-05-02 17:11:02,17.402170081578568 148 | 2017-05-02 
17:11:03,17.199990985262467 149 | 2017-05-02 17:11:04,17.130012904387513 150 | 2017-05-02 17:11:05,17.1898419232046 151 | 2017-05-02 17:11:06,17.39415767960812 152 | 2017-05-02 17:11:07,17.486095536595165 153 | 2017-05-02 17:11:08,17.560676626214136 154 | 2017-05-02 17:11:09,17.802719908797613 155 | 2017-05-02 17:11:10,17.89156161689412 156 | 2017-05-02 17:11:11,17.84889656508481 157 | 2017-05-02 17:11:12,17.859208931105794 158 | 2017-05-02 17:11:13,17.731923246329142 159 | 2017-05-02 17:11:14,17.830332832061277 160 | 2017-05-02 17:11:15,18.0 161 | 2017-05-02 17:11:16,17.46361853963141 162 | 2017-05-02 17:11:17,17.68290742651825 163 | 2017-05-02 17:11:18,17.672732671235057 164 | 2017-05-02 17:11:19,17.755700951898312 165 | 2017-05-02 17:11:20,17.87494028762646 166 | 2017-05-02 17:11:21,17.461780061638102 167 | 2017-05-02 17:11:22,17.2464910947515 168 | 2017-05-02 17:11:23,17.186526844040376 169 | 2017-05-02 17:11:24,17.04406215164424 170 | 2017-05-02 17:11:25,17.09341822491084 171 | 2017-05-02 17:11:26,17.000725099835716 172 | 2017-05-02 17:11:27,16.692701250287314 173 | 2017-05-02 17:11:28,16.802925871903067 174 | 2017-05-02 17:11:29,17.000969112504883 175 | 2017-05-02 17:11:30,17.102562803035664 176 | 2017-05-02 17:11:31,16.974245976212465 177 | 2017-05-02 17:11:32,16.927286538809113 178 | 2017-05-02 17:11:33,17.026013537698333 179 | 2017-05-02 17:11:34,17.134600307088746 180 | 2017-05-02 17:11:35,17.315853573694895 181 | 2017-05-02 17:11:36,17.262851552297864 182 | 2017-05-02 17:11:37,17.007067288759316 183 | 2017-05-02 17:11:38,16.884844555126286 184 | 2017-05-02 17:11:39,16.980953503984082 185 | 2017-05-02 17:11:40,17.11488239028685 186 | 2017-05-02 17:11:41,17.515431681471497 187 | 2017-05-02 17:11:42,17.536526462450738 188 | 2017-05-02 17:11:43,17.58056151272379 189 | 2017-05-02 17:11:44,17.548644508941294 190 | 2017-05-02 17:11:45,17.4756058351621 191 | 2017-05-02 17:11:46,17.616339548651887 192 | 2017-05-02 17:11:47,17.67997193023491 193 | 2017-05-02 17:11:48,17.642657863333127 194 | 2017-05-02 17:11:49,17.448292751520448 195 | 2017-05-02 17:11:50,17.42613795024508 196 | 2017-05-02 17:11:51,17.35575406556324 197 | 2017-05-02 17:11:52,17.581708102622727 198 | 2017-05-02 17:11:53,17.752207974920502 199 | 2017-05-02 17:11:54,17.568173508445064 200 | 2017-05-02 17:11:55,17.77828498393766 201 | 2017-05-02 17:11:56,17.675318783481277 202 | -------------------------------------------------------------------------------- /docs/notebooks/data/price_short.csv: -------------------------------------------------------------------------------- 1 | Time,Price ($) 2 | 2017-05-02 00:00:00,21.33 3 | 2017-05-02 01:00:00,22.05 4 | 2017-05-02 02:00:00,20.5 5 | 2017-05-02 03:00:00,20.49 6 | 2017-05-02 04:00:00,21.11 7 | 2017-05-02 05:00:00,20.39 8 | 2017-05-02 06:00:00,20.66 9 | 2017-05-02 07:00:00,20.77 10 | 2017-05-02 08:00:00,20.77 11 | 2017-05-02 09:00:00,20.6 12 | 2017-05-02 10:00:00,21.03 13 | 2017-05-02 11:00:00,22.23 14 | 2017-05-02 12:00:00,21.27 15 | 2017-05-02 13:00:00,22.3 16 | 2017-05-02 14:00:00,22.53 17 | 2017-05-02 15:00:00,22.97 18 | 2017-05-02 16:00:00,21.83 19 | 2017-05-02 17:00:00,21.97 20 | 2017-05-02 18:00:00,23.45 21 | 2017-05-02 19:00:00,22.37 22 | 2017-05-02 20:00:00,20.4 23 | 2017-05-02 21:00:00,18.65 24 | 2017-05-02 22:00:00,18.92 25 | 2017-05-02 23:00:00,21.3 26 | 2017-05-03 00:00:00,22.43 27 | 2017-05-03 01:00:00,24.1 28 | 2017-05-03 02:00:00,24.2 29 | 2017-05-03 03:00:00,25.6 30 | 2017-05-03 04:00:00,25.33 31 | 2017-05-03 05:00:00,25.94 32 | 2017-05-03 06:00:00,25.67 33 | 
2017-05-03 07:00:00,25.12 34 | 2017-05-03 08:00:00,25.26 35 | 2017-05-03 09:00:00,24.78 36 | 2017-05-03 10:00:00,26.09 37 | 2017-05-03 11:00:00,26.28 38 | 2017-05-03 12:00:00,26.68 39 | 2017-05-03 13:00:00,26.35 40 | 2017-05-03 14:00:00,27.6 41 | 2017-05-03 15:00:00,26.87 42 | 2017-05-03 16:00:00,27.53 43 | 2017-05-03 17:00:00,27.18 44 | 2017-05-03 18:00:00,26.24 45 | 2017-05-03 19:00:00,25.75 46 | 2017-05-03 20:00:00,24.95 47 | 2017-05-03 21:00:00,24.73 48 | 2017-05-03 22:00:00,24.39 49 | 2017-05-03 23:00:00,24.71 50 | 2017-05-04 00:00:00,25.27 51 | 2017-05-04 01:00:00,25.12 52 | 2017-05-04 02:00:00,25.1 53 | 2017-05-04 03:00:00,25.39 54 | 2017-05-04 04:00:00,24.85 55 | 2017-05-04 05:00:00,25.56 56 | 2017-05-04 06:00:00,26.4 57 | 2017-05-04 07:00:00,26.6 58 | 2017-05-04 08:00:00,29.0 59 | 2017-05-04 09:00:00,29.91 60 | 2017-05-04 10:00:00,29.8 61 | 2017-05-04 11:00:00,29.44 62 | 2017-05-04 12:00:00,29.21 63 | 2017-05-04 13:00:00,28.71 64 | 2017-05-04 14:00:00,29.83 65 | 2017-05-04 15:00:00,29.14 66 | 2017-05-04 16:00:00,29.06 67 | 2017-05-04 17:00:00,28.53 68 | 2017-05-04 18:00:00,29.57 69 | 2017-05-04 19:00:00,28.15 70 | 2017-05-04 20:00:00,27.79 71 | 2017-05-04 21:00:00,27.67 72 | 2017-05-04 22:00:00,27.99 73 | 2017-05-04 23:00:00,28.45 74 | 2017-05-05 00:00:00,28.23 75 | 2017-05-05 01:00:00,29.22 76 | 2017-05-05 02:00:00,29.54 77 | 2017-05-05 03:00:00,32.01 78 | 2017-05-05 04:00:00,30.5 79 | 2017-05-05 05:00:00,31.12 80 | 2017-05-05 06:00:00,30.07 81 | 2017-05-05 07:00:00,29.27 82 | 2017-05-05 08:00:00,31.26 83 | 2017-05-05 09:00:00,33.0 84 | 2017-05-05 10:00:00,31.15 85 | 2017-05-05 11:00:00,45.93 86 | 2017-05-05 12:00:00,45.86 87 | 2017-05-05 13:00:00,43.730000000000004 88 | 2017-05-05 14:00:00,43.68 89 | 2017-05-05 15:00:00,44.07 90 | 2017-05-05 16:00:00,44.29 91 | 2017-05-05 17:00:00,42.3 92 | 2017-05-05 18:00:00,43.4 93 | 2017-05-05 19:00:00,43.65 94 | 2017-05-05 20:00:00,43.59 95 | 2017-05-05 21:00:00,42.83 96 | 2017-05-05 22:00:00,43.54 97 | 2017-05-05 23:00:00,44.46 98 | 2017-05-06 00:00:00,43.980000000000004 99 | 2017-05-06 01:00:00,44.07 100 | 2017-05-06 02:00:00,44.9 101 | 2017-05-06 03:00:00,42.94 102 | 2017-05-06 04:00:00,43.06 103 | 2017-05-06 05:00:00,41.15 104 | 2017-05-06 06:00:00,40.230000000000004 105 | 2017-05-06 07:00:00,40.7 106 | 2017-05-06 08:00:00,40.55 107 | 2017-05-06 09:00:00,40.15 108 | 2017-05-06 10:00:00,39.86 109 | 2017-05-06 11:00:00,40.71 110 | 2017-05-06 12:00:00,41.41 111 | 2017-05-06 13:00:00,40.629999999999995 112 | 2017-05-06 14:00:00,40.92 113 | 2017-05-06 15:00:00,40.45 114 | 2017-05-06 16:00:00,42.85 115 | 2017-05-06 17:00:00,42.11 116 | 2017-05-06 18:00:00,41.8 117 | 2017-05-06 19:00:00,41.45 118 | 2017-05-06 20:00:00,41.010000000000005 119 | 2017-05-06 21:00:00,41.15 120 | 2017-05-06 22:00:00,41.43 121 | 2017-05-06 23:00:00,39.81 122 | 2017-05-07 00:00:00,39.230000000000004 123 | 2017-05-07 01:00:00,37.91 124 | 2017-05-07 02:00:00,39.15 125 | 2017-05-07 03:00:00,41.620000000000005 126 | 2017-05-07 04:00:00,43.0 127 | 2017-05-07 05:00:00,43.35 128 | 2017-05-07 06:00:00,44.370000000000005 129 | 2017-05-07 07:00:00,44.53 130 | 2017-05-07 08:00:00,46.19 131 | 2017-05-07 09:00:00,46.86 132 | 2017-05-07 10:00:00,46.629999999999995 133 | 2017-05-07 11:00:00,45.5 134 | 2017-05-07 12:00:00,44.86 135 | 2017-05-07 13:00:00,45.17 136 | 2017-05-07 14:00:00,43.95 137 | 2017-05-07 15:00:00,43.730000000000004 138 | 2017-05-07 16:00:00,45.07 139 | 2017-05-07 17:00:00,45.1 140 | 2017-05-07 18:00:00,47.08 141 | 2017-05-07 19:00:00,48.53 142 | 2017-05-07 
20:00:00,48.24 143 | 2017-05-07 21:00:00,46.879999999999995 144 | 2017-05-07 22:00:00,46.83 145 | 2017-05-07 23:00:00,46.35 146 | 2017-05-08 00:00:00,46.730000000000004 147 | 2017-05-08 01:00:00,48.34 148 | 2017-05-08 02:00:00,47.22 149 | 2017-05-08 03:00:00,46.83 150 | 2017-05-08 04:00:00,47.16 151 | 2017-05-08 05:00:00,48.3 152 | 2017-05-08 06:00:00,48.81 153 | 2017-05-08 07:00:00,49.22 154 | 2017-05-08 08:00:00,50.57 155 | 2017-05-08 09:00:00,51.06 156 | 2017-05-08 10:00:00,50.83 157 | 2017-05-08 11:00:00,50.88 158 | 2017-05-08 12:00:00,50.18 159 | 2017-05-08 13:00:00,50.72 160 | 2017-05-08 14:00:00,51.66 161 | 2017-05-08 15:00:00,48.69 162 | 2017-05-08 16:00:00,49.9 163 | 2017-05-08 17:00:00,49.85 164 | 2017-05-08 18:00:00,50.31 165 | 2017-05-08 19:00:00,50.97 166 | 2017-05-08 20:00:00,48.68 167 | 2017-05-08 21:00:00,47.48 168 | 2017-05-08 22:00:00,47.15 169 | 2017-05-08 23:00:00,46.35 170 | 2017-05-09 00:00:00,46.629999999999995 171 | 2017-05-09 01:00:00,46.11 172 | 2017-05-09 02:00:00,44.4 173 | 2017-05-09 03:00:00,45.019999999999996 174 | 2017-05-09 04:00:00,46.120000000000005 175 | 2017-05-09 05:00:00,46.68 176 | 2017-05-09 06:00:00,45.97 177 | 2017-05-09 07:00:00,45.71 178 | 2017-05-09 08:00:00,46.25 179 | 2017-05-09 09:00:00,46.86 180 | 2017-05-09 10:00:00,47.86 181 | 2017-05-09 11:00:00,47.57 182 | 2017-05-09 12:00:00,46.15 183 | 2017-05-09 13:00:00,45.47 184 | 2017-05-09 14:00:00,46.0 185 | 2017-05-09 15:00:00,46.75 186 | 2017-05-09 16:00:00,48.97 187 | 2017-05-09 17:00:00,49.09 188 | 2017-05-09 18:00:00,49.34 189 | 2017-05-09 19:00:00,49.16 190 | 2017-05-09 20:00:00,48.75 191 | 2017-05-09 21:00:00,49.53 192 | 2017-05-09 22:00:00,49.89 193 | 2017-05-09 23:00:00,49.68 194 | 2017-05-10 00:00:00,48.6 195 | 2017-05-10 01:00:00,48.48 196 | 2017-05-10 02:00:00,48.09 197 | 2017-05-10 03:00:00,49.34 198 | 2017-05-10 04:00:00,50.29 199 | 2017-05-10 05:00:00,49.27 200 | 2017-05-10 06:00:00,50.43 201 | 2017-05-10 07:00:00,49.86 202 | -------------------------------------------------------------------------------- /docs/notebooks/data/seasonal+trend.csv: -------------------------------------------------------------------------------- 1 | Time,Value 2 | 2017-05-07,0.0 3 | 2017-05-14,1.2 4 | 2017-05-21,2.4 5 | 2017-05-28,3.6 6 | 2017-06-04,4.8 7 | 2017-06-11,6.0 8 | 2017-06-18,7.2 9 | 2017-06-25,1.4 10 | 2017-07-02,2.6 11 | 2017-07-09,3.8 12 | 2017-07-16,5.0 13 | 2017-07-23,6.2 14 | 2017-07-30,7.4 15 | 2017-08-06,8.6 16 | 2017-08-13,2.8 17 | 2017-08-20,4.0 18 | 2017-08-27,5.2 19 | 2017-09-03,6.4 20 | 2017-09-10,7.6 21 | 2017-09-17,8.8 22 | 2017-09-24,10.0 23 | 2017-10-01,4.2 24 | 2017-10-08,5.4 25 | 2017-10-15,6.6 26 | 2017-10-22,7.8 27 | 2017-10-29,9.0 28 | 2017-11-05,10.2 29 | 2017-11-12,11.4 30 | 2017-11-19,5.6 31 | 2017-11-26,6.8 32 | 2017-12-03,13.0 33 | 2017-12-10,9.2 34 | 2017-12-17,10.4 35 | 2017-12-24,11.6 36 | 2017-12-31,12.8 37 | 2018-01-07,7.0 38 | 2018-01-14,8.2 39 | 2018-01-21,9.4 40 | 2018-01-28,10.6 41 | 2018-02-04,11.8 42 | 2018-02-11,13.0 43 | 2018-02-18,14.2 44 | 2018-02-25,8.4 45 | 2018-03-04,9.6 46 | 2018-03-11,10.8 47 | 2018-03-18,12.0 48 | 2018-03-25,13.2 49 | 2018-04-01,14.4 50 | 2018-04-08,15.6 51 | 2018-04-15,9.8 52 | 2018-04-22,11.0 53 | 2018-04-29,12.2 54 | 2018-05-06,13.4 55 | 2018-05-13,14.6 56 | 2018-05-20,15.8 57 | 2018-05-27,17.0 58 | 2018-06-03,11.2 59 | 2018-06-10,12.4 60 | 2018-06-17,13.6 61 | 2018-06-24,14.8 62 | 2018-07-01,16.0 63 | 2018-07-08,17.2 64 | 2018-07-15,18.4 65 | 2018-07-22,12.6 66 | 2018-07-29,13.8 67 | 2018-08-05,15.0 68 | 2018-08-12,16.2 69 
| 2018-08-19,17.4 70 | 2018-08-26,18.6 71 | 2018-09-02,19.8 72 | 2018-09-09,14.0 73 | 2018-09-16,15.2 74 | 2018-09-23,16.4 75 | 2018-09-30,17.6 76 | 2018-10-07,18.8 77 | 2018-10-14,20.0 78 | 2018-10-21,21.2 79 | 2018-10-28,15.4 80 | 2018-11-04,16.6 81 | 2018-11-11,17.8 82 | 2018-11-18,19.0 83 | 2018-11-25,20.2 84 | 2018-12-02,21.4 85 | 2018-12-09,22.6 86 | 2018-12-16,16.8 87 | 2018-12-23,18.0 88 | 2018-12-30,19.2 89 | 2019-01-06,20.4 90 | 2019-01-13,21.6 91 | 2019-01-20,22.8 92 | 2019-01-27,24.0 93 | 2019-02-03,18.2 94 | 2019-02-10,19.4 95 | 2019-02-17,20.6 96 | 2019-02-24,21.8 97 | 2019-03-03,23.0 98 | 2019-03-10,24.2 99 | 2019-03-17,25.4 100 | 2019-03-24,19.6 101 | 2019-03-31,20.8 102 | -------------------------------------------------------------------------------- /docs/notebooks/data/temperature.csv: -------------------------------------------------------------------------------- 1 | Time,Temperature (C) 2 | 2017-05-02 00:00:00,18.91 3 | 2017-05-02 01:00:00,19.91 4 | 2017-05-02 02:00:00,20.19 5 | 2017-05-02 03:00:00,18.69 6 | 2017-05-02 04:00:00,18.11 7 | 2017-05-02 05:00:00,19.76 8 | 2017-05-02 06:00:00,17.33 9 | 2017-05-02 07:00:00,16.91 10 | 2017-05-02 08:00:00,18.17 11 | 2017-05-02 09:00:00,17.3 12 | 2017-05-02 10:00:00,16.63 13 | 2017-05-02 11:00:00,16.53 14 | 2017-05-02 12:00:00,18.02 15 | 2017-05-02 13:00:00,17.38 16 | 2017-05-02 14:00:00,16.94 17 | 2017-05-02 15:00:00,16.51 18 | 2017-05-02 16:00:00,18.71 19 | 2017-05-02 17:00:00,20.9 20 | 2017-05-02 18:00:00,21.9 21 | 2017-05-02 19:00:00,22.29 22 | 2017-05-02 20:00:00,23.03 23 | 2017-05-02 21:00:00,24.52 24 | 2017-05-02 22:00:00,23.58 25 | 2017-05-02 23:00:00,24.76 26 | 2017-05-03 00:00:00,23.5 27 | 2017-05-03 01:00:00,22.86 28 | 2017-05-03 02:00:00,23.77 29 | 2017-05-03 03:00:00,22.34 30 | 2017-05-03 04:00:00,22.2 31 | 2017-05-03 05:00:00,21.34 32 | 2017-05-03 06:00:00,21.09 33 | 2017-05-03 07:00:00,18.29 34 | 2017-05-03 08:00:00,16.52 35 | 2017-05-03 09:00:00,15.82 36 | 2017-05-03 10:00:00,16.74 37 | 2017-05-03 11:00:00,16.57 38 | 2017-05-03 12:00:00,16.57 39 | 2017-05-03 13:00:00,17.26 40 | 2017-05-03 14:00:00,14.38 41 | 2017-05-03 15:00:00,14.66 42 | 2017-05-03 16:00:00,14.86 43 | 2017-05-03 17:00:00,14.13 44 | 2017-05-03 18:00:00,14.74 45 | 2017-05-03 19:00:00,10.31 46 | 2017-05-03 20:00:00,8.65 47 | 2017-05-03 21:00:00,9.64 48 | 2017-05-03 22:00:00,11.03 49 | 2017-05-03 23:00:00,13.45 50 | 2017-05-04 00:00:00,17.43 51 | 2017-05-04 01:00:00,20.66 52 | 2017-05-04 02:00:00,19.37 53 | 2017-05-04 03:00:00,18.33 54 | 2017-05-04 04:00:00,20.07 55 | 2017-05-04 05:00:00,19.28 56 | 2017-05-04 06:00:00,19.31 57 | 2017-05-04 07:00:00,20.38 58 | 2017-05-04 08:00:00,21.27 59 | 2017-05-04 09:00:00,23.02 60 | 2017-05-04 10:00:00,24.52 61 | 2017-05-04 11:00:00,25.59 62 | 2017-05-04 12:00:00,24.81 63 | 2017-05-04 13:00:00,25.61 64 | 2017-05-04 14:00:00,25.92 65 | 2017-05-04 15:00:00,24.6 66 | 2017-05-04 16:00:00,26.01 67 | 2017-05-04 17:00:00,26.82 68 | 2017-05-04 18:00:00,26.87 69 | 2017-05-04 19:00:00,26.63 70 | 2017-05-04 20:00:00,25.43 71 | 2017-05-04 21:00:00,25.63 72 | 2017-05-04 22:00:00,26.1 73 | 2017-05-04 23:00:00,25.27 74 | 2017-05-05 00:00:00,26.43 75 | 2017-05-05 01:00:00,25.34 76 | 2017-05-05 02:00:00,23.21 77 | 2017-05-05 03:00:00,24.25 78 | 2017-05-05 04:00:00,23.85 79 | 2017-05-05 05:00:00,23.72 80 | 2017-05-05 06:00:00,22.89 81 | 2017-05-05 07:00:00,21.28 82 | 2017-05-05 08:00:00,22.54 83 | 2017-05-05 09:00:00,21.85 84 | 2017-05-05 10:00:00,23.51 85 | 2017-05-05 11:00:00,24.31 86 | 2017-05-05 12:00:00,24.0 87 | 
2017-05-05 13:00:00,22.91 88 | 2017-05-05 14:00:00,22.18 89 | 2017-05-05 15:00:00,20.97 90 | 2017-05-05 16:00:00,23.06 91 | 2017-05-05 17:00:00,23.22 92 | 2017-05-05 18:00:00,24.37 93 | 2017-05-05 19:00:00,23.1 94 | 2017-05-05 20:00:00,23.28 95 | 2017-05-05 21:00:00,24.46 96 | 2017-05-05 22:00:00,24.13 97 | 2017-05-05 23:00:00,25.16 98 | 2017-05-06 00:00:00,24.07 99 | 2017-05-06 01:00:00,22.71 100 | 2017-05-06 02:00:00,23.09 101 | 2017-05-06 03:00:00,22.71 102 | 2017-05-06 04:00:00,23.35 103 | 2017-05-06 05:00:00,21.38 104 | 2017-05-06 06:00:00,22.09 105 | 2017-05-06 07:00:00,24.69 106 | 2017-05-06 08:00:00,24.66 107 | 2017-05-06 09:00:00,24.7 108 | 2017-05-06 10:00:00,24.87 109 | 2017-05-06 11:00:00,23.01 110 | 2017-05-06 12:00:00,23.44 111 | 2017-05-06 13:00:00,21.83 112 | 2017-05-06 14:00:00,21.41 113 | 2017-05-06 15:00:00,22.65 114 | 2017-05-06 16:00:00,21.91 115 | 2017-05-06 17:00:00,22.41 116 | 2017-05-06 18:00:00,23.43 117 | 2017-05-06 19:00:00,23.71 118 | 2017-05-06 20:00:00,22.34 119 | 2017-05-06 21:00:00,22.0 120 | 2017-05-06 22:00:00,23.96 121 | 2017-05-06 23:00:00,21.94 122 | 2017-05-07 00:00:00,21.66 123 | 2017-05-07 01:00:00,21.11 124 | 2017-05-07 02:00:00,21.23 125 | 2017-05-07 03:00:00,21.98 126 | 2017-05-07 04:00:00,23.59 127 | 2017-05-07 05:00:00,23.32 128 | 2017-05-07 06:00:00,24.13 129 | 2017-05-07 07:00:00,24.63 130 | 2017-05-07 08:00:00,25.1 131 | 2017-05-07 09:00:00,24.54 132 | 2017-05-07 10:00:00,23.54 133 | 2017-05-07 11:00:00,22.44 134 | 2017-05-07 12:00:00,21.69 135 | 2017-05-07 13:00:00,22.01 136 | 2017-05-07 14:00:00,22.77 137 | 2017-05-07 15:00:00,23.09 138 | 2017-05-07 16:00:00,22.54 139 | 2017-05-07 17:00:00,24.35 140 | 2017-05-07 18:00:00,25.87 141 | 2017-05-07 19:00:00,25.51 142 | 2017-05-07 20:00:00,24.69 143 | 2017-05-07 21:00:00,24.82 144 | 2017-05-07 22:00:00,26.09 145 | 2017-05-07 23:00:00,26.42 146 | 2017-05-08 00:00:00,26.98 147 | 2017-05-08 01:00:00,26.76 148 | 2017-05-08 02:00:00,27.22 149 | 2017-05-08 03:00:00,28.77 150 | 2017-05-08 04:00:00,28.53 151 | 2017-05-08 05:00:00,28.67 152 | 2017-05-08 06:00:00,28.92 153 | 2017-05-08 07:00:00,29.21 154 | 2017-05-08 08:00:00,27.79 155 | 2017-05-08 09:00:00,25.92 156 | 2017-05-08 10:00:00,24.9 157 | 2017-05-08 11:00:00,25.07 158 | 2017-05-08 12:00:00,25.62 159 | 2017-05-08 13:00:00,25.09 160 | 2017-05-08 14:00:00,26.47 161 | 2017-05-08 15:00:00,26.32 162 | 2017-05-08 16:00:00,26.34 163 | 2017-05-08 17:00:00,26.15 164 | 2017-05-08 18:00:00,26.28 165 | 2017-05-08 19:00:00,27.99 166 | 2017-05-08 20:00:00,27.65 167 | 2017-05-08 21:00:00,28.76 168 | 2017-05-08 22:00:00,29.28 169 | 2017-05-08 23:00:00,31.18 170 | 2017-05-09 00:00:00,31.26 171 | 2017-05-09 01:00:00,32.99 172 | 2017-05-09 02:00:00,35.94 173 | 2017-05-09 03:00:00,35.86 174 | 2017-05-09 04:00:00,32.12 175 | 2017-05-09 05:00:00,29.2 176 | 2017-05-09 06:00:00,25.6 177 | 2017-05-09 07:00:00,27.07 178 | 2017-05-09 08:00:00,27.38 179 | 2017-05-09 09:00:00,26.77 180 | 2017-05-09 10:00:00,26.38 181 | 2017-05-09 11:00:00,26.52 182 | 2017-05-09 12:00:00,26.61 183 | 2017-05-09 13:00:00,28.07 184 | 2017-05-09 14:00:00,29.46 185 | 2017-05-09 15:00:00,29.11 186 | 2017-05-09 16:00:00,28.56 187 | 2017-05-09 17:00:00,26.0 188 | 2017-05-09 18:00:00,25.45 189 | 2017-05-09 19:00:00,24.47 190 | 2017-05-09 20:00:00,24.12 191 | 2017-05-09 21:00:00,24.51 192 | 2017-05-09 22:00:00,24.69 193 | 2017-05-09 23:00:00,24.66 194 | 2017-05-10 00:00:00,24.86 195 | 2017-05-10 01:00:00,24.73 196 | 2017-05-10 02:00:00,24.93 197 | 2017-05-10 03:00:00,21.7 198 | 2017-05-10 
04:00:00,21.43 199 | 2017-05-10 05:00:00,21.32 200 | 2017-05-10 06:00:00,20.98 201 | 2017-05-10 07:00:00,20.76 202 | -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | *********** 2 | Quick Start 3 | *********** 4 | 5 | In this example, we build a model to detect violation of seasonal (weekly and 6 | daily) traffic pattern. The data used here is the NYC taxi traffic dataset from 7 | `Numenta Anomaly Benchmark `_. 8 | 9 | 1. Load and validate time series for training. 10 | 11 | .. code-block:: python 12 | 13 | >>> import pandas as pd 14 | >>> s_train = pd.read_csv("./training.csv", index_col="Datetime", parse_dates=True, squeeze=True) 15 | >>> from adtk.data import validate_series 16 | >>> s_train = validate_series(s_train) 17 | >>> print(s_train) 18 | Time 19 | 2014-07-01 00:00:00 10844 20 | 2014-07-01 00:30:00 8127 21 | 2014-07-01 01:00:00 6210 22 | 2014-07-01 01:30:00 4656 23 | 2014-07-01 02:00:00 3820 24 | ... 25 | 2015-01-04 09:30:00 9284 26 | 2015-01-04 10:00:00 10955 27 | 2015-01-04 10:30:00 13348 28 | 2015-01-04 11:00:00 13517 29 | 2015-01-04 11:30:00 14443 30 | Freq: 30T, Name: Traffic, Length: 9000, dtype: int64 31 | 32 | 2. Visualize training time series. 33 | 34 | .. code-block:: python 35 | 36 | >>> from adtk.visualization import plot 37 | >>> plot(s_train) 38 | 39 | .. figure:: images/quickstart0.png 40 | :width: 800px 41 | :align: center 42 | :height: 150 43 | :alt: quickstart0 44 | 45 | 3. Detect violation of seasonal pattern. 46 | 47 | .. code-block:: python 48 | 49 | >>> from adtk.detector import SeasonalAD 50 | >>> seasonal_ad = SeasonalAD() 51 | >>> anomalies = seasonal_ad.fit_detect(s_train) 52 | >>> plot(s_train, anomaly=anomalies, anomaly_color="red", anomaly_tag="marker") 53 | 54 | .. figure:: images/quickstart1.png 55 | :width: 800px 56 | :align: center 57 | :height: 150 58 | :alt: quickstart1 59 | 60 | 4. If known anomalies are available, cross check with detection results. 61 | 62 | .. code-block:: python 63 | 64 | >>> known_anomalies = pd.read_csv("./known_anomalies.csv", index_col="Datetime", parse_dates=True, squeeze=True) 65 | >>> from adtk.data import to_events 66 | >>> known_anomalies = to_events(known_anomalies) 67 | >>> print(known_anomalies) 68 | [(Timestamp('2014-07-03 07:00:00', freq='30T'), 69 | Timestamp('2014-07-06 14:59:59.999999999', freq='30T')), 70 | (Timestamp('2014-08-31 18:30:00', freq='30T'), 71 | Timestamp('2014-09-01 21:59:59.999999999', freq='30T')), 72 | (Timestamp('2014-10-31 14:30:00', freq='30T'), 73 | Timestamp('2014-11-02 13:59:59.999999999', freq='30T')), 74 | (Timestamp('2014-11-26 19:00:00', freq='30T'), 75 | Timestamp('2014-11-29 14:29:59.999999999', freq='30T')), 76 | (Timestamp('2014-12-23 19:00:00', freq='30T'), 77 | Timestamp('2014-12-28 13:59:59.999999999', freq='30T')), 78 | (Timestamp('2014-12-28 19:30:00', freq='30T'), 79 | Timestamp('2015-01-02 21:29:59.999999999', freq='30T'))] 80 | >>> plot(s_train, 81 | anomaly={"Known": known_anomalies, "Model": anomalies}, 82 | anomaly_tag={"Known": "span", "Model": "marker"}, 83 | anomaly_color={"Known": "orange", "Model": "red"}) 84 | 85 | .. figure:: images/quickstart2.png 86 | :width: 800px 87 | :align: center 88 | :height: 150 89 | :alt: quickstart2 90 | 91 | 92 | 5. Apply the trained model to new data. 93 | 94 | .. 
code-block:: python 95 | 96 | >>> s_test = pd.read_csv("./testing.csv", index_col="Datetime", parse_dates=True, squeeze=True) 97 | >>> s_test = validate_series(s_test) 98 | >>> print(s_test) 99 | Datetime 100 | 2015-01-04 12:00:00 15285 101 | 2015-01-04 12:30:00 16028 102 | 2015-01-04 13:00:00 16329 103 | 2015-01-04 13:30:00 15891 104 | 2015-01-04 14:00:00 15960 105 | ... 106 | 2015-01-31 21:30:00 24670 107 | 2015-01-31 22:00:00 25721 108 | 2015-01-31 22:30:00 27309 109 | 2015-01-31 23:00:00 26591 110 | 2015-01-31 23:30:00 26288 111 | Freq: 30T, Name: Traffic, Length: 1320, dtype: int64 112 | >>> anomalies_pred = seasonal_ad.detect(s_test) 113 | >>> plot(s_test, anomaly=anomalies_pred, 114 | ts_linewidth=1, anomaly_color='red', anomaly_tag="marker") 115 | 116 | .. figure:: images/quickstart3.png 117 | :width: 800px 118 | :align: center 119 | :height: 150 120 | :alt: quickstart3 121 | 122 | For more examples, please check :ref:`examples`. But before that, we recommend 123 | you to read :ref:`userguide` first. 124 | -------------------------------------------------------------------------------- /docs/releasehistory.rst: -------------------------------------------------------------------------------- 1 | *************** 2 | Release History 3 | *************** 4 | 5 | Version 0.6.2 (Apr 16, 2020) 6 | =================================== 7 | - Hot fix of wrong documentation url 8 | 9 | Version 0.6.1 (Apr 16, 2020) 10 | =================================== 11 | - Migrated the documentation to a new host 12 | - Fixed minor typos in the documentation 13 | - Fixed a minor type hinting bug 14 | 15 | Version 0.6.0 (Mar 10, 2020) 16 | =================================== 17 | - Re-designed the API of :py:mod:`adtk.visualization.plot` 18 | - Removed :py:mod:`adtk.data.resample` because its functionality is highly overlapped with pandas resampler module 19 | - Made :py:mod:`adtk.data.expand_events` accept events in the form of pandas Series/DataFrame 20 | - Made :py:mod:`adtk.data.expand_events` accept time delta in the form of `str` or `int` 21 | - Changed the output type of :py:mod:`adtk.data.split_train_test` from a 2-tuple of lists to a list of 2-tuples 22 | - Turned the following model parameters required from optional 23 | 24 | - `window` in :py:mod:`adtk.detector.LevelShiftAD` 25 | - `window` in :py:mod:`adtk.detector.VolatilityShiftAD` 26 | - `window` in :py:mod:`adtk.transformer.RollingAggregate` 27 | - `window` in :py:mod:`adtk.transformer.DoubleRollingAggregate` 28 | - `model` in :py:mod:`adtk.detector.MinClusterDetector` 29 | - `model` in :py:mod:`adtk.detector.OutlierDetector` 30 | - `target` and `regressor` in :py:mod:`adtk.detector.RegressionAD` 31 | - `target` and `regressor` in :py:mod:`adtk.transformer.RegressionResidual` 32 | - `aggregate_func` in :py:mod:`adtk.aggregator.CustomizedAggregator` 33 | - `detect_func` in :py:mod:`adtk.detector.CustomizedDetector1D` 34 | - `detect_func` in :py:mod:`adtk.detector.CustomizedDetectorHD` 35 | - `transform_func` in :py:mod:`adtk.transformer.CustomizedTransformer1D` 36 | - `transform_func` in :py:mod:`adtk.detector.CustomizedTransformer1D` 37 | - `steps` in :py:mod:`adtk.pipe.Pipeline` 38 | 39 | - Added consistency check between training and testing inputs in multivariate models 40 | - Improved time index check in time-dependent models 41 | - Turned all second-order sub-modules private, and a user now can only import from the following first-order modules 42 | 43 | - :py:mod:`adtk.detector` 44 | - :py:mod:`adtk.transformer` 45 | - 
:py:mod:`adtk.aggregator` 46 | - :py:mod:`adtk.pipe` 47 | - :py:mod:`adtk.data` 48 | - :py:mod:`adtk.metrics` 49 | - :py:mod:`adtk.visualization` 50 | 51 | - Refactored the inheritance structure of model components (see :ref:`inheritance`) 52 | - Added Python 3.8 support 53 | - Fixed compatibility issues with statsmodels v0.11 54 | - Fixed compatibility issues with pandas v1.0 55 | - Created an interactive demo notebook in Binder 56 | - Added type hints, and added type checking to the CI/CD tests 57 | - Added `Black` and `isort` to developer requirements and CI/CD checks 58 | - Optimized the release process by publishing the package to PyPI through GitHub Actions 59 | - Improved docstrings and API documentation 60 | - Fixed many minor bugs and typos 61 | 62 | Version 0.5.5 (Feb 24, 2020) 63 | =================================== 64 | - Fixed a bug where empty lists were ignored by AndAggregator 65 | - Fixed some typos in the documentation 66 | 67 | Version 0.5.4 (Feb 18, 2020) 68 | =================================== 69 | - Optimized the workflow of how a univariate model is applied to a pandas DataFrame 70 | 71 | - Added more informative error messages 72 | - Fixed some bugs resulting in model-column matching errors due to inconsistency between output Series names and DataFrame columns 73 | - Clarified the workflow in the documentation 74 | 75 | Version 0.5.3 (Feb 12, 2020) 76 | =================================== 77 | - Quick hotfix to avoid errors caused by statsmodels v0.11 by requiring the statsmodels dependency to be <0.11 78 | 79 | Version 0.5.2 (Jan 14, 2020) 80 | =================================== 81 | - Formalized the management of releases and pre-releases, including rules for branches and versioning 82 | - Added more rules for developers to the documentation 83 | 84 | Version 0.5.1 (Jan 2, 2020) 85 | =================================== 86 | - Added many new unit tests, and modified some old unit tests 87 | - Removed seaborn from dependencies (use matplotlib built-in style now) 88 | - Fixed a bug in the metric module when dict objects are given as input 89 | - Fixed a bug in the detector OutlierDetector where the output series has dtype object if NaN is present 90 | - Fixed a bug in the transformer pipeline where the detect and transform methods were confused 91 | - Fixed a bug in pipenet where an aggregator node may crash if its input is from a node whose subset contains a single item 92 | - Fixed a bug in the pipenet summary where the subset column was always "all" even when it should not be 93 | - Some minor code optimization 94 | 95 | Version 0.5.0 (Dec 18, 2019) 96 | =================================== 97 | - Changed the parameter `steps` of pipenet from list to dict 98 | - Added method `summary` to pipenet 99 | - Corrected some major algorithmic issues in seasonal decomposition 100 | 101 | - Removed the STL decomposition transformer, and hence the corresponding option in the SeasonalAD detector 102 | - Recreated the classic seasonal decomposition transformer 103 | 104 | - Updated the demo notebook in the documentation 105 | - Added an option to hide the legend in the plotting function 106 | - Added some package setup options for developers 107 | - Fixed an issue with tracking Travis and Coveralls status 108 | - Some minor internal code optimization 109 | - Fixed some formatting issues and typos in the documentation 110 | 111 | Version 0.4.1 (Nov 21, 2019) 112 | =================================== 113 | - Fixed an issue with tox environments 114 | - Minor spelling/grammar fixes in the documentation 115 | 116 | Version 0.4.0 (Nov 18, 2019) 117 |
=================================== 118 | - Added support for Python 3.5 119 | - Better unit tests on dependencies 120 | - Minor typo fixes in the documentation 121 | - Minor code optimization 122 | - Added download statistics to README 123 | - Added coverage test 124 | 125 | Version 0.3.0 (Sep 27, 2019) 126 | =================================== 127 | - Initial release -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx>=2.0 2 | sphinx_rtd_theme<0.5 3 | nbsphinx>=0.4 4 | python-dateutil>=2.5 5 | jupyter>=1 6 | -------------------------------------------------------------------------------- /docs/userguide.rst: -------------------------------------------------------------------------------- 1 | .. _userguide: 2 | 3 | ********** 4 | User Guide 5 | ********** 6 | 7 | This is a brief guide on how to build an anomaly detection model for time series with ADTK. We recommend that all users read through this guide before starting to use ADTK. 8 | 9 | 10 | - `Unsupervised vs. Supervised`_ 11 | - `Anomaly Types`_ 12 | - `Univariate vs. Multivariate`_ 13 | - `Detector, Transformer, Aggregator, and Pipe`_ 14 | 15 | ---------- 16 | 17 | Unsupervised vs. Supervised 18 | =================================== 19 | 20 | The first thing a user needs to decide before building a model is whether to formulate the problem as a supervised learning problem or an unsupervised problem. Supervised learning methods train models based on time series and normal/anomalous labels in the training set, while unsupervised methods build models based only on time series and domain knowledge, and do not require labeled data. 21 | 22 | Real-world anomaly detection problems usually suffer from a lack of labeled historical anomalies, which may prevent users from building a robust supervised model. In this case, an unsupervised/rule-based method is a better choice. ADTK is a package for unsupervised/rule-based models of time series anomaly detection. If a user formulates a task as a supervised learning problem, alternative tools will be needed. 23 | 24 | Anomaly Types 25 | ===================== 26 | 27 | Anomaly is a broad concept that may refer to many different types of events in time series. A spike in value, a shift in volatility, a violation of a seasonal pattern, etc. could all be anomalous or normal, depending on the specific context. ADTK offers a set of common components that can be combined into various types of anomaly detection models for different scenarios. However, ADTK does not select or build a model for a user automatically. A user should know what type of anomaly to detect, and can therefore build a model accordingly. 28 | 29 | Outlier 30 | ``````` 31 | 32 | An *outlier* is a data point whose value is significantly different from others. An outlier point in a time series exceeds the normal range of the series, without considering the temporal relationship between data points. In other words, even regarding all data points as time-independent, an outlier point still stands out. 33 | 34 | .. figure:: images/spike.png 35 | :width: 600px 36 | :align: center 37 | :height: 200px 38 | :alt: spike 39 | 40 | Outlier 41 | 42 | To detect outliers, the *normal range* of time series values is what a detector needs to learn. It can be defined with user-given absolute thresholds (:py:mod:`adtk.detector.ThresholdAD`).
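For instance, a minimal sketch of the threshold-based approach looks like the following (here ``s`` stands for a validated pandas Series as returned by :py:mod:`adtk.data.validate_series`, and the threshold values are illustrative placeholders rather than recommended settings):

.. code-block:: python

    >>> from adtk.detector import ThresholdAD
    >>> # flag any value above `high` or below `low` as anomalous;
    >>> # the limits come from domain knowledge, not from the data
    >>> threshold_ad = ThresholdAD(low=15.0, high=30.0)
    >>> anomalies = threshold_ad.detect(s)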
Alternatively, a user may create a detector to learn the normal range from historical data (:py:mod:`adtk.detector.QuantileAD`, :py:mod:`adtk.detector.InterQuartileRangeAD`, and :py:mod:`adtk.detector.GeneralizedESDTestAD`). 43 | 44 | 45 | **Outlier is the most basic type of anomaly. Anomaly detection methods targeting other types often transform a time series into a new one to which outlier detection is applied. Most advanced detectors in ADTK follow this strategy.** 46 | 47 | Spike and Level Shift 48 | ````````````````````` 49 | 50 | In some situations, whether a time point is normal depends on whether its value is aligned with its near past. An abrupt increase or decrease in value is called a *spike* if the change is temporary, or a *level shift* if the change is permanent. Please note that, although a spike appears similar to an outlier, it is time-dependent, while an outlier is time-independent. The value of a spike could be normal if examined against all data points without considering temporal order (see figure below). 51 | 52 | .. figure:: images/local_spike.png 53 | :width: 600px 54 | :align: center 55 | :height: 200px 56 | :alt: local_spike 57 | 58 | Spike 59 | 60 | .. figure:: images/level_shift.png 61 | :width: 600px 62 | :align: center 63 | :height: 200px 64 | :alt: level_shift 65 | 66 | Level shift 67 | 68 | We may slide two time windows side-by-side and keep tracking the difference between their mean or median values. This difference over time, which is a new time series, is examined by an outlier detector. Whenever the statistics in the left and right windows are significantly different, it indicates an abrupt change around this time point. The length of the time windows controls the time scale of changes to detect: for spikes, the left window is longer than the right one to capture representative information of the near past; on the other hand, for level shifts, both windows should be long enough to capture a stable state. 69 | 70 | :py:mod:`adtk.detector.PersistAD` and :py:mod:`adtk.detector.LevelShiftAD` are detectors of spikes and level shifts respectively. Both are implemented with the transformer :py:mod:`adtk.transformer.DoubleRollingAggregate`, which transforms a time series into a new series using two time windows as mentioned above. 71 | 72 | .. figure:: images/level_shift_double_rolling.png 73 | :width: 600px 74 | :align: center 75 | :height: 400px 76 | :alt: level_shift_double_rolling 77 | 78 | Transform a time series with a level shift using `DoubleRollingAggregate` with mean as the time window statistic. 79 | 80 | Pattern Change 81 | `````````````` 82 | The strategy mentioned above can be generalized to detect shifts of patterns other than value. For example, if shifts of volatility are of interest, the statistic to track in the time windows can be the standard deviation instead of the mean/median. :py:mod:`adtk.transformer.DoubleRollingAggregate` supports 16 common statistics that can be used to quantify the pattern of interest. 83 | 84 | .. figure:: images/volatility_shift_double_rolling.png 85 | :width: 600px 86 | :align: center 87 | :height: 400px 88 | :alt: volatility_shift_double_rolling 89 | 90 | Transform a time series with a volatility level shift using `DoubleRollingAggregate` with standard deviation as the metric. 91 | 92 | For detecting temporal changes of pattern, :py:mod:`adtk.transformer.RollingAggregate` could also be a good choice. It slides a time window and returns a statistic measured inside the window that quantifies a temporal pattern.
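A minimal sketch of this transformer is shown below (``s`` is assumed to be a validated pandas Series, e.g. per-minute request counts; both the window size and the choice of the non-zero-count aggregation are illustrative assumptions, not prescribed defaults):

.. code-block:: python

    >>> from adtk.transformer import RollingAggregate
    >>> # track the number of non-zero values inside a sliding window of 10 time steps;
    >>> # unusually high values of the resulting series can then be flagged by an outlier detector
    >>> s_rolling_count = RollingAggregate(agg="nonzero_count", window=10).transform(s)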
For example, if a user wants to detect a temporarily anomalously high number of visits to a system, tracking the number of visits in a sliding window is an effective approach. 93 | 94 | .. figure:: images/non_zeros_count.png 95 | :width: 600px 96 | :align: center 97 | :height: 400px 98 | :alt: non_zeros_count 99 | 100 | Transform a time series with a temporarily high frequency of requests using `RollingAggregate` with the number of non-zero values as the metric. 101 | 102 | Seasonality 103 | ``````````` 104 | A seasonal pattern exists when a time series is influenced by seasonal factors (e.g. the hour of the day, the day of the week, the month of the year). Detector :py:mod:`adtk.detector.SeasonalAD` uses transformer :py:mod:`adtk.transformer.ClassicSeasonalDecomposition` to remove the seasonal pattern from the original time series, and highlights time periods when the time series does not follow the seasonal pattern normally, by examining the residual series. 105 | 106 | .. figure:: images/seasonal.png 107 | :width: 600px 108 | :align: center 109 | :height: 400px 110 | :alt: seasonal 111 | 112 | Remove the seasonal pattern from a time series of NYC traffic using `ClassicSeasonalDecomposition` with the period as a week (data from `Numenta Anomaly Benchmark `_) 113 | 114 | A user needs to be careful about distinguishing between seasonal series and cyclic series. A seasonal series always has a fixed, usually interpretable and known, period because of its seasonal nature. A cyclic time series does not follow a fixed periodic pattern because of its physical nature, even if it appears to repeat similar subseries. For example, the trajectory of a moving part in rotating equipment is a 3-D cyclic time series, whose cycle length depends on rotation speed and is not necessarily fixed. Applying seasonality decomposition to it would be problematic, because every cycle may last a slightly different length, and decomposition residuals will be misleading for anomaly detection purposes. 115 | 116 | .. figure:: images/cyclic.png 117 | :width: 600px 118 | :align: center 119 | :height: 400px 120 | :alt: cyclic 121 | 122 | Applying `ClassicSeasonalDecomposition` to a cyclic series fails to detect anomalous behavior. 123 | 124 | Currently, ADTK does not provide a transformer that removes cyclic patterns from cyclic (but not seasonal) time series. However, :py:mod:`adtk.detector.AutoregressionAD` can capture changes in the autoregressive relationship (the relationship between a data point and points in its near past) and could be used for cyclic (but not seasonal) series in some situations. 125 | 126 | 127 | Univariate vs. Multivariate 128 | =========================== 129 | 130 | If the time series to detect anomalies from is univariate, anomaly detection models should use univariate transformers in :py:mod:`adtk.transformer` and univariate detectors in :py:mod:`adtk.detector`. 131 | 132 | If the time series is multivariate, a user should understand whether the anomaly detection task is *separable* over series or not. In many cases, detecting anomalies along each series in parallel satisfies the need. For example, if a user has a two-dimensional time series, temperature and humidity, and is trying to detect anomalous temperature or humidity, then applying a univariate detector to temperature and humidity respectively and then aggregating the results will satisfy the need. For users' convenience, when a univariate detector or univariate transformer is applied to a multivariate series (i.e.
pandas DataFrame), it applies to every series automatically. 133 | 134 | Sometimes, a user needs to use intrinsically multivariate algorithms, if the type of anomalies to detect cannot be represented by single dimensions separately. Continuing the previous example, if the user tries to detect an anomalous `heat index `_ (a hybrid metric of temperature and humidity), multivariate transformers and detectors should be considered, because anomalies must be detected based on temperature and humidity simultaneously. 135 | 136 | Detector, Transformer, Aggregator, and Pipe 137 | =========================================== 138 | 139 | ADTK provides three types of components to be combined into a model. 140 | A detector is a component that scans a time series and returns anomalous time points. All detectors are included in module :py:mod:`adtk.detector`. 141 | A transformer is a component that transforms a time series such that useful information is extracted. It can also be interpreted as a feature engineering component. All transformers are included in module :py:mod:`adtk.transformer`. 142 | An aggregator is a component that combines different detection results (anomaly lists). It is an ensemble component. All aggregators are included in module :py:mod:`adtk.aggregator`. 143 | 144 | A model can be a single detector or a combination of multiple components. If the combination is sequential, i.e. one or several transformers connected with a detector sequentially, it can be connected by an :py:mod:`adtk.pipe.Pipeline` object. If the combination is more complicated and not sequential, it can be connected by an :py:mod:`adtk.pipe.Pipenet` object. 145 | Many detectors in :py:mod:`adtk.detector` are internally implemented as a Pipeline or Pipenet object, but are listed in module :py:mod:`adtk.detector` for users' convenience. 146 | 147 | For any component that has not yet been implemented, a user may implement it as a function and use components :py:mod:`adtk.detector.CustomizedDetector1D`, :py:mod:`adtk.detector.CustomizedDetectorHD`, :py:mod:`adtk.transformer.CustomizedTransformer1D`, :py:mod:`adtk.transformer.CustomizedTransformerHD`, or :py:mod:`adtk.aggregator.CustomizedAggregator` to convert the function into an ADTK component. The component then has the unified APIs and can be used as a normal ADTK component (for example, to be connected with other components using Pipeline or Pipenet). Users are always welcome to contribute their algorithms to the package permanently. More information for contributors can be found in :ref:`developer`. 148 | 149 | A user may check :ref:`examples` for examples of ADTK components.
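As a closing illustration, a sequential combination of a transformer and a detector, as described in this section, could be assembled as follows (a minimal sketch: ``s`` stands for a validated pandas Series, and the step names, window size, and quantile cutoff are arbitrary placeholders):

.. code-block:: python

    >>> from adtk.detector import QuantileAD
    >>> from adtk.pipe import Pipeline
    >>> from adtk.transformer import DoubleRollingAggregate
    >>> # transform the series with two sliding windows, then flag
    >>> # extreme values of the transformed series as anomalies
    >>> steps = [
    ...     ("level_change", DoubleRollingAggregate(agg="mean", window=5, diff="l1")),
    ...     ("quantile_ad", QuantileAD(high=0.99)),
    ... ]
    >>> pipeline = Pipeline(steps)
    >>> anomalies = pipeline.fit_detect(s)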
150 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | # mypy.ini 2 | [mypy] 3 | disallow_untyped_defs = True 4 | disallow_untyped_calls = True 5 | 6 | [mypy-scipy.stats] 7 | ignore_missing_imports = True 8 | 9 | [mypy-sklearn.decomposition] 10 | ignore_missing_imports = True 11 | 12 | [mypy-sklearn.linear_model] 13 | ignore_missing_imports = True 14 | 15 | [mypy-numpy] 16 | ignore_missing_imports = True 17 | 18 | [mypy-pandas] 19 | ignore_missing_imports = True 20 | 21 | [mypy-matplotlib] 22 | ignore_missing_imports = True 23 | 24 | [mypy-matplotlib.pyplot] 25 | ignore_missing_imports = True 26 | 27 | [mypy-matplotlib.collections] 28 | ignore_missing_imports = True 29 | 30 | [mypy-matplotlib.lines] 31 | ignore_missing_imports = True 32 | 33 | [mypy-matplotlib.patches] 34 | ignore_missing_imports = True 35 | 36 | [mypy-statsmodels.tsa.seasonal] 37 | ignore_missing_imports = True 38 | 39 | [mypy-statsmodels.tsa.stattools] 40 | ignore_missing_imports = True 41 | 42 | [mypy-pandas.plotting] 43 | ignore_missing_imports = True 44 | 45 | [mypy-statsmodels] 46 | ignore_missing_imports = True 47 | 48 | ; we didn't typing the visualization module because there are a lot recursion 49 | ; on nested tree structure which would be messy if we type rigorously 50 | [mypy-adtk.visualization.*] 51 | ignore_errors = True 52 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 79 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = adtk 3 | version = 0.6.2 4 | author = Arundo Analytics, Inc. 
5 | maintainer = Tailai Wen 6 | maintainer_email = tailai.wen@arundo.com 7 | url = https://github.com/arundo/adtk 8 | description = A package for unsupervised time series anomaly detection 9 | long_description = file: README.md 10 | long_description_content_type= text/markdown 11 | keywords = anomaly detection, time series 12 | classifiers = 13 | Development Status :: 5 - Production/Stable 14 | License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0) 15 | Topic :: Scientific/Engineering 16 | Programming Language :: Python :: 3.5 17 | Programming Language :: Python :: 3.6 18 | Programming Language :: Python :: 3.7 19 | Programming Language :: Python :: 3.8 20 | Operating System :: POSIX :: Linux 21 | Operating System :: Unix 22 | Operating System :: MacOS 23 | Operating System :: Microsoft :: Windows 24 | license = Mozilla Public License 2.0 (MPL 2.0) 25 | 26 | [options] 27 | zip_safe = False 28 | python_requires = >=3.5 29 | package_dir = 30 | =src 31 | packages = find: 32 | install_requires = 33 | numpy>=1.15 34 | pandas>=0.23 35 | matplotlib>=3.0 36 | scikit-learn>=0.20 37 | statsmodels>=0.9 38 | packaging>=17.0 39 | tabulate>=0.8 40 | 41 | [options.packages.find] 42 | where = src 43 | exclude = 44 | tests 45 | docs 46 | 47 | [options.extras_require] 48 | test = 49 | pytest>=4 50 | tox>=3 51 | coverage>3.6,<5 52 | pytest-cov>=2.7 53 | coveralls>=1.7 54 | mypy>=0.641 55 | doc = 56 | sphinx>=2.4,<3 57 | sphinx_rtd_theme<0.5 58 | nbsphinx>=0.4 59 | python-dateutil>=2.5 60 | jupyter>=1 61 | dev = 62 | black==19.3b0 63 | isort==4.3.21 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup() 4 | -------------------------------------------------------------------------------- /src/adtk/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Anomaly Detection Toolkit 3 | ========================= 4 | 5 | Anomaly Detection Toolkit (ADTK) is a Python package for unsupervised / 6 | rule-based time series anomaly detection. 7 | 8 | As the nature of anomaly varies over different cases, a model may not work 9 | universally for all anomaly detection problems. Choosing and combining 10 | detection algorithms (detectors), feature engineering methods (transformers), 11 | and ensemble methods (aggregators) properly is the key to build an effective 12 | anomaly detection model. 13 | 14 | This package offers a set of common detectors, transformers and aggregators 15 | with unified APIs, as well as pipe classes that connect them together into 16 | models. It also provides some functions to process and visualize time series 17 | and anomaly events. 18 | 19 | See https://adtk.readthedocs.io for complete documentation. 
20 | 21 | """ 22 | 23 | __version__ = "0.6.2" 24 | -------------------------------------------------------------------------------- /src/adtk/_aggregator_base.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple, Union 2 | 3 | import pandas as pd 4 | 5 | from ._base import _NonTrainableModel 6 | 7 | 8 | class _Aggregator(_NonTrainableModel): 9 | def _predict( 10 | self, 11 | lists: Union[ 12 | pd.DataFrame, 13 | Dict[str, Union[pd.Series, pd.DataFrame]], 14 | Dict[ 15 | str, 16 | List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]], 17 | ], 18 | ], 19 | ) -> Union[ 20 | pd.Series, List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]] 21 | ]: 22 | if isinstance(lists, dict): 23 | if not ( 24 | all([isinstance(lst, list) for lst in lists.values()]) 25 | or all( 26 | [ 27 | isinstance(lst, (pd.Series, pd.DataFrame)) 28 | for lst in lists.values() 29 | ] 30 | ) 31 | ): 32 | raise TypeError( 33 | "Input must be a pandas DataFrame, a dict of lists, or a " 34 | "dict of pandas Series/DataFrame." 35 | ) 36 | elif isinstance(lists, pd.DataFrame): 37 | pass 38 | else: 39 | raise TypeError( 40 | "Input must be a pandas DataFrame, a dict of lists, or a dict " 41 | "of pandas Series/DataFrame." 42 | ) 43 | return self._predict_core(lists) 44 | 45 | def predict( 46 | self, 47 | lists: Union[ 48 | pd.DataFrame, 49 | Dict[str, Union[pd.Series, pd.DataFrame]], 50 | Dict[ 51 | str, 52 | List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]], 53 | ], 54 | ], 55 | ) -> Union[ 56 | pd.Series, List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]] 57 | ]: 58 | """Aggregate multiple lists of anomalies into one. 59 | 60 | Parameters 61 | ---------- 62 | lists: pandas.DataFrame, a dict of Series/DataFrame, or a dict of lists 63 | Anomaly lists to be aggregated. 64 | 65 | - If a pandas DataFrame, every column is a binary Series 66 | representing a type of anomaly. 67 | - If a dict of pandas Series/DataFrame, every value of the dict is 68 | a binary Series/DataFrame representing a type or some types of 69 | anomaly; 70 | - If a dict of list, every value of the dict is a type of anomaly 71 | as a list of events, where each event is represented as a pandas 72 | Timestamp if it is instantaneous or a 2-tuple of pandas 73 | Timestamps if it is a closed time interval. 74 | 75 | Returns 76 | ------- 77 | list or a binary pandas Series 78 | Aggregated list of anomalies. 79 | 80 | - If input is a pandas DataFrame or a dict of Series/DataFrame, 81 | return a single binary pandas Series; 82 | - If input is a dict of lists, return a single list of events. 83 | 84 | """ 85 | return self._predict(lists) 86 | 87 | aggregate = predict 88 | -------------------------------------------------------------------------------- /src/adtk/_base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from copy import deepcopy 3 | from typing import Any, Dict, List, Tuple, Union 4 | 5 | import pandas as pd 6 | 7 | 8 | class _Model(ABC): 9 | "Base class for all models (detectors, transformers, and aggregators)." 10 | 11 | def __init__(self, *args: Any, **kwargs: Any) -> None: 12 | pass 13 | 14 | def get_params(self) -> Dict[str, Any]: 15 | """Get the parameters of this model. 16 | 17 | Returns 18 | ------- 19 | dict 20 | Model parameters. 
21 | 22 | """ 23 | return {key: getattr(self, key) for key in self._param_names} 24 | 25 | def set_params(self, **params: Any) -> None: 26 | """Set the parameters of this model. 27 | 28 | Parameters 29 | ---------- 30 | **params 31 | Model parameters to set. 32 | 33 | """ 34 | for key in params.keys(): 35 | if key not in self._param_names: 36 | raise KeyError( 37 | "'{}' is not a valid parameter name.".format(key) 38 | ) 39 | for key, value in params.items(): 40 | setattr(self, key, value) 41 | 42 | @property 43 | @abstractmethod 44 | def _param_names(self) -> Tuple[str, ...]: 45 | return tuple() 46 | 47 | 48 | class _NonTrainableModel(_Model): 49 | "Base class of models that do not need training." 50 | 51 | @abstractmethod 52 | def _predict(self, input: Any) -> Any: 53 | pass 54 | 55 | @abstractmethod 56 | def _predict_core(self, input: Any) -> Any: 57 | pass 58 | 59 | @abstractmethod 60 | def predict(self, input: Any) -> Any: 61 | pass 62 | 63 | 64 | class _TrainableModel(_Model): 65 | "Base class of models that need training." 66 | 67 | def __init__(self, *args: Any, **kwargs: Any) -> None: 68 | super().__init__(*args, **kwargs) 69 | # 0 for not fitted, 1 for fitted, 2 for univariate model fitted by DF 70 | self._fitted = 0 # type: int 71 | 72 | @abstractmethod 73 | def _fit(self, input: Any) -> None: 74 | pass 75 | 76 | @abstractmethod 77 | def _fit_core(self, input: Any) -> None: 78 | pass 79 | 80 | @abstractmethod 81 | def fit(self, input: Any) -> None: 82 | pass 83 | 84 | @abstractmethod 85 | def _predict(self, input: Any) -> Any: 86 | pass 87 | 88 | @abstractmethod 89 | def _predict_core(self, input: Any) -> Any: 90 | pass 91 | 92 | @abstractmethod 93 | def predict(self, input: Any) -> Any: 94 | pass 95 | 96 | @abstractmethod 97 | def fit_predict(self, input: Any) -> Any: 98 | pass 99 | 100 | 101 | class _NonTrainableUnivariateModel(_NonTrainableModel): 102 | "Base class of univariate detectors and transformers." 103 | 104 | def _predict( 105 | self, ts: Union[pd.Series, pd.DataFrame] 106 | ) -> Union[pd.Series, pd.DataFrame]: 107 | if isinstance(ts, pd.Series): 108 | s = ts.copy() # type: pd.Series 109 | if not isinstance(s.index, pd.DatetimeIndex): 110 | raise TypeError( 111 | "Index of the input time series must be a pandas " 112 | "DatetimeIndex object." 113 | ) 114 | predicted = self._predict_core(s) 115 | # if a Series-to-Series operation, make sure Series name keeps 116 | if isinstance(predicted, pd.Series): 117 | predicted.name = ts.name 118 | elif isinstance(ts, pd.DataFrame): 119 | df = ts.copy() # type: pd.DataFrame 120 | if df.columns.duplicated().any(): 121 | raise ValueError( 122 | "Input DataFrame must have unique column names." 123 | ) 124 | # apply the model to each column 125 | predicted_all_cols = [] 126 | for col in df.columns: 127 | predicted_this_col = self._predict(df[col]) 128 | # if a Series-to-DF operation, update column name 129 | if isinstance(predicted_this_col, pd.DataFrame): 130 | predicted_this_col = predicted_this_col.rename( 131 | columns={ 132 | col1: "{}_{}".format(col, col1) 133 | for col1 in predicted_this_col.columns 134 | } 135 | ) 136 | predicted_all_cols.append(predicted_this_col) 137 | predicted = pd.concat(predicted_all_cols, axis=1) 138 | else: 139 | raise TypeError("Input must be a pandas Series or DataFrame.") 140 | # make sure index freq is the same (because pandas has a bug that some 141 | # operation, e.g. 
concat, may change freq) 142 | predicted.index.freq = ts.index.freq 143 | return predicted 144 | 145 | 146 | class _TrainableUnivariateModel(_TrainableModel): 147 | def __init__(self, *args: Any, **kwargs: Any) -> None: 148 | super().__init__(*args, **kwargs) 149 | self._models = dict() # type: Dict[str, _TrainableUnivariateModel] 150 | 151 | def _fit(self, ts: Union[pd.Series, pd.DataFrame]) -> None: 152 | if isinstance(ts, pd.Series): 153 | s = ts.copy() # type: pd.Series 154 | self._fit_core(s) 155 | self._fitted = 1 156 | elif isinstance(ts, pd.DataFrame): 157 | df = ts.copy() 158 | if not isinstance(df.index, pd.DatetimeIndex): 159 | raise TypeError( 160 | "Index of the input time series must be a pandas " 161 | "DatetimeIndex object." 162 | ) 163 | if df.columns.duplicated().any(): 164 | raise ValueError( 165 | "Input DataFrame must have unique column names." 166 | ) 167 | # create internal models 168 | self._models = { 169 | col: self.__class__(**deepcopy(self.get_params())) 170 | for col in df.columns 171 | } 172 | # fit model for each column 173 | for col in df.columns: 174 | self._models[col].fit(df[col]) 175 | self._fitted = 2 176 | else: 177 | raise TypeError("Input must be a pandas Series or DataFrame.") 178 | 179 | def _predict( 180 | self, ts: Union[pd.Series, pd.DataFrame] 181 | ) -> Union[pd.Series, pd.DataFrame]: 182 | if self._fitted == 0: 183 | raise RuntimeError("The model must be trained first.") 184 | 185 | if isinstance(ts, pd.Series): 186 | if self._fitted == 2: 187 | raise RuntimeError( 188 | "The model was trained by a pandas DataFrame object, " 189 | "it can only be applied to a pandas DataFrame object with " 190 | "the same column names as the one used for training." 191 | ) 192 | s = ts.copy() 193 | if not isinstance(s.index, pd.DatetimeIndex): 194 | raise TypeError( 195 | "Index of the input time series must be a pandas " 196 | "DatetimeIndex object." 197 | ) 198 | predicted = self._predict_core(s) 199 | # if a Series-to-Series operation, make sure Series name keeps 200 | if isinstance(predicted, pd.Series): 201 | predicted.name = ts.name 202 | elif isinstance(ts, pd.DataFrame): 203 | df = ts.copy() 204 | if not isinstance(df.index, pd.DatetimeIndex): 205 | raise TypeError( 206 | "Index of the input time series must be a pandas " 207 | "DatetimeIndex object." 208 | ) 209 | if df.columns.duplicated().any(): 210 | raise ValueError( 211 | "Input DataFrame must have unique column names." 
212 | ) 213 | if self._fitted == 1: 214 | # apply the model to each column 215 | predicted_all_cols = [] 216 | for col in df.columns: 217 | predicted_this_col = self._predict(df[col]) 218 | if isinstance(predicted_this_col, pd.DataFrame): 219 | predicted_this_col = predicted_this_col.rename( 220 | columns={ 221 | col1: "{}_{}".format(col, col1) 222 | for col1 in predicted_this_col.columns 223 | } 224 | ) 225 | predicted_all_cols.append(predicted_this_col) 226 | predicted = pd.concat(predicted_all_cols, axis=1) 227 | else: 228 | # predict for each column 229 | if not (set(self._models.keys()) >= set(df.columns)): 230 | raise ValueError( 231 | "The model was trained by a pandas DataFrame with " 232 | "columns {}, but the input DataFrame contains columns " 233 | "{} which are unknown to the model.".format( 234 | list(set(self._models.keys())), 235 | list(set(df.columns) - set(self._models.keys())), 236 | ) 237 | ) 238 | predicted = pd.concat( 239 | [ 240 | self._models[col]._predict(df[col]) 241 | for col in df.columns 242 | ], 243 | axis=1, 244 | ) 245 | else: 246 | raise TypeError("Input must be a pandas Series or DataFrame.") 247 | # make sure index freq is the same (because pandas has a bug that some 248 | # operation, e.g. concat, may change freq) 249 | predicted.index.freq = ts.index.freq 250 | return predicted 251 | 252 | 253 | class _NonTrainableMultivariateModel(_NonTrainableModel): 254 | def _predict(self, df: pd.DataFrame) -> Union[pd.Series, pd.DataFrame]: 255 | if isinstance(df, pd.DataFrame): 256 | if df.columns.duplicated().any(): 257 | raise ValueError( 258 | "Input DataFrame must have unique column names." 259 | ) 260 | df_copy = df.copy() 261 | predicted = self._predict_core(df_copy) 262 | else: 263 | raise TypeError("Input must be a pandas DataFrame.") 264 | # make sure index freq is the same (because pandas has a bug that some 265 | # operation, e.g. concat, may change freq) 266 | predicted.index.freq = df.index.freq 267 | return predicted 268 | 269 | 270 | class _TrainableMultivariateModel(_TrainableModel): 271 | def __init__(self, *args: Any, **kwargs: Any) -> None: 272 | super().__init__(*args, **kwargs) 273 | self._cols = [] # type: List[str] 274 | 275 | def _fit(self, df: pd.DataFrame) -> None: 276 | if isinstance(df, pd.DataFrame): 277 | if df.columns.duplicated().any(): 278 | raise ValueError( 279 | "Input DataFrame must have unique column names." 280 | ) 281 | df_copy = df.copy() 282 | self._fit_core(df_copy) 283 | else: 284 | raise TypeError("Input must be a pandas DataFrame.") 285 | self._cols = list(df.columns) 286 | self._fitted = 1 287 | 288 | def _predict(self, df: pd.DataFrame) -> Union[pd.Series, pd.DataFrame]: 289 | if self._fitted == 0: 290 | raise RuntimeError("The model must be trained first.") 291 | if isinstance(df, pd.DataFrame): 292 | if df.columns.duplicated().any(): 293 | raise ValueError( 294 | "Input DataFrame must have unique column names." 
295 | ) 296 | if not (set(df.columns) >= set(self._cols)): 297 | raise ValueError( 298 | "The model was trained by a pandas DataFrame with columns " 299 | "{}, but the input DataFrame does not contain columns {}.".format( 300 | self._cols, list(set(self._cols) - set(df.columns)) 301 | ) 302 | ) 303 | df_copy = ( 304 | df.loc[:, self._cols].copy() if self._cols else df.copy() 305 | ) # in a customized hd model that doesn't need fit, self._cols is empty 306 | predicted = self._predict_core(df_copy) 307 | else: 308 | raise TypeError("Input must be a pandas DataFrame.") 309 | # make sure index freq is the same (because pandas has a bug that some 310 | # operation, e.g. concat, may change freq) 311 | predicted.index.freq = df.index.freq 312 | return predicted 313 | -------------------------------------------------------------------------------- /src/adtk/_transformer_base.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import pandas as pd 4 | 5 | from ._base import ( 6 | _NonTrainableMultivariateModel, 7 | _NonTrainableUnivariateModel, 8 | _TrainableMultivariateModel, 9 | _TrainableUnivariateModel, 10 | ) 11 | 12 | 13 | class _NonTrainableUnivariateTransformer(_NonTrainableUnivariateModel): 14 | def predict( 15 | self, ts: Union[pd.Series, pd.DataFrame] 16 | ) -> Union[pd.Series, pd.DataFrame]: 17 | """Transform time series. 18 | 19 | Parameters 20 | ---------- 21 | ts: pandas.Series or pandas.DataFrame 22 | Time series to be transformed. If a DataFrame with k columns, it is 23 | treated as k independent univariate time series and the transformer 24 | will be applied to each univariate series independently. 25 | 26 | Returns 27 | ------- 28 | pandas.Series or pandas.DataFrame 29 | Transformed time series. 30 | 31 | """ 32 | return self._predict(ts) 33 | 34 | transform = predict 35 | 36 | 37 | class _TrainableUnivariateTransformer(_TrainableUnivariateModel): 38 | def fit(self, ts: Union[pd.Series, pd.DataFrame]) -> None: 39 | """Train the transformer with given time series. 40 | 41 | Parameters 42 | ---------- 43 | ts: pandas.Series or pandas.DataFrame 44 | Time series to be used to train the transformer. 45 | If a DataFrame with k columns, k univariate transformers will be 46 | trained independently. 47 | 48 | """ 49 | self._fit(ts) 50 | 51 | def predict( 52 | self, ts: Union[pd.Series, pd.DataFrame] 53 | ) -> Union[pd.Series, pd.DataFrame]: 54 | """Transform time series. 55 | 56 | Parameters 57 | ---------- 58 | ts: pandas.Series or pandas.DataFrame 59 | Time series to be transformed. If a DataFrame with k columns, it is 60 | treated as k independent univariate time series. 61 | 62 | - If the transformer was trained with a Series, the transformer 63 | will be applied to each univariate series independently; 64 | - If the transformer was trained with a DataFrame, i.e. the 65 | transformer is essentially k transformers, those transformers 66 | will be applied to each univariate series respectively. 67 | 68 | Returns 69 | ------- 70 | pandas.Series or pandas.DataFrame 71 | Transformed time series. 72 | 73 | """ 74 | return self._predict(ts) 75 | 76 | def fit_predict( 77 | self, ts: Union[pd.Series, pd.DataFrame] 78 | ) -> Union[pd.Series, pd.DataFrame]: 79 | """Train the transformer, and tranform the time series used for 80 | training. 81 | 82 | Parameters 83 | ---------- 84 | ts: pandas.Series or pandas.DataFrame 85 | Time series to be used for training and be transformed. 
86 | If a DataFrame with k columns, it is treated as k independent 87 | univariate time series, and k univariate transformers will be 88 | trained and applied to each series independently. 89 | 90 | Returns 91 | ------- 92 | pandas.Series or pandas.DataFrame 93 | Transformed time series. 94 | 95 | """ 96 | self.fit(ts) 97 | return self.predict(ts) 98 | 99 | transform = predict 100 | fit_transform = fit_predict 101 | 102 | 103 | class _NonTrainableMultivariateTransformer(_NonTrainableMultivariateModel): 104 | def predict(self, df: pd.DataFrame) -> Union[pd.Series, pd.DataFrame]: 105 | """Transform time series. 106 | 107 | Parameters 108 | ---------- 109 | df: pandas.DataFrame 110 | Time series to be transformed. 111 | 112 | Returns 113 | ------- 114 | pandas.Series or pandas.DataFrame 115 | Transformed time series. 116 | 117 | """ 118 | return self._predict(df) 119 | 120 | transform = predict 121 | 122 | 123 | class _TrainableMultivariateTransformer(_TrainableMultivariateModel): 124 | def fit(self, df: pd.DataFrame) -> None: 125 | """Train the transformer with given time series. 126 | 127 | Parameters 128 | ---------- 129 | df: pandas.DataFrame 130 | Time series to be used to train the transformer. 131 | 132 | """ 133 | self._fit(df) 134 | 135 | def predict(self, df: pd.DataFrame) -> Union[pd.Series, pd.DataFrame]: 136 | """Transform time series. 137 | 138 | Parameters 139 | ---------- 140 | df: pandas.DataFrame 141 | Time series to be transformed. 142 | 143 | Returns 144 | ------- 145 | pandas.Series or pandas.DataFrame 146 | Transformed time series. 147 | 148 | """ 149 | return self._predict(df) 150 | 151 | def fit_predict(self, df: pd.DataFrame) -> Union[pd.Series, pd.DataFrame]: 152 | """Train the transformer, and tranform the time series used for 153 | training. 154 | 155 | Parameters 156 | ---------- 157 | df: pandas.DataFrame 158 | Time series to be used for training and be transformed. 159 | 160 | Returns 161 | ------- 162 | pandas.Series or pandas.DataFrame 163 | Transformed time series. 164 | 165 | """ 166 | self.fit(df) 167 | return self.predict(df) 168 | 169 | transform = predict 170 | fit_transform = fit_predict 171 | -------------------------------------------------------------------------------- /src/adtk/_utils.py: -------------------------------------------------------------------------------- 1 | """Module for all utility functions. 2 | 3 | """ 4 | 5 | from typing import Dict, Optional, Type 6 | 7 | 8 | def _get_all_subclasses_from_superclass( 9 | superclass: Type 10 | ) -> Dict[str, Optional[str]]: 11 | result = dict() 12 | for sb in superclass.__subclasses__(): 13 | if sb.__name__[0] != "_": 14 | result.update({sb.__name__: sb.__doc__}) 15 | else: 16 | result.update(_get_all_subclasses_from_superclass(sb)) 17 | return result 18 | 19 | 20 | class PandasBugError(Exception): 21 | def __init__(self) -> None: 22 | msg = ( 23 | """Pandas before v0.25 has a known bug in method `rolling` when """ 24 | """parameter `window` is offset and `closed` is 'left'. Your """ 25 | """current execution is impacted by this bug. If you are using """ 26 | """Python 3.5.3 or later, please upgrade pandas to v0.25 or """ 27 | """later. 
If you are using Python 3.5.2 or earlier, please """ 28 | """consider using integer instead of offset to define the left """ 29 | """rolling window.""" 30 | ) 31 | super().__init__(msg) 32 | -------------------------------------------------------------------------------- /src/adtk/aggregator/__init__.py: -------------------------------------------------------------------------------- 1 | """Module of aggregators. 2 | 3 | An aggregator combines multiple lists of anomalies into one. 4 | 5 | """ 6 | from typing import Dict, Optional 7 | 8 | from .._aggregator_base import _Aggregator 9 | from .._utils import _get_all_subclasses_from_superclass 10 | from ._aggregator import AndAggregator, CustomizedAggregator, OrAggregator 11 | 12 | 13 | def print_all_models() -> None: 14 | """ 15 | Print description of every model in this module. 16 | """ 17 | model_desc = _get_all_subclasses_from_superclass( 18 | _Aggregator 19 | ) # type: Dict[str, Optional[str]] 20 | for key, value in model_desc.items(): 21 | print("-" * 80) 22 | print(key) 23 | print(value) 24 | 25 | 26 | __all__ = [ 27 | "OrAggregator", 28 | "AndAggregator", 29 | "CustomizedAggregator", 30 | "print_all_models", 31 | ] 32 | -------------------------------------------------------------------------------- /src/adtk/aggregator/_aggregator.py: -------------------------------------------------------------------------------- 1 | """Module for aggregators. 2 | 3 | An aggregator combines multiple lists of anomalies into one. 4 | 5 | """ 6 | 7 | from typing import Any, Callable, Dict, List, Optional, Tuple, Union 8 | 9 | import pandas as pd 10 | 11 | from .._aggregator_base import _Aggregator 12 | from ..data import validate_events 13 | 14 | 15 | class CustomizedAggregator(_Aggregator): 16 | """Aggregator derived from a user-given function and parameters. 17 | 18 | Parameters 19 | ---------- 20 | aggregate_func: function 21 | A function aggregating multiple types of anomaly. 22 | 23 | The first input argument must be a pandas DataFrame, a dict of pandas 24 | Series/DataFrame, or a dict of event lists. 25 | 26 | - If a pandas DataFrame, every column is a binary Series representing a 27 | type of anomaly. 28 | - If a dict of pandas Series/DataFrame, every value of the dict is a 29 | binary Series/DataFrame representing a type or some types of anomaly; 30 | - If a dict of list, every value of the dict is a type of anomaly as a 31 | list of events, where each event is represented as a pandas Timestamp 32 | if it is instantaneous or a 2-tuple of pandas Timestamps if it is a 33 | closed time interval. 34 | 35 | Optional input argument may be accepted through parameter 36 | `aggregate_func_params`. 37 | 38 | The output must be a list of pandas Timestamps. 39 | 40 | - If input is a pandas DataFrame or a dict of Series/DataFrame, return 41 | a single binary pandas Series; 42 | - If input is a dict of lists, return a single list of events. 43 | 44 | aggregate_func_params: dict, optional 45 | Parameters of `aggregate_func`. Default: None. 
46 | 47 | """ 48 | 49 | def __init__( 50 | self, 51 | aggregate_func: Callable, 52 | aggregate_func_params: Optional[Dict[str, Any]] = None, 53 | ) -> None: 54 | super().__init__() 55 | self.aggregate_func = aggregate_func 56 | self.aggregate_func_params = aggregate_func_params 57 | 58 | @property 59 | def _param_names(self) -> Tuple[str, ...]: 60 | return ("aggregate_func", "aggregate_func_params") 61 | 62 | def _predict_core( 63 | self, 64 | lists: Union[ 65 | pd.DataFrame, 66 | Dict[str, Union[pd.Series, pd.DataFrame]], 67 | Dict[ 68 | str, 69 | List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]], 70 | ], 71 | ], 72 | ) -> Union[ 73 | pd.Series, List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]] 74 | ]: 75 | if self.aggregate_func_params is None: 76 | aggregate_func_params = {} 77 | else: 78 | aggregate_func_params = self.aggregate_func_params 79 | return self.aggregate_func(lists, **aggregate_func_params) 80 | 81 | 82 | class OrAggregator(_Aggregator): 83 | """Aggregator that identifies a time point as anomalous as long as it is 84 | included in one of the input anomaly lists. 85 | """ 86 | 87 | def __init__(self) -> None: 88 | super().__init__() 89 | 90 | @property 91 | def _param_names(self) -> Tuple[str, ...]: 92 | return tuple() 93 | 94 | def _predict_core( 95 | self, 96 | lists: Union[ 97 | pd.DataFrame, 98 | Dict[str, Union[pd.Series, pd.DataFrame]], 99 | Dict[ 100 | str, 101 | List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]], 102 | ], 103 | ], 104 | ) -> Union[ 105 | pd.Series, List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]] 106 | ]: 107 | if isinstance(lists, dict): 108 | if isinstance(next(iter(lists.values())), list): 109 | clean_lists = { 110 | key: validate_events(value) for key, value in lists.items() 111 | } 112 | return validate_events( 113 | [ 114 | window 115 | for clean_predict in clean_lists.values() 116 | for window in clean_predict 117 | ] 118 | ) 119 | else: # a dict of pandas Series/DataFrame 120 | return self._predict_core( 121 | pd.concat(lists, join="outer", axis=1) 122 | ) 123 | else: # pandas DataFrame 124 | predicted = lists.any(axis=1) 125 | predicted[~predicted & lists.isna().any(axis=1)] = float("nan") 126 | return predicted 127 | 128 | 129 | class AndAggregator(_Aggregator): 130 | """Aggregator that identifies a time point as anomalous only if it is 131 | included in all the input anomaly lists. 
132 | """ 133 | 134 | def __init__(self) -> None: 135 | super().__init__() 136 | 137 | @property 138 | def _param_names(self) -> Tuple[str, ...]: 139 | return tuple() 140 | 141 | def _predict_core( 142 | self, 143 | lists: Union[ 144 | pd.DataFrame, 145 | Dict[str, Union[pd.Series, pd.DataFrame]], 146 | Dict[ 147 | str, 148 | List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]], 149 | ], 150 | ], 151 | ) -> Union[ 152 | pd.Series, List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]] 153 | ]: 154 | if isinstance(lists, dict): 155 | if isinstance(next(iter(lists.values())), list): 156 | clean_lists = { 157 | key: validate_events(value, point_as_interval=True) 158 | for key, value in lists.items() 159 | } 160 | time_window_stats = { 161 | key: pd.Series( 162 | [0] * len(clean_predict) 163 | + [1] * 2 * len(clean_predict) 164 | + [0] * len(clean_predict), 165 | index=( 166 | [ 167 | window[0] - pd.Timedelta("1ns") 168 | for window in clean_predict 169 | ] 170 | + [window[0] for window in clean_predict] 171 | + [window[1] for window in clean_predict] 172 | + [ 173 | window[1] + pd.Timedelta("1ns") 174 | for window in clean_predict 175 | ] 176 | ), 177 | dtype=int, 178 | ).sort_index() 179 | for key, clean_predict in clean_lists.items() 180 | } # type: Union[Dict, pd.Series] 181 | time_window_stats = { 182 | key: value[~value.index.duplicated()] 183 | for key, value in time_window_stats.items() 184 | } 185 | time_window_stats = ( 186 | pd.concat(time_window_stats, axis=1, join="outer") 187 | .fillna(method="ffill") 188 | .fillna(method="bfill") 189 | .fillna(0) 190 | ) 191 | time_window_stats = time_window_stats.all(axis=1) 192 | status = 0 193 | last_t = None 194 | aggregated_predict = [] 195 | for t, v in time_window_stats.items(): 196 | if (status == 0) and (v == 1): 197 | start = t 198 | status = 1 199 | if (status == 1) and (v == 0): 200 | end = last_t 201 | aggregated_predict.append((start, end)) 202 | status = 0 203 | last_t = t 204 | return validate_events(aggregated_predict) 205 | else: # a dict of pandas Series/DataFrame 206 | return self._predict_core( 207 | pd.concat(lists, join="outer", axis=1) 208 | ) 209 | else: # pandas DataFrame 210 | predicted = lists.all(axis=1) 211 | predicted[predicted & lists.isna().any(axis=1)] = float("nan") 212 | return predicted 213 | -------------------------------------------------------------------------------- /src/adtk/data/__init__.py: -------------------------------------------------------------------------------- 1 | """Module of data processing.""" 2 | 3 | from ._data import ( 4 | expand_events, 5 | split_train_test, 6 | to_events, 7 | to_labels, 8 | validate_events, 9 | validate_series, 10 | ) 11 | 12 | __all__ = [ 13 | "validate_series", 14 | "to_events", 15 | "to_labels", 16 | "expand_events", 17 | "validate_events", 18 | "split_train_test", 19 | ] 20 | -------------------------------------------------------------------------------- /src/adtk/detector/__init__.py: -------------------------------------------------------------------------------- 1 | """Module of detectors. 2 | 3 | A detector detects anomalous time points from time series. 
4 | 5 | """ 6 | from .._detector_base import ( # _NonTrainableMultivariateDetector, 7 | _NonTrainableUnivariateDetector, 8 | _TrainableMultivariateDetector, 9 | _TrainableUnivariateDetector, 10 | ) 11 | from .._utils import _get_all_subclasses_from_superclass 12 | from ._detector_1d import ( 13 | AutoregressionAD, 14 | CustomizedDetector1D, 15 | GeneralizedESDTestAD, 16 | InterQuartileRangeAD, 17 | LevelShiftAD, 18 | PersistAD, 19 | QuantileAD, 20 | SeasonalAD, 21 | ThresholdAD, 22 | VolatilityShiftAD, 23 | ) 24 | from ._detector_hd import ( 25 | CustomizedDetectorHD, 26 | MinClusterDetector, 27 | OutlierDetector, 28 | PcaAD, 29 | RegressionAD, 30 | ) 31 | 32 | 33 | def print_all_models() -> None: 34 | """ 35 | Print description of every model in this module. 36 | """ 37 | model_desc = _get_all_subclasses_from_superclass( 38 | _NonTrainableUnivariateDetector 39 | ) 40 | # model_desc.update( 41 | # _get_all_subclasses_from_superclass(_NonTrainableMultivariateDetector) 42 | # ) 43 | model_desc.update( 44 | _get_all_subclasses_from_superclass(_TrainableUnivariateDetector) 45 | ) 46 | model_desc.update( 47 | _get_all_subclasses_from_superclass(_TrainableMultivariateDetector) 48 | ) 49 | for key, value in model_desc.items(): 50 | print("-" * 80) 51 | print(key) 52 | print(value) 53 | 54 | 55 | __all__ = [ 56 | "ThresholdAD", 57 | "QuantileAD", 58 | "InterQuartileRangeAD", 59 | "GeneralizedESDTestAD", 60 | "PersistAD", 61 | "LevelShiftAD", 62 | "VolatilityShiftAD", 63 | "AutoregressionAD", 64 | "SeasonalAD", 65 | "CustomizedDetector1D", 66 | "MinClusterDetector", 67 | "OutlierDetector", 68 | "RegressionAD", 69 | "PcaAD", 70 | "CustomizedDetectorHD", 71 | "print_all_models", 72 | ] 73 | -------------------------------------------------------------------------------- /src/adtk/detector/_detector_hd.py: -------------------------------------------------------------------------------- 1 | """Module for high-dimensional detectors. 2 | 3 | High-dimensional detectors detect anomalies from high-dimensional time series, 4 | i.e. from pandas DataFrame. 5 | """ 6 | 7 | from collections import Counter 8 | from typing import Any, Callable, Dict, Optional, Tuple 9 | 10 | import pandas as pd 11 | 12 | from .._detector_base import _TrainableMultivariateDetector 13 | from ..aggregator import AndAggregator 14 | from ..detector import InterQuartileRangeAD, ThresholdAD 15 | from ..pipe import Pipeline, Pipenet 16 | from ..transformer import ( 17 | CustomizedTransformer1D, 18 | PcaReconstructionError, 19 | RegressionResidual, 20 | ) 21 | 22 | 23 | class CustomizedDetectorHD(_TrainableMultivariateDetector): 24 | """Multivariate detector derived from a user-given function and parameters. 25 | 26 | Parameters 27 | ---------- 28 | detect_func: function 29 | A function detecting anomalies from multivariate time series. 30 | 31 | The first input argument must be a pandas DataFrame, optional input 32 | argument may be accepted through parameter `detect_func_params` and the 33 | output of `fit_func`, and the output must be a binary pandas Series 34 | with the same index as input. 35 | 36 | detect_func_params: dict, optional 37 | Parameters of `detect_func`. Default: None. 38 | 39 | fit_func: function, optional 40 | A function training parameters of `detect_func` with multivariate time 41 | series. 42 | 43 | The first input argument must be a pandas Series, optional input 44 | argument may be accepted through parameter `fit_func_params`, and the 45 | output must be a dict that can be used by `detect_func` as parameters. 
46 | Default: None. 47 | 48 | fit_func_params: dict, optional 49 | Parameters of `fit_func`. Default: None. 50 | 51 | """ 52 | 53 | def __init__( 54 | self, 55 | detect_func: Callable, 56 | detect_func_params: Optional[Dict[str, Any]] = None, 57 | fit_func: Optional[Callable] = None, 58 | fit_func_params: Optional[Dict[str, Any]] = None, 59 | ) -> None: 60 | self._fitted_detect_func_params = {} # type: Dict 61 | super().__init__() 62 | self.detect_func = detect_func 63 | self.detect_func_params = detect_func_params 64 | self.fit_func = fit_func 65 | self.fit_func_params = fit_func_params 66 | if self.fit_func is None: 67 | self._fitted = 1 68 | 69 | @property 70 | def _param_names(self) -> Tuple[str, ...]: 71 | return ( 72 | "detect_func", 73 | "detect_func_params", 74 | "fit_func", 75 | "fit_func_params", 76 | ) 77 | 78 | def _fit_core(self, df: pd.DataFrame) -> None: 79 | if self.fit_func is not None: 80 | if self.fit_func_params is not None: 81 | fit_func_params = self.fit_func_params 82 | else: 83 | fit_func_params = {} 84 | self._fitted_detect_func_params = self.fit_func( 85 | df, **fit_func_params 86 | ) 87 | 88 | def _predict_core(self, df: pd.DataFrame) -> pd.Series: 89 | if self.detect_func_params is not None: 90 | detect_func_params = self.detect_func_params 91 | else: 92 | detect_func_params = {} 93 | if self.fit_func is not None: 94 | return self.detect_func( 95 | df, **{**self._fitted_detect_func_params, **detect_func_params} 96 | ) 97 | else: 98 | return self.detect_func(df, **detect_func_params) 99 | 100 | 101 | class MinClusterDetector(_TrainableMultivariateDetector): 102 | """Detector that detects anomaly based on clustering of historical data. 103 | 104 | This detector performs clustering using a clustering model, and identifies 105 | a time point as anomalous if it belongs to the minimal cluster. 106 | 107 | Parameters 108 | ---------- 109 | model: object 110 | A clustering model to be used for clustering time series values. Same 111 | as a clustering model in scikit-learn, the model should minimally have 112 | a `fit` method and a `predict` method. The `predict` method should 113 | return an array of cluster labels. 114 | 115 | """ 116 | 117 | def __init__(self, model: Any) -> None: 118 | super().__init__() 119 | self.model = model 120 | 121 | @property 122 | def _param_names(self) -> Tuple[str, ...]: 123 | return ("model",) 124 | 125 | def _fit_core(self, df: pd.DataFrame) -> None: 126 | if df.dropna().empty: 127 | raise RuntimeError("Valid values are not enough for training.") 128 | clustering_result = self.model.fit_predict(df.dropna()) 129 | cluster_count = Counter(clustering_result) # type: Counter 130 | self._anomalous_cluster_id = cluster_count.most_common()[-1][0] 131 | 132 | def _predict_core(self, df: pd.DataFrame) -> pd.Series: 133 | cluster_id = pd.Series(float("nan"), index=df.index) 134 | if not df.dropna().empty: 135 | cluster_id.loc[df.dropna().index] = self.model.predict(df.dropna()) 136 | predicted = pd.Series( 137 | cluster_id == self._anomalous_cluster_id, index=df.index 138 | ) 139 | predicted[cluster_id.isna()] = float("nan") 140 | return predicted 141 | 142 | 143 | class OutlierDetector(_TrainableMultivariateDetector): 144 | """Detector that detects anomaly based on an outlier detection model. 145 | 146 | This detector performs time-independent outlier detection using the given model, 147 | and identifies a time point as anomalous if it is labelled as an outlier.
148 | 149 | Parameters 150 | ---------- 151 | model: object 152 | An outlier detection model to be used. Same as a outlier detection 153 | model in scikit-learn (e.g. EllipticEnvelope, IsolationForest, 154 | LocalOutlierFactor), the model should minimally have a `fit_predict` 155 | method, or `fit` and `predict` methods. The `fit_predict` or `predict` 156 | method should return an array of outlier indicators where outliers are 157 | marked by -1. 158 | 159 | """ 160 | 161 | def __init__(self, model: Any) -> None: 162 | super().__init__() 163 | self.model = model 164 | 165 | @property 166 | def _param_names(self) -> Tuple[str, ...]: 167 | return ("model",) 168 | 169 | def _fit_core(self, df: pd.DataFrame) -> None: 170 | if hasattr(self.model, "fit"): 171 | if df.dropna().empty: 172 | raise RuntimeError("Valid values are not enough for training.") 173 | self.model.fit(df.dropna()) 174 | 175 | def _predict_core(self, df: pd.DataFrame) -> pd.Series: 176 | is_outliers = pd.Series(float("nan"), index=df.index) 177 | if not df.dropna().empty: 178 | if hasattr(self.model, "predict"): 179 | is_outliers.loc[df.dropna().index] = ( 180 | self.model.predict(df.dropna()) == -1 181 | ) 182 | else: 183 | is_outliers.loc[df.dropna().index] = ( 184 | self.model.fit_predict(df.dropna()) == -1 185 | ) 186 | predicted = pd.Series(is_outliers == 1, index=df.index) 187 | predicted[is_outliers.isna()] = float("nan") 188 | return predicted 189 | 190 | 191 | # ============================================================================= 192 | # PLEASE PUT PIPE-DERIVED DETECTOR CLASSES BELOW THIS LINE 193 | # ============================================================================= 194 | 195 | 196 | class RegressionAD(_TrainableMultivariateDetector): 197 | """Detector that detects anomalous inter-series relationship. 198 | 199 | This detector performs regression to build relationship between a target 200 | series and the rest of series, and identifies a time point as anomalous 201 | when the residual of regression is anomalously large. 202 | 203 | This detector is internally implemented as a `Pipenet` object. Advanced 204 | users may learn more details by checking attribute `pipe_`. 205 | 206 | Parameters 207 | ---------- 208 | target: str 209 | Name of the column to be regarded as target variable. 210 | 211 | regressor: object 212 | Regressor to be used. Same as a scikit-learn regressor, it should 213 | minimally have `fit` and `predict` methods. 214 | 215 | c: float, optional 216 | Factor used to determine the bound of normal range based on historical 217 | interquartile range. Default: 3.0. 218 | 219 | side: str, optional 220 | - If "both", to detect anomalous positive and negative residuals; 221 | - If "positive", to only detect anomalous positive residuals; 222 | - If "negative", to only detect anomalous negative residuals. 223 | 224 | Default: "both". 225 | 226 | Attributes 227 | ---------- 228 | pipe_: adtk.pipe.Pipenet 229 | Internal pipenet object. 
230 | 231 | """ 232 | 233 | def __init__( 234 | self, regressor: Any, target: str, c: float = 3.0, side: str = "both" 235 | ) -> None: 236 | self.pipe_ = Pipenet( 237 | { 238 | "regression_residual": { 239 | "model": RegressionResidual( 240 | regressor=regressor, target=target 241 | ), 242 | "input": "original", 243 | }, 244 | "abs_residual": { 245 | "model": CustomizedTransformer1D(transform_func=abs), 246 | "input": "regression_residual", 247 | }, 248 | "iqr_ad": { 249 | "model": InterQuartileRangeAD((None, c)), 250 | "input": "abs_residual", 251 | }, 252 | "sign_check": { 253 | "model": ThresholdAD( 254 | high=( 255 | 0.0 256 | if side == "positive" 257 | else ( 258 | float("inf") 259 | if side == "negative" 260 | else -float("inf") 261 | ) 262 | ), 263 | low=( 264 | 0.0 265 | if side == "negative" 266 | else ( 267 | -float("inf") 268 | if side == "positive" 269 | else float("inf") 270 | ) 271 | ), 272 | ), 273 | "input": "regression_residual", 274 | }, 275 | "and": { 276 | "model": AndAggregator(), 277 | "input": ["iqr_ad", "sign_check"], 278 | }, 279 | } 280 | ) 281 | super().__init__() 282 | self.regressor = regressor 283 | self.target = target 284 | self.side = side 285 | self.c = c 286 | self._sync_params() 287 | 288 | @property 289 | def _param_names(self) -> Tuple[str, ...]: 290 | return ("regressor", "target", "c", "side") 291 | 292 | def _sync_params(self) -> None: 293 | if self.side not in ["both", "positive", "negative"]: 294 | raise ValueError( 295 | "Parameter `side` must be 'both', 'positive' or 'negative'." 296 | ) 297 | self.pipe_.steps["regression_residual"][ 298 | "model" 299 | ].regressor = self.regressor 300 | self.pipe_.steps["regression_residual"]["model"].set_params( 301 | target=self.target 302 | ) 303 | self.pipe_.steps["iqr_ad"]["model"].set_params(c=(None, self.c)) 304 | self.pipe_.steps["sign_check"]["model"].set_params( 305 | high=( 306 | 0.0 307 | if self.side == "positive" 308 | else ( 309 | float("inf") if self.side == "negative" else -float("inf") 310 | ) 311 | ), 312 | low=( 313 | 0.0 314 | if self.side == "negative" 315 | else ( 316 | -float("inf") if self.side == "positive" else float("inf") 317 | ) 318 | ), 319 | ) 320 | 321 | def _fit_core(self, s: pd.DataFrame) -> None: 322 | self._sync_params() 323 | self.pipe_.fit(s) 324 | 325 | def _predict_core(self, s: pd.DataFrame) -> pd.Series: 326 | self._sync_params() 327 | return self.pipe_.detect(s) 328 | 329 | 330 | class PcaAD(_TrainableMultivariateDetector): 331 | """Detector that detects outlier point with principal component analysis. 332 | 333 | This detector performs principal component analysis (PCA) to the 334 | multivariate time series (every time point is treated as a point in high- 335 | dimensional space), measures reconstruction error at every time point, and 336 | identifies a time point as anomalous when the recontruction error is beyond 337 | anomalously large. 338 | 339 | This detector is internally implemented as a `Pipeline` object. Advanced 340 | users may learn more details by checking attribute `pipe_`. 341 | 342 | Parameters 343 | ---------- 344 | k: int, optional 345 | Number of principal components to use. Default: 1. 346 | 347 | c: float, optional 348 | Factor used to determine the bound of normal range based on historical 349 | interquartile range. Default: 5.0. 350 | 351 | Attributes 352 | ---------- 353 | pipe_: adtk.pipe.Pipenet 354 | Internal pipenet object. 
355 | """ 356 | 357 | def __init__(self, k: int = 1, c: float = 5.0) -> None: 358 | self.pipe_ = Pipeline( 359 | [ 360 | ("pca_reconstruct_error", PcaReconstructionError(k=k)), 361 | ("ad", InterQuartileRangeAD(c=c)), 362 | ] 363 | ) 364 | super().__init__() 365 | self.k = k 366 | self.c = c 367 | self._sync_params() 368 | 369 | @property 370 | def _param_names(self) -> Tuple[str, ...]: 371 | return ("k", "c") 372 | 373 | def _sync_params(self) -> None: 374 | self.pipe_.steps[0][1].set_params(k=self.k) 375 | self.pipe_.steps[1][1].set_params(c=self.c) 376 | 377 | def _fit_core(self, s: pd.DataFrame) -> None: 378 | self._sync_params() 379 | self.pipe_.fit(s) 380 | 381 | def _predict_core(self, s: pd.DataFrame) -> pd.Series: 382 | self._sync_params() 383 | return self.pipe_.detect(s) 384 | -------------------------------------------------------------------------------- /src/adtk/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module of metrics that measure the quality of detection results against true 3 | anomalies. 4 | """ 5 | 6 | from ._metrics import f1_score, iou, precision, recall 7 | 8 | __all__ = ["recall", "precision", "f1_score", "iou"] 9 | -------------------------------------------------------------------------------- /src/adtk/pipe/__init__.py: -------------------------------------------------------------------------------- 1 | """Module of model pipeline and pipenet. 2 | 3 | Pipeline or Pipenet connects multiple components (transformers, detectors, 4 | and/or aggregators) into a model that may perform complex anomaly detection 5 | process. 6 | 7 | """ 8 | 9 | from ._pipe import Pipeline, Pipenet 10 | 11 | __all__ = ["Pipeline", "Pipenet"] 12 | -------------------------------------------------------------------------------- /src/adtk/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | """Module of transformers. 2 | 3 | A transformer transforms time series to extract useful information. 4 | 5 | """ 6 | from .._transformer_base import ( 7 | _NonTrainableMultivariateTransformer, 8 | _NonTrainableUnivariateTransformer, 9 | _TrainableMultivariateTransformer, 10 | _TrainableUnivariateTransformer, 11 | ) 12 | from .._utils import _get_all_subclasses_from_superclass 13 | from ._transformer_1d import ( 14 | ClassicSeasonalDecomposition, 15 | CustomizedTransformer1D, 16 | DoubleRollingAggregate, 17 | Retrospect, 18 | RollingAggregate, 19 | StandardScale, 20 | ) 21 | from ._transformer_hd import ( 22 | CustomizedTransformerHD, 23 | PcaProjection, 24 | PcaReconstruction, 25 | PcaReconstructionError, 26 | RegressionResidual, 27 | SumAll, 28 | ) 29 | 30 | 31 | def print_all_models() -> None: 32 | """ 33 | Print description of every model in this module. 
34 | """ 35 | model_desc = _get_all_subclasses_from_superclass( 36 | _NonTrainableUnivariateTransformer 37 | ) 38 | model_desc.update( 39 | _get_all_subclasses_from_superclass( 40 | _NonTrainableMultivariateTransformer 41 | ) 42 | ) 43 | model_desc.update( 44 | _get_all_subclasses_from_superclass(_TrainableUnivariateTransformer) 45 | ) 46 | model_desc.update( 47 | _get_all_subclasses_from_superclass(_TrainableMultivariateTransformer) 48 | ) 49 | for key, value in model_desc.items(): 50 | print("-" * 80) 51 | print(key) 52 | print(value) 53 | 54 | 55 | __all__ = [ 56 | "RollingAggregate", 57 | "DoubleRollingAggregate", 58 | "ClassicSeasonalDecomposition", 59 | "Retrospect", 60 | "StandardScale", 61 | "CustomizedTransformer1D", 62 | "RegressionResidual", 63 | "PcaProjection", 64 | "PcaReconstruction", 65 | "PcaReconstructionError", 66 | "SumAll", 67 | "CustomizedTransformerHD", 68 | "print_all_models", 69 | ] 70 | -------------------------------------------------------------------------------- /src/adtk/transformer/_transformer_hd.py: -------------------------------------------------------------------------------- 1 | """Module for high-dimensional transformers. 2 | 3 | High-dimensional transformers transform hight-dimensional time series, i.e. 4 | pandas DataFrame, into different series, to extract useful information out of 5 | the original time series. 6 | 7 | """ 8 | 9 | from typing import Any, Callable, Dict, Optional, Tuple, Union 10 | 11 | import pandas as pd 12 | from sklearn.decomposition import PCA 13 | 14 | from .._transformer_base import ( 15 | _NonTrainableMultivariateTransformer, 16 | _TrainableMultivariateTransformer, 17 | ) 18 | 19 | 20 | class CustomizedTransformerHD(_TrainableMultivariateTransformer): 21 | """Multivariate transformer derived from a user-given function and parameters. 22 | 23 | Parameters 24 | ---------- 25 | Parameters 26 | ---------- 27 | transform_func: function 28 | A function transforming multivariate time series. 29 | 30 | The first input argument must be a pandas DataFrame, optional input 31 | argument may be accepted through parameter `transform_func_params` and 32 | the output of `fit_func`, and the output must be a pandas Series or 33 | DataFrame with the same index as input. 34 | 35 | transform_func_params: dict, optional 36 | Parameters of `transform_func`. Default: None. 37 | 38 | fit_func: function, optional 39 | A function training parameters of `transform_func` with multivariate 40 | time series. 41 | 42 | The first input argument must be a pandas DataFrame, optional input 43 | argument may be accepted through parameter `fit_func_params`, and the 44 | output must be a dict that can be used by `transform_func` as 45 | parameters. Default: None. 46 | 47 | fit_func_params: dict, optional 48 | Parameters of `fit_func`. Default: None. 
49 | 50 | """ 51 | 52 | def __init__( 53 | self, 54 | transform_func: Callable, 55 | transform_func_params: Optional[Dict[str, Any]] = None, 56 | fit_func: Optional[Callable] = None, 57 | fit_func_params: Optional[Dict[str, Any]] = None, 58 | ) -> None: 59 | self._fitted_transform_func_params = {} # type: Dict 60 | super().__init__() 61 | self.transform_func = transform_func 62 | self.transform_func_params = transform_func_params 63 | self.fit_func = fit_func 64 | self.fit_func_params = fit_func_params 65 | if self.fit_func is None: 66 | self._fitted = 1 67 | 68 | @property 69 | def _param_names(self) -> Tuple[str, ...]: 70 | return ( 71 | "transform_func", 72 | "transform_func_params", 73 | "fit_func", 74 | "fit_func_params", 75 | ) 76 | 77 | def _fit_core(self, df: pd.DataFrame) -> None: 78 | if self.fit_func is not None: 79 | if self.fit_func_params is not None: 80 | fit_func_params = self.fit_func_params 81 | else: 82 | fit_func_params = {} 83 | self._fitted_transform_func_params = self.fit_func( 84 | df, **fit_func_params 85 | ) 86 | 87 | def _predict_core( 88 | self, df: pd.DataFrame 89 | ) -> Union[pd.Series, pd.DataFrame]: 90 | if self.transform_func_params is not None: 91 | transform_func_params = self.transform_func_params 92 | else: 93 | transform_func_params = {} 94 | if self.fit_func is not None: 95 | return self.transform_func( 96 | df, 97 | **{ 98 | **self._fitted_transform_func_params, 99 | **transform_func_params, 100 | } 101 | ) 102 | else: 103 | return self.transform_func(df, **transform_func_params) 104 | 105 | 106 | class SumAll(_NonTrainableMultivariateTransformer): 107 | """Transformer that returns the sum of all series as one series.""" 108 | 109 | def __init__(self) -> None: 110 | super().__init__() 111 | 112 | @property 113 | def _param_names(self) -> Tuple[str, ...]: 114 | return tuple() 115 | 116 | def _predict_core(self, df: pd.DataFrame) -> pd.Series: 117 | return df.sum(axis=1, skipna=False) 118 | 119 | 120 | class RegressionResidual(_TrainableMultivariateTransformer): 121 | """Transformer that performs regression to build a relationship between a 122 | target series and the rest of the series, and returns the regression residual 123 | series. 124 | 125 | Parameters 126 | ---------- 127 | regressor: object 128 | Regressor to be used. Same as a scikit-learn regressor, it should 129 | minimally have `fit` and `predict` methods. 130 | target: str 131 | Name of the column to be regarded as the target variable.
132 | 133 | """ 134 | 135 | def __init__(self, regressor: Any, target: str) -> None: 136 | super().__init__() 137 | self.regressor = regressor 138 | self.target = target 139 | 140 | @property 141 | def _param_names(self) -> Tuple[str, ...]: 142 | return ("regressor", "target") 143 | 144 | def _fit_core(self, df: pd.DataFrame) -> None: 145 | if self.target not in df.columns: 146 | raise RuntimeError( 147 | "Cannot find target series {} in input dataframe.".format( 148 | self.target 149 | ) 150 | ) 151 | self._target = self.target 152 | self._features = [col for col in df.columns if col != self._target] 153 | if df.dropna().empty: 154 | raise RuntimeError("Valid values are not enough for training.") 155 | self.regressor.fit( 156 | df.dropna().loc[:, self._features], 157 | df.dropna().loc[:, self._target], 158 | ) 159 | 160 | def _predict_core(self, df: pd.DataFrame) -> pd.Series: 161 | target = self._target 162 | features = self._features 163 | if target not in df.columns: 164 | raise RuntimeError( 165 | "Cannot find target series {} in input dataframe.".format( 166 | target 167 | ) 168 | ) 169 | if not set(features) <= set(df.columns): 170 | raise RuntimeError( 171 | "The following series are not found in input dataframe: {}.".format( 172 | set(features) - set(df.columns) 173 | ) 174 | ) 175 | residual = pd.Series(index=df.index, dtype="float64") 176 | if not df.dropna().empty: 177 | residual.loc[df.dropna().index] = df.dropna().loc[ 178 | :, target 179 | ] - self.regressor.predict(df.dropna().loc[:, features]) 180 | return residual 181 | 182 | 183 | class PcaProjection(_TrainableMultivariateTransformer): 184 | """Transformer that performs principal component analysis (PCA) to the 185 | multivariate time series (every time point is treated as a point in high- 186 | dimensional space), and represents those points with their projection on 187 | the first k principal components. 188 | 189 | Parameters 190 | ---------- 191 | k: int, optional 192 | Number of principal components to use. Default: 1. 193 | 194 | """ 195 | 196 | def __init__(self, k: int = 1) -> None: 197 | self._model = None # type: Any 198 | super().__init__() 199 | self.k = k 200 | 201 | @property 202 | def _param_names(self) -> Tuple[str, ...]: 203 | return ("k",) 204 | 205 | def _fit_core(self, df: pd.DataFrame) -> None: 206 | self._model = PCA(n_components=self.k) 207 | if df.dropna().empty: 208 | raise RuntimeError("Valid values are not enough for training.") 209 | self._model.fit(df.dropna().values) 210 | 211 | def _predict_core(self, df: pd.DataFrame) -> pd.DataFrame: 212 | if self.k > self._model.n_components: 213 | raise ValueError( 214 | "k is increased after previous fitting. Please fit again." 215 | ) 216 | results = pd.DataFrame( 217 | index=df.index, columns=["pc{}".format(i) for i in range(self.k)] 218 | ) 219 | if not df.dropna().empty: 220 | results.loc[df.dropna().index] = self._model.transform( 221 | df.dropna().values 222 | )[:, : self.k] 223 | return results 224 | 225 | 226 | class PcaReconstruction(_TrainableMultivariateTransformer): 227 | """Transformer that performs principal component analysis (PCA) to the 228 | multivariate time series (every time point is treated as a point in high- 229 | dimensional space), and reconstructs those points with the first k 230 | principal components. 231 | 232 | Parameters 233 | ---------- 234 | k: int, optional 235 | Number of principal components to use. Default: 1. 
236 | 237 | """ 238 | 239 | def __init__(self, k: int = 1) -> None: 240 | self._model = None # type: Any 241 | super().__init__() 242 | self.k = k 243 | 244 | @property 245 | def _param_names(self) -> Tuple[str, ...]: 246 | return ("k",) 247 | 248 | def _fit_core(self, df: pd.DataFrame) -> None: 249 | self._model = PCA(n_components=self.k) 250 | if df.dropna().empty: 251 | raise RuntimeError("Valid values are not enough for training.") 252 | self._model.fit(df.dropna().values) 253 | 254 | def _predict_core(self, df: pd.DataFrame) -> pd.DataFrame: 255 | if self._model is None: 256 | raise RuntimeError("Please fit the model first.") 257 | if self.k > self._model.n_components: 258 | raise ValueError( 259 | "k is increased after previous fitting. Please fit again." 260 | ) 261 | results = pd.DataFrame(columns=df.columns, index=df.index) 262 | if not df.dropna().empty: 263 | results.loc[df.dropna().index] = self._model.inverse_transform( 264 | self._model.transform(df.dropna().values) 265 | ) 266 | return results 267 | 268 | 269 | class PcaReconstructionError(_TrainableMultivariateTransformer): 270 | """Transformer that performs principal component analysis (PCA) to the 271 | multivariate time series (every time point is treated as a point in high- 272 | dimensional space), reconstructs those points with the first k principal 273 | components, and returns the reconstruction error (i.e. squared distance 274 | between the reconstructed point and the original point). 275 | 276 | Parameters 277 | ---------- 278 | k: int, optional 279 | Number of principal components to use. Default: 1. 280 | 281 | """ 282 | 283 | def __init__(self, k: int = 1) -> None: 284 | self._model = None # type: Any 285 | super().__init__() 286 | self.k = k 287 | 288 | @property 289 | def _param_names(self) -> Tuple[str, ...]: 290 | return ("k",) 291 | 292 | def _fit_core(self, df: pd.DataFrame) -> None: 293 | self._model = PCA(n_components=self.k) 294 | if df.dropna().empty: 295 | raise RuntimeError("Valid values are not enough for training.") 296 | self._model.fit(df.dropna().values) 297 | 298 | def _predict_core(self, df: pd.DataFrame) -> pd.Series: 299 | if self._model is None: 300 | raise RuntimeError("Please fit the model first.") 301 | if self.k > self._model.n_components: 302 | raise ValueError( 303 | "k is increased after previous fitting. Please fit again."
304 | ) 305 | results = pd.DataFrame(columns=df.columns, index=df.index) 306 | if not df.dropna().empty: 307 | results.loc[df.dropna().index] = self._model.inverse_transform( 308 | self._model.transform(df.dropna().values) 309 | ) 310 | return ((results - df) ** 2).sum(axis=1, skipna=False) 311 | -------------------------------------------------------------------------------- /src/adtk/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | """Module of visualization.""" 2 | 3 | from ._visualization import plot 4 | 5 | __all__ = ["plot"] 6 | -------------------------------------------------------------------------------- /tests/test_aggregators.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pandas import Timestamp 3 | 4 | import adtk.aggregator as aggt 5 | 6 | 7 | def test_or_dict_of_lists(): 8 | """ 9 | Test OrAggregator with input as a dict of lists of time stamps or time 10 | stamp 2-tuples 11 | """ 12 | lists = { 13 | "A": [ 14 | (Timestamp("2017-1-1"), Timestamp("2017-1-2")), 15 | (Timestamp("2017-1-5"), Timestamp("2017-1-8")), 16 | Timestamp("2017-1-10"), 17 | ], 18 | "B": [ 19 | Timestamp("2017-1-2"), 20 | (Timestamp("2017-1-3"), Timestamp("2017-1-6")), 21 | Timestamp("2017-1-8"), 22 | (Timestamp("2017-1-7"), Timestamp("2017-1-9")), 23 | (Timestamp("2017-1-11"), Timestamp("2017-1-11")), 24 | ], 25 | } 26 | assert aggt.OrAggregator().aggregate(lists) == [ 27 | (Timestamp("2017-01-01 00:00:00"), Timestamp("2017-01-02 00:00:00")), 28 | (Timestamp("2017-01-03 00:00:00"), Timestamp("2017-01-09 00:00:00")), 29 | Timestamp("2017-1-10"), 30 | Timestamp("2017-1-11"), 31 | ] 32 | 33 | lists = { 34 | "A": [ 35 | (Timestamp("2017-1-1"), Timestamp("2017-1-2")), 36 | (Timestamp("2017-1-5"), Timestamp("2017-1-8")), 37 | Timestamp("2017-1-10"), 38 | ], 39 | "B": [], 40 | } 41 | assert aggt.OrAggregator().aggregate(lists) == [ 42 | (Timestamp("2017-1-1"), Timestamp("2017-1-2")), 43 | (Timestamp("2017-1-5"), Timestamp("2017-1-8")), 44 | Timestamp("2017-1-10"), 45 | ] 46 | 47 | 48 | def test_or_df(): 49 | """ 50 | Test OrAggregator with input as a DataFrame 51 | """ 52 | df = pd.DataFrame( 53 | [[1, 1], [1, 0], [0, 1], [0, 0], [float("nan"), 1], [0, float("nan")]], 54 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 55 | ) 56 | pd.testing.assert_series_equal( 57 | aggt.OrAggregator().aggregate(df), 58 | pd.Series( 59 | [1, 1, 1, 0, 1, float("nan")], 60 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 61 | ), 62 | ) 63 | 64 | 65 | def test_or_dict_of_dfs(): 66 | """ 67 | Test OrAggregator with input as a dict of DataFrame 68 | """ 69 | df1 = pd.DataFrame( 70 | [[1, 1], [1, 0], [0, 1], [0, 0], [float("nan"), 1], [0, float("nan")]], 71 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 72 | ) 73 | df2 = pd.DataFrame( 74 | [[1, 1], [1, 0], [0, 1], [0, 0], [float("nan"), 1], [0, float("nan")]], 75 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 76 | ) 77 | pd.testing.assert_series_equal( 78 | aggt.OrAggregator().aggregate({"A": df1, "B": df2}), 79 | pd.Series( 80 | [1, 1, 1, 0, 1, float("nan")], 81 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 82 | ), 83 | ) 84 | 85 | 86 | def test_and_dict_of_lists(): 87 | """ 88 | Test AndAggregator with input as a dict of lists of time stamps or time 89 | stamp 2-tuples 90 | """ 91 | lists = { 92 | "A": [ 93 | (Timestamp("2017-1-1"), Timestamp("2017-1-2")), 94 | 
(Timestamp("2017-1-5"), Timestamp("2017-1-8")), 95 | Timestamp("2017-1-10"), 96 | ], 97 | "B": [ 98 | Timestamp("2017-1-2"), 99 | (Timestamp("2017-1-3"), Timestamp("2017-1-6")), 100 | Timestamp("2017-1-8"), 101 | (Timestamp("2017-1-7"), Timestamp("2017-1-9")), 102 | (Timestamp("2017-1-11"), Timestamp("2017-1-11")), 103 | ], 104 | } 105 | assert aggt.AndAggregator().aggregate(lists) == [ 106 | Timestamp("2017-1-2"), 107 | (Timestamp("2017-01-05 00:00:00"), Timestamp("2017-01-06 00:00:00")), 108 | (Timestamp("2017-1-7 00:00:00"), Timestamp("2017-1-8 00:00:00")), 109 | ] 110 | 111 | lists = { 112 | "A": [ 113 | (Timestamp("2017-1-1"), Timestamp("2017-1-2")), 114 | (Timestamp("2017-1-5"), Timestamp("2017-1-8")), 115 | Timestamp("2017-1-10"), 116 | ], 117 | "B": [], 118 | } 119 | assert aggt.AndAggregator().aggregate(lists) == [] 120 | 121 | 122 | def test_and_df(): 123 | """ 124 | Test AndAggregator with input as a DataFrame 125 | """ 126 | df = pd.DataFrame( 127 | [[1, 1], [1, 0], [0, 1], [0, 0], [float("nan"), 1], [0, float("nan")]], 128 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 129 | ) 130 | pd.testing.assert_series_equal( 131 | aggt.AndAggregator().aggregate(df), 132 | pd.Series( 133 | [1, 0, 0, 0, float("nan"), 0], 134 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 135 | ), 136 | ) 137 | 138 | 139 | def test_and_dict_of_dfs(): 140 | """ 141 | Test AndAggregator with input as a dict of DataFrame 142 | """ 143 | df1 = pd.DataFrame( 144 | [[1, 1], [1, 0], [0, 1], [0, 0], [float("nan"), 1], [0, float("nan")]], 145 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 146 | ) 147 | df2 = pd.DataFrame( 148 | [[1, 1], [1, 0], [0, 1], [0, 0], [float("nan"), 1], [0, float("nan")]], 149 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 150 | ) 151 | pd.testing.assert_series_equal( 152 | aggt.AndAggregator().aggregate({"A": df1, "B": df2}), 153 | pd.Series( 154 | [1, 0, 0, 0, float("nan"), 0], 155 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 156 | ), 157 | ) 158 | 159 | 160 | def test_customized_aggregator(): 161 | """ 162 | Test customized aggregate 163 | """ 164 | 165 | def myAggFunc(df, agg="and"): 166 | if agg == "and": 167 | return df.all(axis=1) 168 | elif agg == "or": 169 | return df.any(axis=1) 170 | else: 171 | raise ValueError("`agg` must be either 'and' or 'or'.") 172 | 173 | model = aggt.CustomizedAggregator(myAggFunc) 174 | 175 | df = pd.DataFrame( 176 | [[1, 1], [1, 0], [0, 1], [0, 0]], 177 | index=pd.date_range(start="2017-1-1", periods=4, freq="D"), 178 | ) 179 | 180 | pd.testing.assert_series_equal( 181 | model.aggregate(df), 182 | pd.Series([True, False, False, False], index=df.index), 183 | ) 184 | 185 | model.aggregate_func_params = {"agg": "or"} 186 | pd.testing.assert_series_equal( 187 | model.aggregate(df), 188 | pd.Series([True, True, True, False], index=df.index), 189 | ) 190 | -------------------------------------------------------------------------------- /tests/test_attribute.py: -------------------------------------------------------------------------------- 1 | """Test read-only attributes""" 2 | import numpy as np 3 | import pandas as pd 4 | import pytest 5 | 6 | import adtk.detector as detector 7 | 8 | testCases = [ 9 | { 10 | "model": detector.QuantileAD(), 11 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 12 | "a": {"abs_low_": -float("inf"), "abs_high_": float("inf")}, 13 | }, 14 | { 15 | "model": detector.QuantileAD(low=0.1), 16 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 17 | "a": {"abs_low_": 1, 
"abs_high_": float("inf")}, 18 | }, 19 | { 20 | "model": detector.QuantileAD(high=0.9), 21 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 22 | "a": {"abs_low_": -float("inf"), "abs_high_": 9}, 23 | }, 24 | { 25 | "model": detector.QuantileAD(low=0.1, high=0.9), 26 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 27 | "a": {"abs_low_": 1, "abs_high_": 9}, 28 | }, 29 | { 30 | "model": detector.InterQuartileRangeAD(), 31 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 32 | "a": {"abs_low_": 2.5 - 15, "abs_high_": 7.5 + 15}, 33 | }, 34 | { 35 | "model": detector.InterQuartileRangeAD(c=2), 36 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 37 | "a": {"abs_low_": 2.5 - 10, "abs_high_": 7.5 + 10}, 38 | }, 39 | { 40 | "model": detector.InterQuartileRangeAD(c=(2, 4)), 41 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 42 | "a": {"abs_low_": 2.5 - 10, "abs_high_": 7.5 + 20}, 43 | }, 44 | { 45 | "model": detector.InterQuartileRangeAD(c=(2, None)), 46 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 47 | "a": {"abs_low_": 2.5 - 10, "abs_high_": float("inf")}, 48 | }, 49 | { 50 | "model": detector.InterQuartileRangeAD(c=(None, 4)), 51 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 52 | "a": {"abs_low_": -float("inf"), "abs_high_": 7.5 + 20}, 53 | }, 54 | { 55 | "model": detector.InterQuartileRangeAD(c=None), 56 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 57 | "a": {"abs_low_": -float("inf"), "abs_high_": float("inf")}, 58 | }, 59 | { 60 | "model": detector.SeasonalAD(freq=4), 61 | "s": [0, 1, 2, 1] * 10, 62 | "a": {"freq_": 4, "seasonal_": [0, 1, 2, 1]}, 63 | }, 64 | { 65 | "model": detector.SeasonalAD(freq=8), 66 | "s": [0, 1, 2, 1] * 10, 67 | "a": {"freq_": 8, "seasonal_": [0, 1, 2, 1, 0, 1, 2, 1]}, 68 | }, 69 | { 70 | "model": detector.SeasonalAD(), 71 | "s": [0, 1, 2, 1] * 10, 72 | "a": {"freq_": 4, "seasonal_": [0, 1, 2, 1]}, 73 | }, 74 | { 75 | "model": detector.SeasonalAD(trend=True), 76 | "s": np.array([0, 1, 2, 1] * 10) + np.arange(40) / 10, 77 | "a": {"freq_": 4, "seasonal_": [-1, 0, 1, 0]}, 78 | }, 79 | { 80 | "model": detector.SeasonalAD(trend=True, freq=8), 81 | "s": np.array([0, 1, 2, 1] * 10) + np.arange(40), 82 | "a": {"freq_": 8, "seasonal_": [-1, 0, 1, 0, -1, 0, 1, 0]}, 83 | }, 84 | ] 85 | 86 | 87 | @pytest.mark.parametrize("testCase", testCases) 88 | def test_attribute(testCase): 89 | """Test fit_detect the detector.""" 90 | s = pd.Series( 91 | testCase["s"], 92 | pd.date_range(start="2017-1-1", periods=len(testCase["s"]), freq="D"), 93 | ) 94 | model = testCase["model"] 95 | for key, value in testCase["a"].items(): 96 | with pytest.raises(AttributeError): 97 | getattr(model, key) 98 | model.fit(s) 99 | for key, value in testCase["a"].items(): 100 | if isinstance(value, list): 101 | pd.testing.assert_series_equal( 102 | getattr(model, key), 103 | pd.Series(value, index=s.index[: len(value)]), 104 | check_dtype=False, 105 | check_names=False, 106 | ) 107 | else: 108 | assert getattr(model, key) == value 109 | -------------------------------------------------------------------------------- /tests/test_data_validation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests on data 3 | """ 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | 8 | from adtk.data import validate_series 9 | 10 | rand = np.random.RandomState(123) 11 | 12 | regular_time_index = pd.date_range(start=0, periods=10, freq="1d") 13 | so = pd.Series(np.arange(10), index=regular_time_index, name="value") 14 | bo = pd.Series( 15 | [1, 0, 0, 0, 1, 1, 1, 0, 0, 0], index=regular_time_index, 
name="type1" 16 | ) 17 | bom = pd.concat([bo, (1 - bo).rename("type2")], axis=1) 18 | co = pd.Series( 19 | ["B", "A", "A", "A", np.nan, np.nan, np.nan, "B", "B", "B"], 20 | index=regular_time_index, 21 | ) 22 | coi = pd.get_dummies(co) 23 | con = pd.Series( 24 | ["B", "A", "A", "A", np.nan, np.nan, np.nan, "B", "B", "B"], 25 | index=regular_time_index, 26 | name="type3", 27 | ) 28 | coni = pd.get_dummies(con, prefix="type3", prefix_sep="_") 29 | 30 | test_targets = [ 31 | (so, so), 32 | (bo, bo), 33 | (bom, bom), 34 | (co, coi), 35 | (con, coni), 36 | (pd.concat([so, bom, con], axis=1), pd.concat([so, bom, coni], axis=1)), 37 | ] 38 | 39 | 40 | @pytest.mark.parametrize("x", test_targets) 41 | def test_series_regular(x): 42 | # regular Series 43 | s = x[0].copy() 44 | sv = validate_series(s, check_categorical=True) 45 | if isinstance(sv, pd.Series): 46 | pd.testing.assert_series_equal(sv, x[1], check_dtype=False) 47 | elif isinstance(sv, pd.DataFrame): 48 | pd.testing.assert_frame_equal(sv, x[1], check_dtype=False) 49 | else: 50 | raise TypeError("Must be pandas Series or DataFrame") 51 | # check if copy instead of view 52 | sc = s.copy() 53 | sv.iloc[0] == 1000 54 | if isinstance(s, pd.Series): 55 | pd.testing.assert_series_equal(s, sc, check_dtype=False) 56 | elif isinstance(s, pd.DataFrame): 57 | pd.testing.assert_frame_equal(s, sc, check_dtype=False) 58 | else: 59 | raise TypeError("Must be pandas Series or DataFrame") 60 | 61 | 62 | @pytest.mark.parametrize("x", test_targets) 63 | def test_series_unsorted(x): 64 | # unsorted Series 65 | s = x[0].copy() 66 | s = s.iloc[[9, 6, 7, 1, 0, 3, 4, 5, 8, 2]] 67 | sv = validate_series(s, check_categorical=True) 68 | if isinstance(sv, pd.Series): 69 | pd.testing.assert_series_equal(sv, x[1], check_dtype=False) 70 | elif isinstance(sv, pd.DataFrame): 71 | pd.testing.assert_frame_equal(sv, x[1], check_dtype=False) 72 | else: 73 | raise TypeError("Must be pandas Series or DataFrame") 74 | # check if copy instead of view 75 | sc = s.copy() 76 | sv.iloc[0] == 1000 77 | if isinstance(s, pd.Series): 78 | pd.testing.assert_series_equal(s, sc, check_dtype=False) 79 | elif isinstance(s, pd.DataFrame): 80 | pd.testing.assert_frame_equal(s, sc, check_dtype=False) 81 | else: 82 | raise TypeError("Must be pandas Series or DataFrame") 83 | 84 | 85 | @pytest.mark.parametrize("x", test_targets) 86 | def test_series_duplicated_timestamp(x): 87 | # Series with duplicated time stamps 88 | s = x[0].copy() 89 | s = s.iloc[[0, 1, 1, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9]] 90 | sv = validate_series(s, check_categorical=True) 91 | if isinstance(sv, pd.Series): 92 | pd.testing.assert_series_equal(sv, x[1], check_dtype=False) 93 | elif isinstance(sv, pd.DataFrame): 94 | pd.testing.assert_frame_equal(sv, x[1], check_dtype=False) 95 | else: 96 | raise TypeError("Must be pandas Series or DataFrame") 97 | # check if copy instead of view 98 | sc = s.copy() 99 | sv.iloc[0] == 1000 100 | if isinstance(s, pd.Series): 101 | pd.testing.assert_series_equal(s, sc, check_dtype=False) 102 | elif isinstance(s, pd.DataFrame): 103 | pd.testing.assert_frame_equal(s, sc, check_dtype=False) 104 | else: 105 | raise TypeError("Must be pandas Series or DataFrame") 106 | 107 | 108 | @pytest.mark.parametrize("x", test_targets) 109 | def test_series_missed_timestamp(x): 110 | # Series with missed time stamps 111 | s = x[0].copy() 112 | s = s.iloc[[0, 1, 3, 4, 5, 6, 7, 9]] 113 | ss = x[1].copy() 114 | ss = ss.iloc[[0, 1, 3, 4, 5, 6, 7, 9]] 115 | sv = validate_series(s, check_categorical=True) 116 | if 
isinstance(sv, pd.Series): 117 | pd.testing.assert_series_equal(sv, ss, check_dtype=False) 118 | elif isinstance(sv, pd.DataFrame): 119 | pd.testing.assert_frame_equal(sv, ss, check_dtype=False) 120 | else: 121 | raise TypeError("Must be pandas Series or DataFrame") 122 | # check if copy instead of view 123 | sc = s.copy() 124 | sv.iloc[0] == 1000 125 | if isinstance(s, pd.Series): 126 | pd.testing.assert_series_equal(s, sc, check_dtype=False) 127 | elif isinstance(s, pd.DataFrame): 128 | pd.testing.assert_frame_equal(s, sc, check_dtype=False) 129 | else: 130 | raise TypeError("Must be pandas Series or DataFrame") 131 | -------------------------------------------------------------------------------- /tests/test_detectorhd.py: -------------------------------------------------------------------------------- 1 | """Test HD detectors on some simple cases.""" 2 | from math import isnan 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | from sklearn.cluster import KMeans 8 | from sklearn.ensemble import IsolationForest 9 | from sklearn.linear_model import LinearRegression 10 | from sklearn.neighbors import LocalOutlierFactor 11 | 12 | import adtk.detector as detector 13 | from adtk._base import _TrainableModel 14 | 15 | nan = float("nan") 16 | 17 | testCases = [ 18 | { 19 | "model": detector.CustomizedDetectorHD, 20 | "params": {"detect_func": lambda x: x.sum(axis=1) > 0}, 21 | "df": [ 22 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 23 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 24 | ], 25 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 26 | }, 27 | { 28 | "model": detector.CustomizedDetectorHD, 29 | "params": { 30 | "detect_func": lambda x, a: x.sum(axis=1) > a, 31 | "detect_func_params": {"a": 0}, 32 | }, 33 | "df": [ 34 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 35 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 36 | ], 37 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 38 | }, 39 | { 40 | "model": detector.CustomizedDetectorHD, 41 | "params": { 42 | "detect_func": lambda x, a: x.sum(axis=1) > a, 43 | "fit_func": lambda x: {"a": x.sum(axis=1).median()}, 44 | }, 45 | "df": [ 46 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 47 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 48 | ], 49 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 50 | }, 51 | { 52 | "model": detector.CustomizedDetectorHD, 53 | "params": { 54 | "detect_func": lambda x, a: x.sum(axis=1) > a, 55 | "fit_func": lambda x, q: {"a": x.sum(axis=1).quantile(q)}, 56 | "fit_func_params": {"q": 0.5}, 57 | }, 58 | "df": [ 59 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 60 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 61 | ], 62 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 63 | }, 64 | { 65 | "model": detector.CustomizedDetectorHD, 66 | "params": { 67 | "detect_func": lambda x, a, b: (x.sum(axis=1) > a) 68 | | (x.sum(axis=1) < b), 69 | "detect_func_params": {"b": -0.5}, 70 | "fit_func": lambda x: {"a": x.sum(axis=1).median()}, 71 | }, 72 | "df": [ 73 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 74 | [0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0], 75 | ], 76 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0], 77 | }, 78 | { 79 | "model": detector.CustomizedDetectorHD, 80 | "params": { 81 | "detect_func": lambda x, a, b: (x.sum(axis=1) > a) 82 | | (x.sum(axis=1) < b), 83 | "detect_func_params": {"b": -0.5}, 84 | "fit_func": lambda x, q: {"a": x.sum(axis=1).quantile(q)}, 85 | "fit_func_params": {"q": 0.5}, 86 | }, 87 | "df": [ 88 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 89 | [0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0], 90 | ], 91 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0], 92 | }, 93 | { 94 | "model": detector.MinClusterDetector, 95 | "params": {"model": 
KMeans(n_clusters=2)}, 96 | "df": [[0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0]], 97 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0], 98 | }, 99 | { 100 | "model": detector.MinClusterDetector, 101 | "params": {"model": KMeans(n_clusters=2)}, 102 | "df": [ 103 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], 104 | [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0], 105 | ], 106 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0], 107 | }, 108 | { 109 | "model": detector.OutlierDetector, 110 | "params": { 111 | "model": LocalOutlierFactor(n_neighbors=1, contamination=0.1) 112 | }, 113 | "df": [[0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0]], 114 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0], 115 | }, 116 | { 117 | "model": detector.OutlierDetector, 118 | "params": { 119 | "model": LocalOutlierFactor(n_neighbors=1, contamination=0.1) 120 | }, 121 | "df": [ 122 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], 123 | [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0], 124 | ], 125 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0], 126 | }, 127 | { 128 | "model": detector.OutlierDetector, 129 | "params": { 130 | "model": IsolationForest(n_estimators=100, contamination=0.1) 131 | }, 132 | "df": [[0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0]], 133 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0], 134 | }, 135 | { 136 | "model": detector.RegressionAD, 137 | "params": {"target": 2, "regressor": LinearRegression()}, 138 | "df": [ 139 | [0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9], 140 | [0, 2, 4, 6, 8, 10, 12, 14, 14, 16, 18], 141 | [0, 3, 6, 10, 12, 14, 18, 21, nan, 24, 27], 142 | ], 143 | "a": [0, 0, 0, 1, 0, 1, 0, 0, nan, 0, 0], 144 | }, 145 | { 146 | "model": detector.RegressionAD, 147 | "params": { 148 | "target": 2, 149 | "regressor": LinearRegression(), 150 | "side": "negative", 151 | }, 152 | "df": [ 153 | [0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9], 154 | [0, 2, 4, 6, 8, 10, 12, 14, 14, 16, 18], 155 | [0, 3, 6, 10, 12, 14, 18, 21, nan, 24, 27], 156 | ], 157 | "a": [0, 0, 0, 0, 0, 1, 0, 0, nan, 0, 0], 158 | }, 159 | { 160 | "model": detector.RegressionAD, 161 | "params": { 162 | "target": 2, 163 | "regressor": LinearRegression(), 164 | "side": "negative", 165 | "c": 100, 166 | }, 167 | "df": [ 168 | [0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9], 169 | [0, 2, 4, 6, 8, 10, 12, 14, 14, 16, 18], 170 | [0, 3, 6, 10, 12, 14, 18, 21, nan, 24, 27], 171 | ], 172 | "a": [0, 0, 0, 0, 0, 0, 0, 0, nan, 0, 0], 173 | }, 174 | { 175 | "model": detector.PcaAD, 176 | "params": {"k": 1, "c": 3}, 177 | "df": [ 178 | [0, 1, 2, 3, 3.9, 4.1, 5, 6, 7, 7, 8, 9], 179 | [0, 1, 2, 3, 4.1, 3.9, 5, 6, 7, nan, 8, 9], 180 | ], 181 | "a": [0, 0, 0, 0, 1, 1, 0, 0, 0, nan, 0, 0], 182 | }, 183 | ] 184 | 185 | 186 | @pytest.mark.parametrize("testCase", testCases) 187 | def test_fit_detect(testCase): 188 | """Test fit_detect the detector.""" 189 | df = pd.DataFrame( 190 | np.array(testCase["df"]).T, 191 | pd.date_range( 192 | start="2017-1-1", periods=len(testCase["df"][0]), freq="D" 193 | ), 194 | ) 195 | model = testCase["model"](**testCase["params"]) 196 | a_true = pd.Series(testCase["a"], index=df.index) 197 | if isinstance(model, _TrainableModel): 198 | a = model.fit_detect(df) 199 | else: 200 | a = model.detect(df) 201 | pd.testing.assert_series_equal(a, a_true, check_dtype=False) 202 | if a_true.sum() == 0: 203 | assert isnan(model.score(df, a_true, scoring="recall")) 204 | else: 205 | assert model.score(df, a_true, scoring="precision") == 1 206 | 207 | 208 | @pytest.mark.parametrize("testCase", testCases) 209 | def test_fit_and_detect(testCase): 210 | """Test fit the detector and then detect.""" 211 | df = pd.DataFrame( 212 | 
np.array(testCase["df"]).T, 213 | pd.date_range( 214 | start="2017-1-1", periods=len(testCase["df"][0]), freq="D" 215 | ), 216 | ) 217 | model = testCase["model"](**testCase["params"]) 218 | a_true = pd.Series(testCase["a"], index=df.index) 219 | if isinstance(model, _TrainableModel): 220 | model.fit(df) 221 | a = model.detect(df) 222 | pd.testing.assert_series_equal(a, a_true, check_dtype=False) 223 | if a_true.sum() == 0: 224 | assert isnan(model.score(df, a_true, scoring="f1")) 225 | else: 226 | assert model.score(df, a_true, scoring="iou") == 1 227 | 228 | 229 | @pytest.mark.parametrize("testCase", testCases) 230 | def test_series(testCase): 231 | """Test the detector on series.""" 232 | if len(testCase["df"]) == 1: 233 | s = pd.DataFrame( 234 | testCase["df"][0], 235 | pd.date_range( 236 | start="2017-1-1", periods=len(testCase["df"][0]), freq="D" 237 | ), 238 | ) 239 | model = testCase["model"](**testCase["params"]) 240 | a_true = pd.Series(testCase["a"], index=s.index) 241 | if isinstance(model, _TrainableModel): 242 | a = model.fit_detect(s) 243 | else: 244 | a = model.detect(s) 245 | pd.testing.assert_series_equal(a, a_true, check_dtype=False) 246 | -------------------------------------------------------------------------------- /tests/test_expand_events.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from adtk.data import expand_events 4 | 5 | event_list = [ 6 | pd.Timestamp("2017-1-1 20:04:00"), 7 | (pd.Timestamp("2017-1-1 20:00:00"), pd.Timestamp("2017-1-1 20:05:59")), 8 | (pd.Timestamp("2017-1-1 20:03:00"), pd.Timestamp("2017-1-1 20:08:59")), 9 | pd.Timestamp("2017-1-1 20:30:00"), 10 | pd.Timestamp("2017-1-1 21:00:00"), 11 | (pd.Timestamp("2017-1-1 21:05:00"), pd.Timestamp("2017-1-1 21:06:59")), 12 | pd.Timestamp("2017-1-1 21:03:00"), 13 | ] 14 | 15 | nan = float("nan") 16 | event_labels = pd.Series( 17 | [0, 0, 1, 1, nan, 0, 1, 0, nan, 0, 0, 1], 18 | index=pd.date_range(start="2017-1-1", periods=12, freq="D"), 19 | ) 20 | 21 | 22 | def test_expand_event_series_freq(): 23 | expanded_events = expand_events( 24 | event_labels, 25 | left_expand="1hour", 26 | right_expand="1hour", 27 | freq_as_period=True, 28 | ) 29 | true_expanded_events = pd.Series( 30 | [0, 1, 1, 1, 1, 1, 1, 1, nan, 0, 1, 1], 31 | index=pd.date_range(start="2017-1-1", periods=12, freq="D"), 32 | ) 33 | pd.testing.assert_series_equal( 34 | true_expanded_events, expanded_events, check_dtype=False 35 | ) 36 | 37 | 38 | def test_expand_event_series_no_freq(): 39 | expanded_events = expand_events( 40 | event_labels, 41 | left_expand="1hour", 42 | right_expand="1hour", 43 | freq_as_period=False, 44 | ) 45 | pd.testing.assert_series_equal( 46 | event_labels, expanded_events, check_dtype=False 47 | ) 48 | 49 | 50 | def test_expand_event_df_freq(): 51 | expanded_events = expand_events( 52 | pd.concat( 53 | [event_labels.rename("A"), event_labels.rename("B")], axis=1 54 | ), 55 | left_expand="1hour", 56 | right_expand="1hour", 57 | freq_as_period=True, 58 | ) 59 | true_expanded_events = pd.Series( 60 | [0, 1, 1, 1, 1, 1, 1, 1, nan, 0, 1, 1], 61 | index=pd.date_range(start="2017-1-1", periods=12, freq="D"), 62 | ) 63 | true_expanded_events = pd.concat( 64 | [true_expanded_events.rename("A"), true_expanded_events.rename("B")], 65 | axis=1, 66 | ) 67 | pd.testing.assert_frame_equal( 68 | true_expanded_events, expanded_events, check_dtype=False 69 | ) 70 | 71 | 72 | def test_expand_event_df_no_freq(): 73 | expanded_events = expand_events( 74 | pd.concat( 75 
| [event_labels.rename("A"), event_labels.rename("B")], axis=1 76 | ), 77 | left_expand="1hour", 78 | right_expand="1hour", 79 | freq_as_period=False, 80 | ) 81 | 82 | pd.testing.assert_frame_equal( 83 | pd.concat( 84 | [event_labels.rename("A"), event_labels.rename("B")], axis=1 85 | ), 86 | expanded_events, 87 | check_dtype=False, 88 | ) 89 | 90 | 91 | def test_expand_event_list(): 92 | expanded_events = expand_events( 93 | event_list, left_expand="1min", right_expand="3min" 94 | ) 95 | assert expanded_events == [ 96 | (pd.Timestamp("2017-1-1 19:59:00"), pd.Timestamp("2017-1-1 20:11:59")), 97 | (pd.Timestamp("2017-1-1 20:29:00"), pd.Timestamp("2017-1-1 20:33:00")), 98 | (pd.Timestamp("2017-1-1 20:59:00"), pd.Timestamp("2017-1-1 21:09:59")), 99 | ] 100 | 101 | 102 | def test_expand_event_dict(): 103 | expanded_events = expand_events( 104 | {"A": event_list, "B": event_list}, 105 | left_expand="1min", 106 | right_expand="3min", 107 | ) 108 | assert expanded_events == { 109 | "A": [ 110 | ( 111 | pd.Timestamp("2017-1-1 19:59:00"), 112 | pd.Timestamp("2017-1-1 20:11:59"), 113 | ), 114 | ( 115 | pd.Timestamp("2017-1-1 20:29:00"), 116 | pd.Timestamp("2017-1-1 20:33:00"), 117 | ), 118 | ( 119 | pd.Timestamp("2017-1-1 20:59:00"), 120 | pd.Timestamp("2017-1-1 21:09:59"), 121 | ), 122 | ], 123 | "B": [ 124 | ( 125 | pd.Timestamp("2017-1-1 19:59:00"), 126 | pd.Timestamp("2017-1-1 20:11:59"), 127 | ), 128 | ( 129 | pd.Timestamp("2017-1-1 20:29:00"), 130 | pd.Timestamp("2017-1-1 20:33:00"), 131 | ), 132 | ( 133 | pd.Timestamp("2017-1-1 20:59:00"), 134 | pd.Timestamp("2017-1-1 21:09:59"), 135 | ), 136 | ], 137 | } 138 | -------------------------------------------------------------------------------- /tests/test_few_shot_fit.py: -------------------------------------------------------------------------------- 1 | """Check model fitting with short series 2 | """ 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | 8 | from adtk.detector import ( 9 | AutoregressionAD, 10 | LevelShiftAD, 11 | PersistAD, 12 | VolatilityShiftAD, 13 | ) 14 | 15 | s = pd.Series( 16 | np.sin(np.arange(10)), 17 | index=pd.date_range(start="2017-1-1", periods=10, freq="D"), 18 | ) 19 | 20 | 21 | def test_persist_ad(): 22 | model = PersistAD(window=10) 23 | with pytest.raises(RuntimeError): 24 | model.fit(s) 25 | 26 | model = PersistAD(window=9) 27 | model.fit(s) 28 | 29 | 30 | def test_level_shift_ad(): 31 | model = LevelShiftAD(window=6) 32 | with pytest.raises(RuntimeError): 33 | model.fit(s) 34 | 35 | model = PersistAD(window=5) 36 | model.fit(s) 37 | 38 | 39 | def test_volatility_shift_ad(): 40 | model = VolatilityShiftAD(window=6) 41 | with pytest.raises(RuntimeError): 42 | model.fit(s) 43 | 44 | model = PersistAD(window=5) 45 | model.fit(s) 46 | 47 | 48 | def test_autoregression_ad(): 49 | model = AutoregressionAD(n_steps=3, step_size=4) 50 | with pytest.raises(RuntimeError): 51 | model.fit(s) 52 | 53 | model = AutoregressionAD(n_steps=3, step_size=3) 54 | model.fit(s) 55 | -------------------------------------------------------------------------------- /tests/test_few_shot_predict.py: -------------------------------------------------------------------------------- 1 | """Check model predicting with short series 2 | """ 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from adtk.detector import ( 8 | AutoregressionAD, 9 | LevelShiftAD, 10 | PersistAD, 11 | VolatilityShiftAD, 12 | ) 13 | 14 | s = pd.Series( 15 | np.sin(np.arange(100)), 16 | index=pd.date_range(start="2017-1-1", periods=100, 
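The few-shot fitting tests above pin down a simple contract: a rolling-window detector refuses to fit on a series that cannot fill its window. A hedged sketch of that behaviour with PersistAD, using the same ten-point sine series as the test fixture:

import numpy as np
import pandas as pd

from adtk.detector import PersistAD

s = pd.Series(
    np.sin(np.arange(10)),
    index=pd.date_range(start="2017-1-1", periods=10, freq="D"),
)

try:
    PersistAD(window=10).fit(s)  # window as long as the series: rejected
except RuntimeError as exc:
    print("fit failed:", exc)

PersistAD(window=9).fit(s)  # one point of headroom is enough to fit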
freq="D"), 17 | ) 18 | 19 | 20 | def test_persist_ad(): 21 | model = PersistAD(window=1) 22 | s_train = s.copy().iloc[:-10] 23 | model.fit(s_train) 24 | 25 | s_test = s.copy().iloc[-2:] 26 | s_test.iloc[-1] = 10 27 | pd.testing.assert_series_equal( 28 | model.predict(s_test), pd.Series([np.nan, 1.0], index=s_test.index) 29 | ) 30 | 31 | s_test = s.copy().iloc[-1:] 32 | s_test.iloc[-1] = 10 33 | pd.testing.assert_series_equal( 34 | model.predict(s_test), pd.Series([np.nan], index=s_test.index) 35 | ) 36 | 37 | model = PersistAD(window=5) 38 | s_train = s.copy().iloc[:-10] 39 | model.fit(s_train) 40 | 41 | s_test = s.copy().iloc[-5:] 42 | s_test.iloc[-1] = 10 43 | pd.testing.assert_series_equal( 44 | model.predict(s_test), pd.Series([np.nan] * 5, index=s_test.index) 45 | ) 46 | 47 | s_test = s.copy().iloc[-6:] 48 | s_test.iloc[-1] = 10 49 | pd.testing.assert_series_equal( 50 | model.predict(s_test), 51 | pd.Series([np.nan] * 5 + [1.0], index=s_test.index), 52 | ) 53 | 54 | 55 | def test_level_shift_ad(): 56 | model = LevelShiftAD(window=3) 57 | s_train = s.copy().iloc[:-10] 58 | model.fit(s_train) 59 | 60 | s_test = s.copy().iloc[-5:] 61 | s_test.iloc[-3:] = 10 62 | pd.testing.assert_series_equal( 63 | model.predict(s_test), pd.Series([np.nan] * 5, index=s_test.index) 64 | ) 65 | 66 | s_test = s.copy().iloc[-6:] 67 | s_test.iloc[-3:] = 10 68 | pd.testing.assert_series_equal( 69 | model.predict(s_test), 70 | pd.Series([np.nan] * 3 + [1.0] + [np.nan] * 2, index=s_test.index), 71 | ) 72 | 73 | 74 | def test_volatility_shift_ad(): 75 | model = VolatilityShiftAD(window=3) 76 | s_train = s.copy().iloc[:-10] 77 | model.fit(s_train) 78 | 79 | s_test = s.copy().iloc[-5:] 80 | s_test.iloc[-3:] *= 10 81 | pd.testing.assert_series_equal( 82 | model.predict(s_test), pd.Series([np.nan] * 5, index=s_test.index) 83 | ) 84 | 85 | s_test = s.copy().iloc[-6:] 86 | s_test.iloc[-3:] *= 10 87 | pd.testing.assert_series_equal( 88 | model.predict(s_test), 89 | pd.Series([np.nan] * 3 + [1.0] + [np.nan] * 2, index=s_test.index), 90 | ) 91 | 92 | 93 | def test_autoregression_ad(): 94 | model = AutoregressionAD(n_steps=3, step_size=7) 95 | s_train = s.copy().iloc[:-10] 96 | model.fit(s_train) 97 | 98 | s_test = s.copy().iloc[-21:] 99 | s_test.iloc[-1:] = 10 100 | pd.testing.assert_series_equal( 101 | model.predict(s_test), pd.Series([np.nan] * 21, index=s_test.index) 102 | ) 103 | 104 | s_test = s.copy().iloc[-22:] 105 | s_test.iloc[-1:] = 10 106 | pd.testing.assert_series_equal( 107 | model.predict(s_test), 108 | pd.Series([np.nan] * 21 + [1.0], index=s_test.index), 109 | ) 110 | -------------------------------------------------------------------------------- /tests/test_inconsistent_train_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test raising error when training and testing dataframes are inconsistent in 3 | multivariate trainable models. 
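The few-shot prediction tests above check the complementary contract at predict time: points without enough history to fill the window come back as NaN instead of raising. A hedged sketch with PersistAD(window=5), mirroring the fixtures in test_persist_ad:

import numpy as np
import pandas as pd

from adtk.detector import PersistAD

s = pd.Series(
    np.sin(np.arange(100)),
    index=pd.date_range(start="2017-1-1", periods=100, freq="D"),
)

model = PersistAD(window=5)
model.fit(s.iloc[:-10])

s_test = s.iloc[-6:].copy()
s_test.iloc[-1] = 10  # inject a spike at the last point
# The first five points cannot fill the window and stay NaN; the spike at
# the end is flagged as 1.0, as asserted in test_persist_ad above.
print(model.predict(s_test))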
4 | """ 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pytest 9 | from sklearn.cluster import KMeans 10 | from sklearn.linear_model import LinearRegression 11 | from sklearn.neighbors import LocalOutlierFactor 12 | 13 | import adtk.detector as detector 14 | import adtk.transformer as transformer 15 | 16 | models = [ 17 | detector.MinClusterDetector(KMeans(n_clusters=2)), 18 | detector.OutlierDetector( 19 | LocalOutlierFactor(n_neighbors=20, contamination=0.1) 20 | ), 21 | detector.RegressionAD(target="A", regressor=LinearRegression()), 22 | detector.PcaAD(), 23 | transformer.RegressionResidual(target="A", regressor=LinearRegression()), 24 | transformer.PcaReconstructionError(), 25 | transformer.PcaProjection(), 26 | transformer.PcaReconstruction(), 27 | ] 28 | 29 | df_train = pd.DataFrame( 30 | np.arange(40).reshape(20, 2), 31 | columns=["A", "B"], 32 | index=pd.date_range(start="2017-1-1", periods=20, freq="D"), 33 | ) 34 | 35 | df_test_ok = pd.DataFrame( 36 | np.arange(0, -60, -1).reshape(20, 3), 37 | columns=["C", "B", "A"], 38 | index=pd.date_range(start="2017-1-1", periods=20, freq="D"), 39 | ) 40 | 41 | df_test_not_ok = pd.DataFrame( 42 | np.arange(0, -60, -1).reshape(20, 3), 43 | columns=["C", "D", "A"], 44 | index=pd.date_range(start="2017-1-1", periods=20, freq="D"), 45 | ) 46 | 47 | 48 | @pytest.mark.parametrize("model", models) 49 | def test_inconsistent_train_test(model): 50 | model.fit(df_train) 51 | 52 | model.predict(df_test_ok) 53 | 54 | with pytest.raises( 55 | ValueError, 56 | match="The model was trained by a pandas DataFrame with columns", 57 | ): 58 | model.predict(df_test_not_ok) 59 | -------------------------------------------------------------------------------- /tests/test_metric.py: -------------------------------------------------------------------------------- 1 | from math import isnan 2 | 3 | import pandas as pd 4 | import pytest 5 | from pandas import Timestamp 6 | 7 | from adtk.metrics import f1_score, iou, precision, recall 8 | 9 | n = float("nan") 10 | 11 | s_true = pd.Series( 12 | [0, 0, 1, 1, 0, 1, 0, n, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, n, 0, 0, 1, 0, 0], 13 | pd.date_range(start=0, periods=24, freq="1d"), 14 | ) 15 | s_pred = pd.Series( 16 | [0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, n, 1, 1, n, 0, 1, 0, 1, 1], 17 | pd.date_range(start=0, periods=24, freq="1d"), 18 | ) 19 | s0 = pd.Series( 20 | [0, 0, 0, 0, 0, 0, 0, n, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, n, 0, 0, 0, 0, 0], 21 | pd.date_range(start=0, periods=24, freq="1d"), 22 | ) 23 | 24 | df_true = pd.concat([s_true, s_pred], axis=1).rename(columns={0: "A", 1: "B"}) 25 | df_pred = pd.concat([s_pred, s_true], axis=1).rename(columns={0: "A", 1: "B"}) 26 | df0 = pd.concat([s0, s0], axis=1).rename(columns={0: "A", 1: "B"}) 27 | 28 | 29 | l_true = [ 30 | (Timestamp("1970-01-03 00:00:00"), Timestamp("1970-01-04 00:00:00")), 31 | Timestamp("1970-01-06 00:00:00"), 32 | (Timestamp("1970-01-08 00:00:00"), Timestamp("1970-01-10 00:00:00")), 33 | Timestamp("1970-01-12 00:00:00"), 34 | (Timestamp("1970-01-14 00:00:00"), Timestamp("1970-01-18 00:00:00")), 35 | Timestamp("1970-01-22 00:00:00"), 36 | ] 37 | l_pred = [ 38 | (Timestamp("1970-01-02 00:00:00"), Timestamp("1970-01-07 00:00:00")), 39 | (Timestamp("1970-01-09 00:00:00"), Timestamp("1970-01-10 00:00:00")), 40 | Timestamp("1970-01-12 00:00:00"), 41 | Timestamp("1970-01-15 00:00:00"), 42 | (Timestamp("1970-01-17 00:00:00"), Timestamp("1970-01-19 00:00:00")), 43 | Timestamp("1970-01-21 00:00:00"), 44 | (Timestamp("1970-01-23 00:00:00"), 
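test_inconsistent_train_test.py above asserts that every multivariate trainable model remembers its training columns: extra columns at predict time are tolerated, but a missing training column raises a ValueError. A hedged sketch with PcaAD (the toy values are illustrative only):

import numpy as np
import pandas as pd

from adtk.detector import PcaAD

index = pd.date_range(start="2017-1-1", periods=20, freq="D")
df_train = pd.DataFrame(
    np.arange(40).reshape(20, 2), columns=["A", "B"], index=index
)
df_bad = pd.DataFrame(
    np.arange(60).reshape(20, 3), columns=["C", "D", "A"], index=index
)

model = PcaAD()
model.fit(df_train)
try:
    model.predict(df_bad)  # column "B" from training is missing
except ValueError as exc:
    print(exc)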
Timestamp("1970-01-24 00:00:00")), 45 | ] 46 | l0 = [] 47 | 48 | d_true = {"A": l_true, "B": l_pred} 49 | d_pred = {"A": l_pred, "B": l_true} 50 | d0 = {"A": l0, "B": l0} 51 | 52 | 53 | def test_metric_series(): 54 | assert recall(s_true, s_pred) == 9 / 12 55 | assert isnan(recall(s0, s_pred)) 56 | assert precision(s_true, s_pred) == 9 / 15 57 | assert f1_score(s_true, s_pred) == pytest.approx(2 / 3) 58 | assert isnan(f1_score(s0, s_pred)) 59 | assert isnan(f1_score(1 - s_pred, s_pred)) 60 | assert iou(s_true, s_pred) == 9 / 17 61 | assert iou(s_pred, s_true) == 9 / 17 62 | assert isnan(iou(s0, s0)) 63 | 64 | 65 | def test_metric_list(): 66 | assert recall(l_true, l_pred) == 4 / 6 67 | assert isnan(recall(l0, l_pred)) 68 | assert precision(l_true, l_pred) == 4 / 7 69 | assert recall(l_true, l_pred, thresh=1) == 3 / 6 70 | assert precision(l_true, l_pred, thresh=1) == 3 / 7 71 | assert iou(l_true, l_pred) == 3 / 13 72 | assert isnan(iou(l0, l0)) 73 | 74 | 75 | def test_metric_dataframe(): 76 | assert recall(df_true, df_pred) == {"A": 9 / 12, "B": 9 / 15} 77 | assert all([isnan(x) for x in recall(df0, df_pred).values()]) and ( 78 | recall(df0, df_pred).keys() == {"A": n, "B": n}.keys() 79 | ) 80 | assert precision(df_true, df_pred) == {"A": 9 / 15, "B": 9 / 12} 81 | assert f1_score(df_true, df_pred) == { 82 | "A": pytest.approx(2 / 3), 83 | "B": pytest.approx(2 / 3), 84 | } 85 | assert all([isnan(x) for x in f1_score(df0, df_pred).values()]) and ( 86 | f1_score(df0, df_pred).keys() == {"A": n, "B": n}.keys() 87 | ) 88 | assert iou(df_true, df_pred) == {"A": 9 / 17, "B": 9 / 17} 89 | assert all([isnan(x) for x in iou(df0, df0).values()]) and ( 90 | iou(df0, df0).keys() == {"A": n, "B": n}.keys() 91 | ) 92 | 93 | 94 | def test_metric_dict(): 95 | assert recall(d_true, d_pred) == {"A": 4 / 6, "B": 4 / 7} 96 | assert all([isnan(x) for x in recall(d0, d_pred).values()]) and ( 97 | recall(d0, d_pred).keys() == {"A": n, "B": n}.keys() 98 | ) 99 | assert precision(d_true, d_pred) == {"A": 4 / 7, "B": 4 / 6} 100 | assert f1_score(d_true, d_pred) == { 101 | "A": pytest.approx(2 * 4 / 7 * 4 / 6 / (4 / 7 + 4 / 6)), 102 | "B": pytest.approx(2 * 4 / 7 * 4 / 6 / (4 / 7 + 4 / 6)), 103 | } 104 | assert recall(d_true, d_pred, thresh=1) == {"A": 3 / 6, "B": 3 / 7} 105 | assert precision(d_true, d_pred, thresh=1) == {"A": 3 / 7, "B": 3 / 6} 106 | assert iou(d_true, d_pred) == {"A": 3 / 13, "B": 3 / 13} 107 | -------------------------------------------------------------------------------- /tests/test_print_subclasses.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import adtk.aggregator as aggt 4 | import adtk.detector as detector 5 | import adtk.transformer as transformer 6 | 7 | 8 | def test_print_subclasses(): 9 | """ 10 | get `print_all_models` method for every module 11 | """ 12 | _ = aggt.print_all_models() 13 | _ = detector.print_all_models() 14 | _ = transformer.print_all_models() 15 | -------------------------------------------------------------------------------- /tests/test_series_name.py: -------------------------------------------------------------------------------- 1 | "Check if the series name or column name is correctly kept." 
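The metric tests above compare recall, precision, f1_score and iou across the series, list, DataFrame and dict label formats. A hedged sketch of the Series form on a tiny hand-made pair of label series (values chosen only for illustration):

import pandas as pd

from adtk.metrics import f1_score, iou, precision, recall

index = pd.date_range(start="2017-1-1", periods=6, freq="D")
y_true = pd.Series([0, 1, 1, 0, 0, 1], index=index)
y_pred = pd.Series([0, 1, 0, 0, 1, 1], index=index)

print(recall(y_true, y_pred))     # 2 of the 3 true anomalies are caught
print(precision(y_true, y_pred))  # 2 of the 3 detected points are real
print(f1_score(y_true, y_pred))   # harmonic mean of the two above
print(iou(y_true, y_pred))        # overlapping time over union of time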
2 | 3 | import sys 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pytest 8 | from sklearn.cluster import KMeans 9 | from sklearn.linear_model import LinearRegression 10 | from sklearn.neighbors import LocalOutlierFactor 11 | 12 | import adtk.detector as detector 13 | import adtk.transformer as transformer 14 | from adtk._base import _TrainableModel 15 | from adtk._detector_base import ( # _NonTrainableMultivariateDetector, 16 | _NonTrainableUnivariateDetector, 17 | _TrainableMultivariateDetector, 18 | _TrainableUnivariateDetector, 19 | ) 20 | 21 | _Detector = ( 22 | _NonTrainableUnivariateDetector, 23 | # _NonTrainableMultivariateDetector, 24 | _TrainableUnivariateDetector, 25 | _TrainableMultivariateDetector, 26 | ) 27 | 28 | # We have 4 types of models 29 | # - one-to-one: input a univariate series, output a univariate series 30 | # - one-to-many: input a univariate series, output a multivariate series 31 | # - many-to-one: input a multivariate series, output a univariate series 32 | # - many-to-many: input a multivariate series, output a multivariate series 33 | 34 | one2one_models = [ 35 | detector.ThresholdAD(), 36 | detector.QuantileAD(), 37 | detector.InterQuartileRangeAD(), 38 | detector.GeneralizedESDTestAD(), 39 | detector.PersistAD(window=10), 40 | detector.LevelShiftAD(window=10), 41 | detector.VolatilityShiftAD(window=10), 42 | detector.AutoregressionAD(), 43 | detector.SeasonalAD(freq=2), 44 | transformer.RollingAggregate(window=10, agg="median"), 45 | transformer.RollingAggregate( 46 | window=10, agg="quantile", agg_params={"q": 0.5} 47 | ), 48 | transformer.DoubleRollingAggregate(window=10, agg="median"), 49 | transformer.DoubleRollingAggregate( 50 | window=10, agg="quantile", agg_params={"q": [0.1, 0.5, 0.9]} 51 | ), 52 | transformer.DoubleRollingAggregate( 53 | window=10, agg="hist", agg_params={"bins": [30, 50, 70]} 54 | ), 55 | transformer.StandardScale(), 56 | transformer.ClassicSeasonalDecomposition(freq=2), 57 | ] 58 | 59 | one2many_models = [ 60 | transformer.RollingAggregate( 61 | window=10, agg="quantile", agg_params={"q": [0.1, 0.5, 0.9]} 62 | ), 63 | transformer.RollingAggregate( 64 | window=10, agg="hist", agg_params={"bins": [20, 50, 80]} 65 | ), 66 | transformer.Retrospect(n_steps=3), 67 | ] 68 | 69 | many2one_models = [ 70 | detector.MinClusterDetector(KMeans(n_clusters=2)), 71 | detector.OutlierDetector( 72 | LocalOutlierFactor(n_neighbors=20, contamination=0.1) 73 | ), 74 | detector.RegressionAD(target="A", regressor=LinearRegression()), 75 | detector.PcaAD(), 76 | transformer.SumAll(), 77 | transformer.RegressionResidual(target="A", regressor=LinearRegression()), 78 | transformer.PcaReconstructionError(), 79 | ] 80 | 81 | 82 | @pytest.mark.parametrize("model", one2one_models) 83 | def test_one2one_s2s_w_name(model): 84 | """ 85 | if a one-to-one model is applied to a Series, it should keep the Series 86 | name unchanged 87 | """ 88 | s_name = pd.Series( 89 | np.arange(100), 90 | index=pd.date_range(start="2017-1-1", periods=100, freq="D"), 91 | name="A", 92 | ) 93 | if isinstance(model, _TrainableModel): 94 | result = model.fit_predict(s_name) 95 | else: 96 | result = model.predict(s_name) 97 | assert result.name == "A" 98 | 99 | 100 | @pytest.mark.parametrize("model", one2one_models) 101 | def test_one2one_s2s_wo_name(model): 102 | """ 103 | if a one-to-one model is applied to a Series, it should keep the Series 104 | name unchanged 105 | """ 106 | s_no_name = pd.Series( 107 | np.arange(100), 108 | index=pd.date_range(start="2017-1-1", 
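test_series_name.py groups the models by input/output arity and then checks a naming contract for each group. For the one-to-one group the rule is simply that the Series name survives; a hedged sketch of that contract with ThresholdAD and default parameters:

import numpy as np
import pandas as pd

from adtk.detector import ThresholdAD

s = pd.Series(
    np.arange(100),
    index=pd.date_range(start="2017-1-1", periods=100, freq="D"),
    name="A",
)

result = ThresholdAD().detect(s)  # default thresholds; only the name matters here
print(result.name)  # "A", unchanged by the detector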
periods=100, freq="D"), 109 | ) 110 | if isinstance(model, _TrainableModel): 111 | result = model.fit_predict(s_no_name) 112 | else: 113 | result = model.predict(s_no_name) 114 | assert result.name is None 115 | 116 | 117 | @pytest.mark.parametrize("model", one2one_models) 118 | def test_one2one_df2df(model): 119 | """ 120 | if a one-to-one model is applied to a DataFrame, it should keep the column 121 | names unchanged 122 | """ 123 | df = pd.DataFrame( 124 | np.arange(300).reshape(100, 3), 125 | index=pd.date_range(start="2017-1-1", periods=100, freq="D"), 126 | columns=["A", "B", "C"], 127 | ) 128 | if isinstance(model, _TrainableModel): 129 | result = model.fit_predict(df) 130 | else: 131 | result = model.predict(df) 132 | assert list(result.columns) == ["A", "B", "C"] 133 | 134 | 135 | @pytest.mark.parametrize("model", one2one_models) 136 | def test_one2one_df2list(model): 137 | """ 138 | if a one-to-one model (detector) is applied to a DataFrame and returns a 139 | dict, the output dict keys should match the input column names 140 | """ 141 | if isinstance(model, _Detector): 142 | df = pd.DataFrame( 143 | np.arange(300).reshape(100, 3), 144 | index=pd.date_range(start="2017-1-1", periods=100, freq="D"), 145 | columns=["A", "B", "C"], 146 | ) 147 | if isinstance(model, _TrainableModel): 148 | result = model.fit_detect(df, return_list=True) 149 | else: 150 | result = model.detect(df, return_list=True) 151 | if sys.version_info[1] >= 6: 152 | assert list(result.keys()) == ["A", "B", "C"] 153 | else: 154 | assert set(result.keys()) == {"A", "B", "C"} 155 | 156 | 157 | @pytest.mark.parametrize("model", one2many_models) 158 | def test_one2many_s2df_w_name(model): 159 | """ 160 | if a one-to-many model is applied to a Series, the output should not have 161 | prefix in column names, no matter whether the input Series has a name. 162 | """ 163 | s_name = pd.Series( 164 | np.arange(100), 165 | index=pd.date_range(start="2017-1-1", periods=100, freq="D"), 166 | name="A", 167 | ) 168 | if isinstance(model, _TrainableModel): 169 | result = model.fit_predict(s_name) 170 | else: 171 | result = model.predict(s_name) 172 | assert all([col[:2] != "A_" for col in result.columns]) 173 | 174 | 175 | @pytest.mark.parametrize("model", one2many_models) 176 | def test_one2many_s2df_wo_name(model): 177 | """ 178 | if a one-to-many model is applied to a Series, the output should not have 179 | prefix in column names, no matter whether the input Series has a name. 180 | """ 181 | s_no_name = pd.Series( 182 | np.arange(100), 183 | index=pd.date_range(start="2017-1-1", periods=100, freq="D"), 184 | ) 185 | if isinstance(model, _TrainableModel): 186 | result = model.fit_predict(s_no_name) 187 | else: 188 | result = model.predict(s_no_name) 189 | assert all([col[:2] != "A_" for col in result.columns]) 190 | 191 | 192 | @pytest.mark.parametrize("model", one2many_models) 193 | def test_one2many_df2df(model): 194 | """ 195 | if a one-to-many model is applied to a DataFrame, the output should have 196 | prefix in column names to indicate the input columns they correspond. 
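For the one-to-many group the tests require a prefix only when the input is a DataFrame: each output column is prefixed with the input column it was derived from. A hedged sketch with RollingAggregate; the sketch assumes, as the one-to-many grouping above suggests, that it can be applied without a prior fit:

import numpy as np
import pandas as pd

from adtk.transformer import RollingAggregate

df = pd.DataFrame(
    np.arange(60).reshape(20, 3),
    index=pd.date_range(start="2017-1-1", periods=20, freq="D"),
    columns=["A", "B", "C"],
)

t = RollingAggregate(
    window=5, agg="quantile", agg_params={"q": [0.25, 0.75]}
).transform(df)
# Output columns are grouped and prefixed by input column, e.g. "A_...",
# then "B_...", then "C_...", which is what test_one2many_df2df asserts.
print(list(t.columns))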
197 | """ 198 | df = pd.DataFrame( 199 | np.arange(300).reshape(100, 3), 200 | index=pd.date_range(start="2017-1-1", periods=100, freq="D"), 201 | columns=["A", "B", "C"], 202 | ) 203 | if isinstance(model, _TrainableModel): 204 | result = model.fit_predict(df) 205 | else: 206 | result = model.predict(df) 207 | n_cols = round(len(result.columns) / 3) 208 | assert all([col[:2] == "A_" for col in result.columns[:n_cols]]) 209 | assert all([col[2:4] != "A_" for col in result.columns[:n_cols]]) 210 | assert all( 211 | [col[:2] == "B_" for col in result.columns[n_cols : 2 * n_cols]] 212 | ) 213 | assert all( 214 | [col[2:4] != "B_" for col in result.columns[n_cols : 2 * n_cols]] 215 | ) 216 | assert all([col[:2] == "C_" for col in result.columns[2 * n_cols :]]) 217 | assert all([col[2:4] != "C_" for col in result.columns[2 * n_cols :]]) 218 | 219 | 220 | @pytest.mark.parametrize("model", many2one_models) 221 | def test_many2one(model): 222 | """ 223 | The output Series from a many-to-one model should NOT have name 224 | """ 225 | df = pd.DataFrame( 226 | np.arange(300).reshape(100, 3), 227 | index=pd.date_range(start="2017-1-1", periods=100, freq="D"), 228 | columns=["A", "B", "C"], 229 | ) 230 | if isinstance(model, _TrainableModel): 231 | result = model.fit_predict(df) 232 | else: 233 | result = model.predict(df) 234 | assert result.name is None 235 | 236 | 237 | def test_pca_reconstruction(): 238 | df = pd.DataFrame( 239 | np.arange(300).reshape(100, 3), 240 | index=pd.date_range(start="2017-1-1", periods=100, freq="D"), 241 | columns=["A", "B", "C"], 242 | ) 243 | result = transformer.PcaReconstruction(k=2).fit_predict(df) 244 | assert list(result.columns) == ["A", "B", "C"] 245 | -------------------------------------------------------------------------------- /tests/test_train_test_split.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests on train-test split 3 | """ 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from adtk.data import split_train_test 8 | 9 | 10 | def test_split_series(): 11 | """ 12 | test all modes on a naive list of from 0 to 99 13 | """ 14 | s = pd.Series(range(100)) 15 | 16 | splits = split_train_test(s, mode=1, n_splits=4, train_ratio=0.8) 17 | ts_train, ts_test = zip(*splits) 18 | assert all( 19 | x.equals(y) 20 | for x, y in zip( 21 | ts_train, 22 | [s.iloc[:20], s.iloc[25:45], s.iloc[50:70], s.iloc[75:95]], 23 | ) 24 | ) 25 | assert all( 26 | x.equals(y) 27 | for x, y in zip( 28 | ts_test, [s.iloc[20:25], s.iloc[45:50], s.iloc[70:75], s.iloc[95:]] 29 | ) 30 | ) 31 | 32 | splits = split_train_test(s, mode=2, n_splits=4, train_ratio=0.8) 33 | ts_train, ts_test = zip(*splits) 34 | assert all( 35 | x.equals(y) 36 | for x, y in zip( 37 | ts_train, [s.iloc[:20], s.iloc[:40], s.iloc[:60], s.iloc[:80]] 38 | ) 39 | ) 40 | assert all( 41 | x.equals(y) 42 | for x, y in zip( 43 | ts_test, [s.iloc[20:25], s.iloc[40:50], s.iloc[60:75], s.iloc[80:]] 44 | ) 45 | ) 46 | 47 | splits = split_train_test(s, mode=3, n_splits=4, train_ratio=0.8) 48 | ts_train, ts_test = zip(*splits) 49 | assert all( 50 | x.equals(y) 51 | for x, y in zip( 52 | ts_train, [s.iloc[:20], s.iloc[:40], s.iloc[:60], s.iloc[:80]] 53 | ) 54 | ) 55 | assert all( 56 | x.equals(y) 57 | for x, y in zip( 58 | ts_test, [s.iloc[20:40], s.iloc[40:60], s.iloc[60:80], s.iloc[80:]] 59 | ) 60 | ) 61 | 62 | splits = split_train_test(s, mode=4, n_splits=4, train_ratio=0.8) 63 | ts_train, ts_test = zip(*splits) 64 | assert all( 65 | x.equals(y) 66 | for x, y in zip( 
67 | ts_train, [s.iloc[:20], s.iloc[:40], s.iloc[:60], s.iloc[:80]] 68 | ) 69 | ) 70 | assert all( 71 | x.equals(y) 72 | for x, y in zip( 73 | ts_test, [s.iloc[20:], s.iloc[40:], s.iloc[60:], s.iloc[80:]] 74 | ) 75 | ) 76 | 77 | 78 | def test_split_dataframe(): 79 | """ 80 | test all modes on a naive df of from 0 to 99 81 | """ 82 | s = pd.Series(range(100)) 83 | df = pd.DataFrame({"A": s, "B": s}) 84 | 85 | splits = split_train_test(df, mode=1, n_splits=4, train_ratio=0.8) 86 | ts_train, ts_test = zip(*splits) 87 | assert all( 88 | np.array_equal(x.values, y.values) 89 | for x, y in zip( 90 | ts_train, 91 | [df.iloc[:20], df.iloc[25:45], df.iloc[50:70], df.iloc[75:95]], 92 | ) 93 | ) 94 | assert all( 95 | np.array_equal(x.values, y.values) 96 | for x, y in zip( 97 | ts_test, 98 | [df.iloc[20:25], df.iloc[45:50], df.iloc[70:75], df.iloc[95:]], 99 | ) 100 | ) 101 | 102 | splits = split_train_test(df, mode=2, n_splits=4, train_ratio=0.8) 103 | ts_train, ts_test = zip(*splits) 104 | assert all( 105 | np.array_equal(x.values, y.values) 106 | for x, y in zip( 107 | ts_train, [df.iloc[:20], df.iloc[:40], df.iloc[:60], df.iloc[:80]] 108 | ) 109 | ) 110 | assert all( 111 | np.array_equal(x.values, y.values) 112 | for x, y in zip( 113 | ts_test, 114 | [df.iloc[20:25], df.iloc[40:50], df.iloc[60:75], df.iloc[80:]], 115 | ) 116 | ) 117 | 118 | splits = split_train_test(df, mode=3, n_splits=4, train_ratio=0.8) 119 | ts_train, ts_test = zip(*splits) 120 | assert all( 121 | np.array_equal(x.values, y.values) 122 | for x, y in zip( 123 | ts_train, [df.iloc[:20], df.iloc[:40], df.iloc[:60], df.iloc[:80]] 124 | ) 125 | ) 126 | assert all( 127 | np.array_equal(x.values, y.values) 128 | for x, y in zip( 129 | ts_test, 130 | [df.iloc[20:40], df.iloc[40:60], df.iloc[60:80], df.iloc[80:]], 131 | ) 132 | ) 133 | 134 | splits = split_train_test(df, mode=4, n_splits=4, train_ratio=0.8) 135 | ts_train, ts_test = zip(*splits) 136 | assert all( 137 | np.array_equal(x.values, y.values) 138 | for x, y in zip( 139 | ts_train, [df.iloc[:20], df.iloc[:40], df.iloc[:60], df.iloc[:80]] 140 | ) 141 | ) 142 | assert all( 143 | np.array_equal(x.values, y.values) 144 | for x, y in zip( 145 | ts_test, [df.iloc[20:], df.iloc[40:], df.iloc[60:], df.iloc[80:]] 146 | ) 147 | ) 148 | -------------------------------------------------------------------------------- /tests/test_transformerhd.py: -------------------------------------------------------------------------------- 1 | """Test HD transformers.""" 2 | import numpy as np 3 | import pandas as pd 4 | import pytest 5 | from sklearn.linear_model import LinearRegression 6 | 7 | import adtk.transformer as transformer 8 | from adtk._base import _TrainableModel 9 | 10 | nan = float("nan") 11 | 12 | testCases = [ 13 | { 14 | "model": transformer.CustomizedTransformerHD, 15 | "params": {"transform_func": lambda x: x.sum(axis=1) > 0}, 16 | "df": [ 17 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 18 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 19 | ], 20 | "t": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 21 | }, 22 | { 23 | "model": transformer.CustomizedTransformerHD, 24 | "params": { 25 | "transform_func": lambda x, a: x.sum(axis=1) > a, 26 | "transform_func_params": {"a": 0}, 27 | }, 28 | "df": [ 29 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 30 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 31 | ], 32 | "t": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 33 | }, 34 | { 35 | "model": transformer.CustomizedTransformerHD, 36 | "params": { 37 | "transform_func": lambda x, a: x.sum(axis=1) > a, 38 | "fit_func": lambda x: {"a": 
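test_train_test_split.py above walks split_train_test through its four modes on a 100-point series. A hedged sketch of mode 2, where the training window grows from the start of the series and the slice that follows it is held out for testing:

import pandas as pd

from adtk.data import split_train_test

s = pd.Series(range(100))

splits = split_train_test(s, mode=2, n_splits=4, train_ratio=0.8)
for train, test in splits:
    # With n_splits=4 and train_ratio=0.8 the tests above expect
    # (20, 5), (40, 10), (60, 15) and (80, 20) points respectively.
    print(len(train), len(test))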
x.sum(axis=1).median()}, 39 | }, 40 | "df": [ 41 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 42 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 43 | ], 44 | "t": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 45 | }, 46 | { 47 | "model": transformer.CustomizedTransformerHD, 48 | "params": { 49 | "transform_func": lambda x, a: x.sum(axis=1) > a, 50 | "fit_func": lambda x, q: {"a": x.sum(axis=1).quantile(q)}, 51 | "fit_func_params": {"q": 0.5}, 52 | }, 53 | "df": [ 54 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 55 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 56 | ], 57 | "t": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 58 | }, 59 | { 60 | "model": transformer.CustomizedTransformerHD, 61 | "params": { 62 | "transform_func": lambda x, a, b: (x.sum(axis=1) > a) 63 | | (x.sum(axis=1) < b), 64 | "transform_func_params": {"b": -0.5}, 65 | "fit_func": lambda x: {"a": x.sum(axis=1).median()}, 66 | }, 67 | "df": [ 68 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 69 | [0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0], 70 | ], 71 | "t": [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0], 72 | }, 73 | { 74 | "model": transformer.CustomizedTransformerHD, 75 | "params": { 76 | "transform_func": lambda x, a, b: (x.sum(axis=1) > a) 77 | | (x.sum(axis=1) < b), 78 | "transform_func_params": {"b": -0.5}, 79 | "fit_func": lambda x, q: {"a": x.sum(axis=1).quantile(q)}, 80 | "fit_func_params": {"q": 0.5}, 81 | }, 82 | "df": [ 83 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 84 | [0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0], 85 | ], 86 | "t": [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0], 87 | }, 88 | { 89 | "model": transformer.CustomizedTransformerHD, 90 | "params": { 91 | "transform_func": lambda x: pd.DataFrame( 92 | {"min": x.min(axis=1) > 0, "max": x.max(axis=1) > 0} 93 | ) 94 | }, 95 | "df": [ 96 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 97 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 98 | ], 99 | "t": { 100 | "min": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 101 | "max": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 102 | }, 103 | }, 104 | { 105 | "model": transformer.CustomizedTransformerHD, 106 | "params": { 107 | "transform_func": lambda x, a: pd.DataFrame( 108 | {"min": x.min(axis=1) > a, "max": x.max(axis=1) > a} 109 | ), 110 | "transform_func_params": {"a": 0}, 111 | }, 112 | "df": [ 113 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 114 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 115 | ], 116 | "t": { 117 | "min": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 118 | "max": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 119 | }, 120 | }, 121 | { 122 | "model": transformer.CustomizedTransformerHD, 123 | "params": { 124 | "transform_func": lambda x, a: pd.DataFrame( 125 | {"min": x.min(axis=1) > a, "max": x.max(axis=1) > a} 126 | ), 127 | "fit_func": lambda x: {"a": x.sum(axis=1).median()}, 128 | }, 129 | "df": [ 130 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 131 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 132 | ], 133 | "t": { 134 | "min": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 135 | "max": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 136 | }, 137 | }, 138 | { 139 | "model": transformer.CustomizedTransformerHD, 140 | "params": { 141 | "transform_func": lambda x, a: pd.DataFrame( 142 | {"min": x.min(axis=1) > a, "max": x.max(axis=1) > a} 143 | ), 144 | "fit_func": lambda x, q: {"a": x.sum(axis=1).quantile(q)}, 145 | "fit_func_params": {"q": 0.5}, 146 | }, 147 | "df": [ 148 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 149 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 150 | ], 151 | "t": { 152 | "min": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 153 | "max": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 154 | }, 155 | }, 156 | { 157 | "model": transformer.CustomizedTransformerHD, 158 | "params": { 159 | "transform_func": lambda x, a, b: pd.DataFrame( 160 | { 161 
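The CustomizedTransformerHD cases above all follow one pattern: an optional fit_func learns parameters from the training frame and transform_func applies them row-wise. A hedged sketch with a learned median threshold (the toy frame is illustrative only):

import pandas as pd

from adtk.transformer import CustomizedTransformerHD

df = pd.DataFrame(
    {"x": [0, 0, 0, 1, 0], "y": [0, 0, 0, 0, 0]},
    index=pd.date_range(start="2017-1-1", periods=5, freq="D"),
)

model = CustomizedTransformerHD(
    transform_func=lambda x, a: x.sum(axis=1) > a,
    fit_func=lambda x: {"a": x.sum(axis=1).median()},
)
# fit_transform learns a = median of the row sums (0 here), then flags the
# one row whose sum exceeds it, mirroring the test cases above.
print(model.fit_transform(df))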
| "min": (x.min(axis=1) > a) | (x.min(axis=1) < b), 162 | "max": (x.max(axis=1) > a) | (x.max(axis=1) < b), 163 | } 164 | ), 165 | "transform_func_params": {"b": -0.5}, 166 | "fit_func": lambda x: {"a": x.sum(axis=1).median()}, 167 | }, 168 | "df": [ 169 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 170 | [0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0], 171 | ], 172 | "t": { 173 | "min": [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], 174 | "max": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 175 | }, 176 | }, 177 | { 178 | "model": transformer.CustomizedTransformerHD, 179 | "params": { 180 | "transform_func": lambda x, a, b: pd.DataFrame( 181 | { 182 | "min": (x.min(axis=1) > a) | (x.min(axis=1) < b), 183 | "max": (x.max(axis=1) > a) | (x.max(axis=1) < b), 184 | } 185 | ), 186 | "transform_func_params": {"b": -0.5}, 187 | "fit_func": lambda x, q: {"a": x.sum(axis=1).quantile(q)}, 188 | "fit_func_params": {"q": 0.5}, 189 | }, 190 | "df": [ 191 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 192 | [0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0], 193 | ], 194 | "t": { 195 | "min": [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], 196 | "max": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 197 | }, 198 | }, 199 | { 200 | "model": transformer.RegressionResidual, 201 | "params": {"regressor": LinearRegression(), "target": 1}, 202 | "df": [ 203 | [0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9], 204 | [9, 8, 7, 6, 5, 4, 3, 2, nan, 1, 0], 205 | [9] * 11, 206 | ], 207 | "t": [0] * 8 + [nan] + [0] * 2, 208 | }, 209 | { 210 | "model": transformer.PcaProjection, 211 | "params": {"k": 1}, 212 | "df": [[0, 1, 2, 3, 4, 4, nan, 5, 6], [0, 1, 2, 3, nan, 4, 5, 5, 6]], 213 | "t": { 214 | "pc0": [ 215 | 3 * 2 ** 0.5, 216 | 2 * 2 ** 0.5, 217 | 1 * 2 ** 0.5, 218 | 0 * 2 ** 0.5, 219 | nan, 220 | -1 * 2 ** 0.5, 221 | nan, 222 | -2 * 2 ** 0.5, 223 | -3 * 2 ** 0.5, 224 | ] 225 | }, 226 | }, 227 | { 228 | "model": transformer.PcaReconstruction, 229 | "params": {"k": 1}, 230 | "df": [ 231 | [0, 1, 2, 3, 3.9, 4.1, 5, 6, 7, 7, 8, 9], 232 | [0, 1, 2, 3, 4.1, 3.9, 5, 6, 7, nan, 8, 9], 233 | ], 234 | "t": { 235 | 0: [0, 1, 2, 3, 4, 4, 5, 6, 7, nan, 8, 9], 236 | 1: [0, 1, 2, 3, 4, 4, 5, 6, 7, nan, 8, 9], 237 | }, 238 | }, 239 | { 240 | "model": transformer.PcaReconstructionError, 241 | "params": {"k": 1}, 242 | "df": [ 243 | [0, 1, 2, 3, 3.9, 4.1, 5, 6, 7, 7, 8, 9], 244 | [0, 1, 2, 3, 4.1, 3.9, 5, 6, 7, nan, 8, 9], 245 | ], 246 | "t": [0, 0, 0, 0, 0.02, 0.02, 0, 0, 0, nan, 0, 0], 247 | }, 248 | ] 249 | 250 | 251 | @pytest.mark.parametrize("testCase", testCases) 252 | def test_fit_transform(testCase): 253 | """Test fit_transform the transformer.""" 254 | df = pd.DataFrame( 255 | np.array(testCase["df"]).T, 256 | pd.date_range( 257 | start="2017-1-1", periods=len(testCase["df"][0]), freq="D" 258 | ), 259 | ) 260 | model = testCase["model"](**testCase["params"]) 261 | if isinstance(model, _TrainableModel): 262 | t = model.fit_transform(df) 263 | else: 264 | t = model.transform(df) 265 | if not isinstance(testCase["t"], dict): 266 | t_true = pd.Series(testCase["t"], index=df.index) 267 | pd.testing.assert_series_equal(t, t_true, check_dtype=False) 268 | else: 269 | t_true = pd.DataFrame(testCase["t"], index=df.index) 270 | pd.testing.assert_frame_equal(t, t_true, check_dtype=False) 271 | 272 | 273 | @pytest.mark.parametrize("testCase", testCases) 274 | def test_fit_and_transform(testCase): 275 | """Test fit the transformer and then transform.""" 276 | df = pd.DataFrame( 277 | np.array(testCase["df"]).T, 278 | pd.date_range( 279 | start="2017-1-1", periods=len(testCase["df"][0]), freq="D" 280 | ), 281 | ) 282 | model = 
testCase["model"](**testCase["params"]) 283 | if isinstance(model, _TrainableModel): 284 | model.fit(df) 285 | t = model.transform(df) 286 | if not isinstance(testCase["t"], dict): 287 | t_true = pd.Series(testCase["t"], index=df.index) 288 | pd.testing.assert_series_equal(t, t_true, check_dtype=False) 289 | else: 290 | t_true = pd.DataFrame(testCase["t"], index=df.index) 291 | pd.testing.assert_frame_equal(t, t_true, check_dtype=False) 292 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | py35-pandas24-stats{9,11} 4 | py{36,37}-pandas{24,25,1}-stats{9,11} 5 | py38-pandas{25,1}-stats11 6 | [testenv] 7 | extras = test 8 | deps = 9 | pandas24: pandas>=0.24,<0.25 10 | pandas25: pandas>=0.25,<0.26 11 | pandas1: pandas>=1.0,<1.1 12 | stats9: statsmodels>=0.9,<0.10 13 | stats11: statsmodels>=0.11,<0.12 14 | commands = 15 | pytest 16 | mypy ./src/adtk/ --config-file ./mypy.ini --------------------------------------------------------------------------------