├── .coveragerc ├── .gitattributes ├── .github └── workflows │ └── pythonpublish.yml ├── .gitignore ├── .isort.cfg ├── .readthedocs.yml ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── .gitignore ├── Makefile ├── README.md ├── _static │ ├── arundo_logo.png │ └── arundo_logo_black.png ├── api │ ├── aggregators.rst │ ├── data.rst │ ├── detectors.rst │ ├── metrics.rst │ ├── modules.rst │ ├── pipe.rst │ ├── transformers.rst │ └── visualization.rst ├── conf.py ├── developer.rst ├── examples.rst ├── images │ ├── cyclic.png │ ├── level_shift.png │ ├── level_shift_double_rolling.png │ ├── local_spike.png │ ├── missing_data.png │ ├── non_zeros_count.png │ ├── quickstart0.png │ ├── quickstart1.png │ ├── quickstart2.png │ ├── quickstart3.png │ ├── restart.png │ ├── seasonal.png │ ├── spike.png │ ├── split_1.png │ ├── split_2.png │ ├── split_3.png │ ├── split_4.png │ └── volatility_shift_double_rolling.png ├── index.rst ├── inheritance.rst ├── install.rst ├── notebooks │ ├── data │ │ ├── autoregression.csv │ │ ├── cpu.csv │ │ ├── gaussian2d.csv │ │ ├── generator.csv │ │ ├── invalid_series.csv │ │ ├── pressure.csv │ │ ├── price_long.csv │ │ ├── price_short.csv │ │ ├── pricing.csv │ │ ├── quickstart │ │ │ ├── known_anomalies.csv │ │ │ ├── testing.csv │ │ │ └── training.csv │ │ ├── seasonal+trend.csv │ │ ├── seasonal.csv │ │ ├── seismic.csv │ │ ├── sin.csv │ │ └── temperature.csv │ ├── demo.ipynb │ └── quickstart.ipynb ├── quickstart.rst ├── releasehistory.rst ├── requirements-docs.txt └── userguide.rst ├── mypy.ini ├── pyproject.toml ├── setup.cfg ├── setup.py ├── src └── adtk │ ├── __init__.py │ ├── _aggregator_base.py │ ├── _base.py │ ├── _detector_base.py │ ├── _transformer_base.py │ ├── _utils.py │ ├── aggregator │ ├── __init__.py │ └── _aggregator.py │ ├── data │ ├── __init__.py │ └── _data.py │ ├── detector │ ├── __init__.py │ ├── _detector_1d.py │ └── _detector_hd.py │ ├── metrics │ ├── __init__.py │ └── _metrics.py │ ├── pipe │ ├── __init__.py │ └── _pipe.py │ ├── transformer │ ├── __init__.py │ ├── _transformer_1d.py │ └── _transformer_hd.py │ └── visualization │ ├── __init__.py │ └── _visualization.py ├── tests ├── test_aggregators.py ├── test_attribute.py ├── test_data_validation.py ├── test_detector1d.py ├── test_detectorhd.py ├── test_expand_events.py ├── test_few_shot_fit.py ├── test_few_shot_predict.py ├── test_inconsistent_train_test.py ├── test_label_list_convert.py ├── test_metric.py ├── test_pipe.py ├── test_print_subclasses.py ├── test_series_name.py ├── test_train_test_split.py ├── test_transformer1d.py ├── test_transformerhd.py └── test_visualization.ipynb └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | omit = src/adtk/visualization/* 3 | show_missing = True 4 | exclude_lines = 5 | pragma: no cover 6 | raise 7 | warnings.warn 8 | pass 9 | @property 10 | @overload 11 | def plot_flowchart -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | tests/*.ipynb linguist-documentation -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v1 12 | - name: 
Set up Python 13 | uses: actions/setup-python@v1 14 | with: 15 | python-version: '3.x' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: __token__ 23 | TWINE_PASSWORD: ${{ secrets.ARUNDO_PYPI_TOKEN }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 90 | # install all needed dependencies. 
91 | #Pipfile.lock 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | multi_line_output=3 3 | include_trailing_comma=True 4 | force_grid_wrap=0 5 | use_parentheses=True 6 | line_length=79 -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sphinx: 4 | configuration: docs/conf.py 5 | 6 | python: 7 | version: 3.7 8 | install: 9 | - requirements: docs/requirements-docs.txt 10 | - method: pip 11 | path: . 12 | extra_requirements: 13 | - doc 14 | system_packages: true 15 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: python 3 | python: 4 | - "3.5.2" 5 | - "3.6" 6 | - "3.7" 7 | - "3.8" 8 | 9 | install: pip install tox-travis 10 | 11 | script: 12 | - tox 13 | 14 | after_success: 15 | - pip install -e .[test,dev] 16 | - pytest --cov=adtk --cov-config=.coveragerc 17 | - coveralls 18 | - black --check ./src/adtk 19 | - black --check ./tests 20 | - isort --check-only -rc ./src/adtk 21 | - isort --check-only -rc ./tests 22 | 23 | branches: 24 | only: 25 | - master 26 | - develop 27 | 28 | notifications: 29 | email: false 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Anomaly Detection Toolkit (ADTK) 2 | 3 | [![Build Status](https://travis-ci.com/arundo/adtk.svg?branch=master)](https://travis-ci.com/arundo/adtk) 4 | [![Documentation Status](https://readthedocs.org/projects/adtk/badge/?version=stable)](https://adtk.readthedocs.io/en/stable) 5 | [![Coverage Status](https://coveralls.io/repos/github/arundo/adtk/badge.svg?branch=master&service=github)](https://coveralls.io/github/arundo/adtk?branch=master) 6 | [![PyPI](https://img.shields.io/pypi/v/adtk)](https://pypi.org/project/adtk/) 7 | [![Downloads](https://pepy.tech/badge/adtk)](https://pepy.tech/project/adtk) 8 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 9 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/arundo/adtk/master?filepath=docs%2Fnotebooks%2Fdemo.ipynb) 10 | 11 | Anomaly Detection Toolkit (ADTK) is a Python package for unsupervised / 12 | rule-based time series anomaly detection. 13 | 14 | As the nature of anomaly varies over different cases, a model may not work 15 | universally for all anomaly detection problems. 
Choosing and combining 16 | detection algorithms (detectors), feature engineering methods (transformers), 17 | and ensemble methods (aggregators) properly is the key to build an effective 18 | anomaly detection model. 19 | 20 | This package offers a set of common detectors, transformers and aggregators 21 | with unified APIs, as well as pipe classes that connect them together into 22 | models. It also provides some functions to process and visualize time series 23 | and anomaly events. 24 | 25 | See https://adtk.readthedocs.io for complete documentation. 26 | 27 | ## Installation 28 | 29 | Prerequisites: Python 3.5 or later. 30 | 31 | It is recommended to install the most recent **stable** release of ADTK from PyPI. 32 | 33 | ```shell 34 | pip install adtk 35 | ``` 36 | 37 | Alternatively, you could install from source code. This will give you the **latest**, but unstable, version of ADTK. 38 | 39 | ```shell 40 | git clone https://github.com/arundo/adtk.git 41 | cd adtk/ 42 | git checkout develop 43 | pip install ./ 44 | ``` 45 | 46 | ## Examples 47 | 48 | Please see [Quick Start](https://adtk.readthedocs.io/en/stable/quickstart.html) for a simple example. 49 | 50 | For more detailed examples of each module of ADTK, please refer to 51 | [Examples](https://adtk.readthedocs.io/en/stable/examples.html) 52 | section in the documentation or [an interactive demo notebook](https://mybinder.org/v2/gh/arundo/adtk/master?filepath=docs%2Fnotebooks%2Fdemo.ipynb). 53 | 54 | ## Contributing 55 | 56 | Pull requests are welcome. For major changes, please open an issue first to 57 | discuss what you would like to change. 58 | 59 | Please make sure to update unit tests as appropriate. 60 | 61 | Please see [Contributing](https://adtk.readthedocs.io/en/stable/developer.html) for more details. 62 | 63 | 64 | ## License 65 | 66 | ADTK is licensed under the Mozilla Public License 2.0 (MPL 2.0). See the 67 | [LICENSE](LICENSE) file for details. 68 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = ADTK 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # ADTK Documentation 2 | 3 | 1. Install necessary sphinx packages if they are not installed yet. 4 | ``` bash 5 | $ pip install -r requirements-docs.txt 6 | ``` 7 | 8 | 2. Build documentation. 9 | ```bash 10 | $ make html 11 | ``` 12 | 13 | 3. Now you may open documentation by opening `_build/html/index.html` in your browser. 
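Optionally, you can serve the built documentation over HTTP instead of opening the file directly. A minimal sketch, assuming Python 3 is available on your PATH (it uses only the standard-library `http.server` module):
```bash
$ cd _build/html
$ python -m http.server 8000
```
Then point your browser at `http://localhost:8000`.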
14 | -------------------------------------------------------------------------------- /docs/_static/arundo_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/_static/arundo_logo.png -------------------------------------------------------------------------------- /docs/_static/arundo_logo_black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/_static/arundo_logo_black.png -------------------------------------------------------------------------------- /docs/api/aggregators.rst: -------------------------------------------------------------------------------- 1 | Aggregators 2 | =========== 3 | .. automodule:: adtk.aggregator 4 | :members: 5 | :inherited-members: 6 | -------------------------------------------------------------------------------- /docs/api/data.rst: -------------------------------------------------------------------------------- 1 | Data 2 | =========== 3 | 4 | .. automodule:: adtk.data 5 | :members: 6 | :inherited-members: 7 | -------------------------------------------------------------------------------- /docs/api/detectors.rst: -------------------------------------------------------------------------------- 1 | Detectors 2 | ========= 3 | .. automodule:: adtk.detector 4 | :members: 5 | :inherited-members: 6 | -------------------------------------------------------------------------------- /docs/api/metrics.rst: -------------------------------------------------------------------------------- 1 | Metrics 2 | =========== 3 | 4 | .. automodule:: adtk.metrics 5 | :members: 6 | :inherited-members: 7 | -------------------------------------------------------------------------------- /docs/api/modules.rst: -------------------------------------------------------------------------------- 1 | Modules 2 | ====================================================================== 3 | 4 | .. toctree:: 5 | :maxdepth: 3 6 | 7 | detectors 8 | transformers 9 | aggregators 10 | pipe 11 | data 12 | metrics 13 | visualization -------------------------------------------------------------------------------- /docs/api/pipe.rst: -------------------------------------------------------------------------------- 1 | Pipeline and Pipenet 2 | ==================== 3 | 4 | .. automodule:: adtk.pipe 5 | :members: 6 | :inherited-members: 7 | -------------------------------------------------------------------------------- /docs/api/transformers.rst: -------------------------------------------------------------------------------- 1 | Transformers 2 | ============ 3 | .. automodule:: adtk.transformer 4 | :members: 5 | :inherited-members: 6 | -------------------------------------------------------------------------------- /docs/api/visualization.rst: -------------------------------------------------------------------------------- 1 | Visualization 2 | ============= 3 | 4 | .. automodule:: adtk.visualization 5 | :members: 6 | :inherited-members: 7 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # ADTK documentation build configuration file, created by 5 | # sphinx-quickstart on Wed May 2 11:26:20 2018. 
6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | 23 | import sphinx_rtd_theme 24 | 25 | sys.path.insert(0, os.path.abspath("..")) 26 | 27 | 28 | # -- General configuration ------------------------------------------------ 29 | 30 | # If your documentation needs a minimal Sphinx version, state it here. 31 | # 32 | # needs_sphinx = '1.0' 33 | 34 | # Add any Sphinx extension module names here, as strings. They can be 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 36 | # ones. 37 | extensions = [ 38 | "sphinx.ext.autodoc", 39 | "sphinx.ext.todo", 40 | "sphinx.ext.mathjax", 41 | "sphinx.ext.viewcode", 42 | "sphinx.ext.napoleon", 43 | "nbsphinx", 44 | "sphinx.ext.autodoc.typehints", 45 | ] 46 | 47 | autodoc_typehints = "description" 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ["_templates"] 51 | 52 | # The suffix(es) of source filenames. 53 | # You can specify multiple suffix as a list of string: 54 | # 55 | # source_suffix = ['.rst', '.md'] 56 | source_suffix = ".rst" 57 | 58 | # The master toctree document. 59 | master_doc = "index" 60 | 61 | # General information about the project. 62 | project = "ADTK" 63 | copyright = "2019-2020, Arundo Analytics, Inc." 64 | author = "Arundo Analytics, Inc" 65 | 66 | # The version info for the project you're documenting, acts as replacement for 67 | # |version| and |release|, also used in various other places throughout the 68 | # built documents. 69 | # 70 | # The short X.Y version. 71 | version = "0.6" 72 | # The full version, including alpha/beta/rc tags. 73 | release = "0.6.2" 74 | 75 | # The language for content autogenerated by Sphinx. Refer to documentation 76 | # for a list of supported languages. 77 | # 78 | # This is also used if you do content translation via gettext catalogs. 79 | # Usually you set "language" from the command line for these cases. 80 | language = None 81 | 82 | # List of patterns, relative to source directory, that match files and 83 | # directories to ignore when looking for source files. 84 | # This patterns also effect to html_static_path and html_extra_path 85 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints"] 86 | 87 | # The name of the Pygments (syntax highlighting) style to use. 88 | pygments_style = "sphinx" 89 | 90 | # If true, `todo` and `todoList` produce output, else they produce nothing. 91 | todo_include_todos = True 92 | 93 | 94 | # -- Options for HTML output ---------------------------------------------- 95 | 96 | # The theme to use for HTML and HTML Help pages. See the documentation for 97 | # a list of builtin themes. 98 | # 99 | # html_theme = 'sphinxdoc' 100 | html_theme = "sphinx_rtd_theme" 101 | 102 | # Theme options are theme-specific and customize the look and feel of a theme 103 | # further. For a list of options available for each theme, see the 104 | # documentation. 
105 | # 106 | html_theme_options = {"logo_only": True} 107 | 108 | html_logo = "_static/arundo_logo_black.png" 109 | 110 | # Add any paths that contain custom static files (such as style sheets) here, 111 | # relative to this directory. They are copied after the builtin static files, 112 | # so a file named "default.css" will overwrite the builtin "default.css". 113 | html_static_path = ["_static"] 114 | 115 | # Custom sidebar templates, must be a dictionary that maps document names 116 | # to template names. 117 | # 118 | # This is required for the alabaster theme 119 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 120 | # html_sidebars = { 121 | # '**': [ 122 | # 'about.html', 123 | # 'navigation.html', 124 | # 'relations.html', # needs 'show_related': True theme option to display 125 | # 'searchbox.html', 126 | # 'donate.html', 127 | # ] 128 | # } 129 | 130 | 131 | # -- Options for HTMLHelp output ------------------------------------------ 132 | 133 | # Output file base name for HTML help builder. 134 | htmlhelp_basename = "ADTKdoc" 135 | 136 | 137 | # -- Options for LaTeX output --------------------------------------------- 138 | 139 | latex_elements = { 140 | # The paper size ('letterpaper' or 'a4paper'). 141 | # 142 | # 'papersize': 'letterpaper', 143 | # The font size ('10pt', '11pt' or '12pt'). 144 | # 145 | # 'pointsize': '10pt', 146 | # Additional stuff for the LaTeX preamble. 147 | # 148 | # 'preamble': '', 149 | # Latex figure (float) alignment 150 | # 151 | # 'figure_align': 'htbp', 152 | } 153 | 154 | # Grouping the document tree into LaTeX files. List of tuples 155 | # (source start file, target name, title, 156 | # author, documentclass [howto, manual, or own class]). 157 | latex_documents = [ 158 | ( 159 | master_doc, 160 | "ADTK.tex", 161 | "ADTK Documentation", 162 | "Arundo Analytics", 163 | "manual", 164 | ) 165 | ] 166 | 167 | 168 | # -- Options for manual page output --------------------------------------- 169 | 170 | # One entry per manual page. List of tuples 171 | # (source start file, name, description, authors, manual section). 172 | man_pages = [(master_doc, "ADTK", "ADTK Documentation", [author], 1)] 173 | 174 | 175 | # -- Options for Texinfo output ------------------------------------------- 176 | 177 | # Grouping the document tree into Texinfo files. List of tuples 178 | # (source start file, target name, title, author, 179 | # dir menu entry, description, category) 180 | texinfo_documents = [ 181 | ( 182 | master_doc, 183 | "ADTK", 184 | "ADTK Documentation", 185 | author, 186 | "ADTK", 187 | "One line description of project.", 188 | "Miscellaneous", 189 | ) 190 | ] 191 | 192 | 193 | autodoc_member_order = "bysource" 194 | -------------------------------------------------------------------------------- /docs/developer.rst: -------------------------------------------------------------------------------- 1 | .. _developer: 2 | 3 | ************ 4 | Contributing 5 | ************ 6 | 7 | - `I have a question/suggestion`_ 8 | - `I found a bug`_ 9 | - `I want to develop a new detector/transformer/aggregator`_ 10 | - `The inheritance relationship between model classes is confusing`_ 11 | - `Formatter and linter`_ 12 | - `Unit test`_ 13 | - `Documentation`_ 14 | - `My pull request is ready`_ 15 | - `How are branches and releases managed?`_ 16 | 17 | ---------- 18 | 19 | I have a question/suggestion 20 | ============================ 21 | Please open a new issue. For questions, please use label **question**. 
For suggestions, please use label **enhancement**. 22 | 23 | I found a bug 24 | ============= 25 | Please check first whether the bug has already been reported in `Issues `_ or `Pull requests `_. 26 | 27 | If not, please open a new issue with label **bug**. We do not enforce an issue template for now, but we recommend that a bug issue include a description of the bug, the configuration of your Python environment, and code that reproduces the bug. 28 | 29 | If you already know how the problem could be fixed, you are more than welcome to open a pull request with label **bug** and fix it. Again, we do not enforce a PR template for now, but we recommend you follow best practices. A unit test is required to cover the found bug. Rules on merging a PR are in `My pull request is ready`_. 30 | 31 | 32 | I want to develop a new detector/transformer/aggregator 33 | ======================================================= 34 | Adding a new detector/transformer/aggregator is usually a task requiring a significant time commitment. Therefore, we want to discuss the necessity of the proposed new component with you first. Please open a new issue with label **enhancement**. Please do NOT open a PR until the implementation plan has been discussed thoroughly. 35 | 36 | 37 | The inheritance relationship between model classes is confusing 38 | =============================================================== 39 | Yes, it is somewhat confusing, but we think it is logical and minimizes duplication of reusable code. 40 | You may see :ref:`inheritance` for the full relationship. 41 | 42 | Formatter and linter 43 | ==================== 44 | `Black `_ v19.3b0 is the required formatter of ADTK. 45 | We require a maximal line length of **79** characters in ADTK, which differs from the default value in Black. 46 | A configuration file `pyproject.toml` is included with this setting. 47 | 48 | `isort `_ v4.3.21 is also required to sort imports in ADTK. 49 | A Black-compatible configuration is included in `.isort.cfg`. 50 | 51 | You may install the required versions of `Black` and `isort` along with ADTK using extra **dev**. 52 | 53 | .. code-block:: console 54 | 55 | $ pip install adtk[dev] 56 | 57 | We recommend `Pylint `_ and/or `flake8 `_ as the Python linter. 58 | 59 | Unit test 60 | ========= 61 | `pytest `_ is the required unit test framework of ADTK. 62 | Unit test coverage is checked by `Coverage.py `_ and the pytest plugin `pytest-cov `_. 63 | We use `tox `_ to automate tests in different Python environments. 64 | 65 | You may install all these dependencies along with ADTK using extra **test**. 66 | 67 | .. code-block:: console 68 | 69 | $ pip install adtk[test] 70 | 71 | Documentation 72 | ============= 73 | The documentation is generated with `Sphinx `_. 74 | You may install all necessary packages for compiling documentation along with ADTK using extra **doc**. 75 | 76 | .. code-block:: console 77 | 78 | $ pip install adtk[doc] 79 | 80 | My pull request is ready 81 | ======================== 82 | Here are some general guidelines for pull requests: 83 | 84 | - Before your pull request is ready for review, please keep a **WIP** label. 85 | - Your pull request must be reviewed by at least one reviewer AND pass all tests before it can be merged. 86 | - Remember to create unit tests for anything you add or modify.
87 | - Select the base branch to merge into (for more information about the definition of branches, please see `How are branches and releases managed?`_): 88 | 89 | - If your pull request does not change the API, please select branch **master**. 90 | - If your pull request changes the API, please select branch **develop**. 91 | 92 | - Only repository administrators can merge into branches `master` and `develop`. `Squash and merge `_ is always required. 93 | - Don't worry about updating the version number and changelog. The administrator who merges your pull request will take care of them before merging. 94 | 95 | 96 | How are branches and releases managed? 97 | ====================================== 98 | This is a guideline for managing branches and releases of ADTK. 99 | 100 | - The versioning of ADTK follows `SemVer `_. 101 | - ADTK is currently in major version zero (0.Y.Z), which indicates that the public API is unstable. 102 | - ADTK only supports one stable version. If the most recent release is 0.Y.Z, the previous versions (0.y.z | y < Y) are **NOT** supported. 103 | - Release versions 104 | 105 | - An increment of minor version Y (0.[Y+1].Z) introduces modifications that change the API, for example adding new features to existing models, adding new models, etc. 106 | - An increment of patch version Z (0.Y.[Z+1]) introduces modifications that do not change the API, for example bug fixes, minor changes to documentation, etc. 107 | - A new version is released when a set of modifications has accumulated, depending on the importance of the new functionalities and the urgency of the bug fixes. 108 | - A release is published to `PyPI `_ and `GitHub `_. 109 | - The `stable documentation `_ corresponds to the most recent release. 110 | 111 | - Pre-release versions 112 | 113 | - Every time a pull request is merged into branch **master** or **develop**, a new pre-release version is defined. 114 | - A pull request that changes the public API is versioned as (0.[Y+1].0-dev.N+pr.M), where N is a monotonically increasing index and M is the index of the pull request. 115 | 116 | .. admonition:: Example 117 | 118 | Assume the latest release is version **0.1.2**. A new parameter is added to an existing function in pull request **#37**. The new functionality will eventually be included in release version 0.2.0. Merging this pull request to the branch **develop** is versioned as **0.2.0-dev.1+pr.37**. 119 | 120 | Assume a new function is then created in pull request **#39**. The function is also expected to be released in version 0.2.0. Merging this pull request to the branch **develop** is versioned as **0.2.0-dev.2+pr.39**. 121 | 122 | - A pull request that does not change the API is versioned as (0.Y.[Z+1]-dev.N+pr.M), where N is a monotonically increasing index and M is the index of the pull request. 123 | 124 | .. admonition:: Example 125 | 126 | Assume the latest release is version **0.1.2**. A bug is fixed in pull request **#38**. The fix will eventually be included in release version 0.1.3. Merging this pull request to the branch **master** is versioned as **0.1.3-dev.1+pr.38**. 127 | 128 | Assume a typo in the documentation is then fixed in pull request **#41**. This fix is also expected to be released in version 0.1.3. Merging this pull request to the branch **master** is versioned as **0.1.3-dev.2+pr.41**. 129 | 130 | ..
attention:: 131 | If the modification should also be included in the next "major" release (0.[Y+1].0), a separate pull request to merge the modifications into branch **develop** should be opened. 132 | 133 | - The `latest documentation `_ corresponds to the most recent pre-release in branch **develop**. 134 | 135 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | .. _examples: 2 | 3 | ******** 4 | Examples 5 | ******** 6 | 7 | **Launch an interactive demo notebook in Binder ⇒** |binder| 8 | 9 | .. |binder| image:: https://mybinder.org/badge_logo.svg 10 | :target: https://mybinder.org/v2/gh/arundo/adtk/master?filepath=docs%2Fnotebooks%2Fdemo.ipynb 11 | 12 | .. toctree:: 13 | :maxdepth: 2 14 | 15 | notebooks/demo.ipynb 16 | -------------------------------------------------------------------------------- /docs/images/cyclic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/cyclic.png -------------------------------------------------------------------------------- /docs/images/level_shift.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/level_shift.png -------------------------------------------------------------------------------- /docs/images/level_shift_double_rolling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/level_shift_double_rolling.png -------------------------------------------------------------------------------- /docs/images/local_spike.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/local_spike.png -------------------------------------------------------------------------------- /docs/images/missing_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/missing_data.png -------------------------------------------------------------------------------- /docs/images/non_zeros_count.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/non_zeros_count.png -------------------------------------------------------------------------------- /docs/images/quickstart0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/quickstart0.png -------------------------------------------------------------------------------- /docs/images/quickstart1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/quickstart1.png -------------------------------------------------------------------------------- /docs/images/quickstart2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/quickstart2.png -------------------------------------------------------------------------------- /docs/images/quickstart3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/quickstart3.png -------------------------------------------------------------------------------- /docs/images/restart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/restart.png -------------------------------------------------------------------------------- /docs/images/seasonal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/seasonal.png -------------------------------------------------------------------------------- /docs/images/spike.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/spike.png -------------------------------------------------------------------------------- /docs/images/split_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/split_1.png -------------------------------------------------------------------------------- /docs/images/split_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/split_2.png -------------------------------------------------------------------------------- /docs/images/split_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/split_3.png -------------------------------------------------------------------------------- /docs/images/split_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/split_4.png -------------------------------------------------------------------------------- /docs/images/volatility_shift_double_rolling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arundo/adtk/6041f5b9a41a57263d8988bdc26a2dfc7ad675c2/docs/images/volatility_shift_double_rolling.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ================================ 2 | Anomaly Detection Toolkit (ADTK) 3 | ================================ 4 | 5 | Anomaly Detection Toolkit (ADTK) is a Python package for unsupervised / 6 | rule-based time series anomaly detection. 7 | 8 | As the nature of anomaly varies over different cases, a model may not work 9 | universally for all anomaly detection problems. 
Choosing and combining 10 | detection algorithms (detectors), feature engineering methods (transformers), 11 | and ensemble methods (aggregators) properly is the key to build an effective 12 | anomaly detection model. 13 | 14 | This package offers a set of common detectors, transformers and aggregators 15 | with unified APIs, as well as pipe classes that connect them together into a 16 | model. It also provides some functions to process and visualize time series and 17 | anomaly events. 18 | 19 | .. include:: 20 | install.rst 21 | 22 | .. include:: 23 | quickstart.rst 24 | 25 | .. toctree:: 26 | :caption: Table of Contents 27 | :maxdepth: 1 28 | 29 | install 30 | quickstart 31 | userguide 32 | examples 33 | api/modules 34 | developer 35 | releasehistory 36 | 37 | 38 | Indices and tables 39 | ================== 40 | 41 | * :ref:`genindex` 42 | * :ref:`modindex` 43 | * :ref:`search` 44 | -------------------------------------------------------------------------------- /docs/inheritance.rst: -------------------------------------------------------------------------------- 1 | .. _inheritance: 2 | 3 | Model Classes Inheritance Diagram 4 | ================================== 5 | 6 | .. code-block:: console 7 | 8 | _Model 9 | |-- _NonTrainableModel 10 | | |-- _NonTrainableUnivariateModel 11 | | | |-- _NonTrainableUnivariateDetector 12 | | | | └-- ThresholdAD 13 | | | | 14 | | | └-- _NonTrainableUnivariateTransformer 15 | | | |-- RollingAggregate 16 | | | |-- DoubleRollingAggregate 17 | | | |-- Retrospect 18 | | | └-- StandardScale 19 | | | 20 | | └-- _NonTrainableMultivariateModel 21 | | └-- _NonTrainableMultivariateTransformer 22 | | └-- SumAll 23 | | 24 | |-- _TrainableModel 25 | | |-- _TrainableUnivariateModel 26 | | | |-- _TrainableUnivariateDetector 27 | | | | |-- QuantileAD 28 | | | | |-- InterQuartileRangeAD 29 | | | | |-- GeneralizedESDTestAD 30 | | | | |-- PersistAD 31 | | | | |-- LevelShiftAD 32 | | | | |-- VolatilityShiftAD 33 | | | | |-- SeasonalAD 34 | | | | |-- AutoregressionAD 35 | | | | └-- CustomizedDetector1D 36 | | | | 37 | | | └-- _TrainableUnivariateTransformer 38 | | | |-- ClassicSeasonalDecomposition 39 | | | └-- CustomizedTransformer1D 40 | | | 41 | | └-- _TrainableMultivariateModel 42 | | |-- _TrainableMultivariateDetector 43 | | | |-- MinClusterDetector 44 | | | |-- OutlierDetector 45 | | | |-- RegressionAD 46 | | | |-- PcaAD 47 | | | └-- CustomizedDetectorHD 48 | | | 49 | | └-- _TrainableMultivariateTransformer 50 | | |-- RegressionResidual 51 | | |-- PcaProjection 52 | | |-- PcaReconstruction 53 | | |-- PcaReconstructionError 54 | | └-- CustomizedTransformerHD 55 | | 56 | └-- _Aggregator 57 | |-- AndAggregator 58 | |-- OrAggregator 59 | └-- CustomizedAggregator 60 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | ************ 2 | Installation 3 | ************ 4 | 5 | Prerequisites: Python 3.5 or later. 6 | 7 | It is recommended to install the most recent **stable** release of ADTK from PyPI. 8 | 9 | .. code-block:: console 10 | 11 | $ pip install adtk 12 | 13 | 14 | Alternatively, you could install from source code. This will give you the **latest**, but unstable, version of ADTK. 15 | 16 | .. 
code-block:: console 17 | 18 | $ git clone https://github.com/arundo/adtk.git 19 | $ cd adtk/ 20 | $ git checkout develop 21 | $ pip install ./ 22 | -------------------------------------------------------------------------------- /docs/notebooks/data/invalid_series.csv: -------------------------------------------------------------------------------- 1 | time,value,category 2 | 2017-01-02,2,"even" 3 | 2017-01-01,1,"odd" 4 | 2017-01-03,3,"odd" 5 | 2017-01-03,3.5,"odd" 6 | 2017-01-06,6,"even" 7 | 2017-01-04,4,"even" 8 | 2017-01-05,5,"odd" 9 | 2017-01-07,7,"odd" 10 | 2017-01-04,4.5,"even" 11 | 2017-01-08,8,"even" -------------------------------------------------------------------------------- /docs/notebooks/data/pressure.csv: -------------------------------------------------------------------------------- 1 | Time,Pressure (psi) 2 | 2017-05-02 17:08:37,15.239709722009712 3 | 2017-05-02 17:08:38,15.36847291056765 4 | 2017-05-02 17:08:39,15.090272828735273 5 | 2017-05-02 17:08:40,15.088763583688477 6 | 2017-05-02 17:08:41,15.200615328290896 7 | 2017-05-02 17:08:42,15.070986867017195 8 | 2017-05-02 17:08:43,15.11878376808163 9 | 2017-05-02 17:08:44,15.138324471477922 10 | 2017-05-02 17:08:45,15.13909700688064 11 | 2017-05-02 17:08:46,15.107665802212917 12 | 2017-05-02 17:08:47,15.18561837029227 13 | 2017-05-02 17:08:48,15.402186917322068 14 | 2017-05-02 17:08:49,15.228457593028624 15 | 2017-05-02 17:08:50,15.413565577090209 16 | 2017-05-02 17:08:51,15.454723147225732 17 | 2017-05-02 17:08:52,15.534855991090339 18 | 2017-05-02 17:08:53,15.330246978165752 19 | 2017-05-02 17:08:54,15.354574067292686 20 | 2017-05-02 17:08:55,15.621817652985236 21 | 2017-05-02 17:08:56,15.427433188894016 22 | 2017-05-02 17:08:57,15.071406227898967 23 | 2017-05-02 17:08:58,14.75756759472466 24 | 2017-05-02 17:08:59,14.805465050015666 25 | 2017-05-02 17:09:00,15.234802426201176 26 | 2017-05-02 17:09:01,15.437087232441085 27 | 2017-05-02 17:09:02,15.738189567539767 28 | 2017-05-02 17:09:03,15.756038224680434 29 | 2017-05-02 17:09:04,16.007702928313606 30 | 2017-05-02 17:09:05,15.95887337079757 31 | 2017-05-02 17:09:06,16.069261245886455 32 | 2017-05-02 17:09:07,16.021139303536756 33 | 2017-05-02 17:09:08,15.92225371806386 34 | 2017-05-02 17:09:09,15.946143618259113 35 | 2017-05-02 17:09:10,15.86042941968364 36 | 2017-05-02 17:09:11,16.095978306292412 37 | 2017-05-02 17:09:12,16.131084233551512 38 | 2017-05-02 17:09:13,16.20312929013024 39 | 2017-05-02 17:09:14,16.14234934574565 40 | 2017-05-02 17:09:15,16.368537142125092 41 | 2017-05-02 17:09:16,16.236769355879016 42 | 2017-05-02 17:09:17,16.355623009859208 43 | 2017-05-02 17:09:18,16.292459705562237 44 | 2017-05-02 17:09:19,16.123344662103197 45 | 2017-05-02 17:09:20,16.035255087851237 46 | 2017-05-02 17:09:21,15.89041408920272 47 | 2017-05-02 17:09:22,15.852124656492714 48 | 2017-05-02 17:09:23,15.791073261142838 49 | 2017-05-02 17:09:24,15.847269511571456 50 | 2017-05-02 17:09:25,15.949007242403294 51 | 2017-05-02 17:09:26,15.922468922219998 52 | 2017-05-02 17:09:27,15.917805491742506 53 | 2017-05-02 17:09:28,15.969847691790006 54 | 2017-05-02 17:09:29,15.872659667049081 55 | 2017-05-02 17:09:30,16.000141314684107 56 | 2017-05-02 17:09:31,16.151757043201414 57 | 2017-05-02 17:09:32,16.18840527910839 58 | 2017-05-02 17:09:33,16.619495372035075 59 | 2017-05-02 17:09:34,16.78465462093398 60 | 2017-05-02 17:09:35,16.764443539809445 61 | 2017-05-02 17:09:36,16.69924449038038 62 | 2017-05-02 17:09:37,16.6574474731717 63 | 2017-05-02 17:09:38,16.567127171281328 64 | 
2017-05-02 17:09:39,16.770328971856458 65 | 2017-05-02 17:09:40,16.644710510043257 66 | 2017-05-02 17:09:41,16.63010704562598 67 | 2017-05-02 17:09:42,16.53482415109097 68 | 2017-05-02 17:09:43,16.723156040156468 69 | 2017-05-02 17:09:44,16.467790226245164 70 | 2017-05-02 17:09:45,16.402533798552703 71 | 2017-05-02 17:09:46,16.38058856311423 72 | 2017-05-02 17:09:47,16.438078511145683 73 | 2017-05-02 17:09:48,16.52104939294012 74 | 2017-05-02 17:09:49,16.482203298493655 75 | 2017-05-02 17:09:50,16.66025427986016 76 | 2017-05-02 17:09:51,16.71691566880221 77 | 2017-05-02 17:09:52,17.161137615270192 78 | 2017-05-02 17:09:53,16.8896123908346 79 | 2017-05-02 17:09:54,17.001331766141796 80 | 2017-05-02 17:09:55,16.81318895359905 81 | 2017-05-02 17:09:56, 82 | 2017-05-02 17:09:57,17.026884123119356 83 | 2017-05-02 17:09:58, 84 | 2017-05-02 17:09:59, 85 | 2017-05-02 17:10:00,16.96673195701269 86 | 2017-05-02 17:10:01,16.95487815012832 87 | 2017-05-02 17:10:02,16.57113130728771 88 | 2017-05-02 17:10:03,16.56234092959741 89 | 2017-05-02 17:10:04, 90 | 2017-05-02 17:10:05, 91 | 2017-05-02 17:10:06, 92 | 2017-05-02 17:10:07,16.51264167119444 93 | 2017-05-02 17:10:08, 94 | 2017-05-02 17:10:09,16.545518734116744 95 | 2017-05-02 17:10:10,16.40980432766589 96 | 2017-05-02 17:10:11,16.537969863590902 97 | 2017-05-02 17:10:12, 98 | 2017-05-02 17:10:13,16.616489456883517 99 | 2017-05-02 17:10:14, 100 | 2017-05-02 17:10:15,16.78149160354748 101 | 2017-05-02 17:10:16, 102 | 2017-05-02 17:10:17,16.45079172221274 103 | 2017-05-02 17:10:18,16.107414886400854 104 | 2017-05-02 17:10:19,15.941274479727273 105 | 2017-05-02 17:10:20,16.025838257843454 106 | 2017-05-02 17:10:21,15.999849622088048 107 | 2017-05-02 17:10:22,15.927817461336595 108 | 2017-05-02 17:10:23,15.874534999485718 109 | 2017-05-02 17:10:24,16.027227933248774 110 | 2017-05-02 17:10:25,16.154470235037703 111 | 2017-05-02 17:10:26,16.012747549449983 112 | 2017-05-02 17:10:27,16.065482193047696 113 | 2017-05-02 17:10:28,15.980728348649485 114 | 2017-05-02 17:10:29,16.413550565480858 115 | 2017-05-02 17:10:30,16.280452942679567 116 | 2017-05-02 17:10:31,16.224138092517475 117 | 2017-05-02 17:10:32,16.161333019493096 118 | 2017-05-02 17:10:33,16.082300334261678 119 | 2017-05-02 17:10:34,16.107701643749454 120 | 2017-05-02 17:10:35,16.156855473719947 121 | 2017-05-02 17:10:36,15.865483382752707 122 | 2017-05-02 17:10:37,15.762312582993582 123 | 2017-05-02 17:10:38,15.524608075496365 124 | 2017-05-02 17:10:39,15.747147455527768 125 | 2017-05-02 17:10:40,16.1909506844219 126 | 2017-05-02 17:10:41,16.43995757371415 127 | 2017-05-02 17:10:42,16.50228581418069 128 | 2017-05-02 17:10:43,16.686357259074 129 | 2017-05-02 17:10:44,16.716386132610687 130 | 2017-05-02 17:10:45,17.01462517323702 131 | 2017-05-02 17:10:46,17.134857416482305 132 | 2017-05-02 17:10:47,17.093462850011885 133 | 2017-05-02 17:10:48,16.8901231481046 134 | 2017-05-02 17:10:49,16.7749182722062 135 | 2017-05-02 17:10:50,16.831413458097444 136 | 2017-05-02 17:10:51,16.610740748317834 137 | 2017-05-02 17:10:52,16.570813961056842 138 | 2017-05-02 17:10:53,16.812024999376426 139 | 2017-05-02 17:10:54,16.81730127894263 140 | 2017-05-02 17:10:55,17.17470672260042 141 | 2017-05-02 17:10:56,17.435222778269644 142 | 2017-05-02 17:10:57,17.38344426701696 143 | 2017-05-02 17:10:58,17.13874370975417 144 | 2017-05-02 17:10:59,17.130095399877185 145 | 2017-05-02 17:11:00,17.04354495851162 146 | 2017-05-02 17:11:01,17.111547365373095 147 | 2017-05-02 17:11:02,17.402170081578568 148 | 2017-05-02 
17:11:03,17.199990985262467 149 | 2017-05-02 17:11:04,17.130012904387513 150 | 2017-05-02 17:11:05,17.1898419232046 151 | 2017-05-02 17:11:06,17.39415767960812 152 | 2017-05-02 17:11:07,17.486095536595165 153 | 2017-05-02 17:11:08,17.560676626214136 154 | 2017-05-02 17:11:09,17.802719908797613 155 | 2017-05-02 17:11:10,17.89156161689412 156 | 2017-05-02 17:11:11,17.84889656508481 157 | 2017-05-02 17:11:12,17.859208931105794 158 | 2017-05-02 17:11:13,17.731923246329142 159 | 2017-05-02 17:11:14,17.830332832061277 160 | 2017-05-02 17:11:15,18.0 161 | 2017-05-02 17:11:16,17.46361853963141 162 | 2017-05-02 17:11:17,17.68290742651825 163 | 2017-05-02 17:11:18,17.672732671235057 164 | 2017-05-02 17:11:19,17.755700951898312 165 | 2017-05-02 17:11:20,17.87494028762646 166 | 2017-05-02 17:11:21,17.461780061638102 167 | 2017-05-02 17:11:22,17.2464910947515 168 | 2017-05-02 17:11:23,17.186526844040376 169 | 2017-05-02 17:11:24,17.04406215164424 170 | 2017-05-02 17:11:25,17.09341822491084 171 | 2017-05-02 17:11:26,17.000725099835716 172 | 2017-05-02 17:11:27,16.692701250287314 173 | 2017-05-02 17:11:28,16.802925871903067 174 | 2017-05-02 17:11:29,17.000969112504883 175 | 2017-05-02 17:11:30,17.102562803035664 176 | 2017-05-02 17:11:31,16.974245976212465 177 | 2017-05-02 17:11:32,16.927286538809113 178 | 2017-05-02 17:11:33,17.026013537698333 179 | 2017-05-02 17:11:34,17.134600307088746 180 | 2017-05-02 17:11:35,17.315853573694895 181 | 2017-05-02 17:11:36,17.262851552297864 182 | 2017-05-02 17:11:37,17.007067288759316 183 | 2017-05-02 17:11:38,16.884844555126286 184 | 2017-05-02 17:11:39,16.980953503984082 185 | 2017-05-02 17:11:40,17.11488239028685 186 | 2017-05-02 17:11:41,17.515431681471497 187 | 2017-05-02 17:11:42,17.536526462450738 188 | 2017-05-02 17:11:43,17.58056151272379 189 | 2017-05-02 17:11:44,17.548644508941294 190 | 2017-05-02 17:11:45,17.4756058351621 191 | 2017-05-02 17:11:46,17.616339548651887 192 | 2017-05-02 17:11:47,17.67997193023491 193 | 2017-05-02 17:11:48,17.642657863333127 194 | 2017-05-02 17:11:49,17.448292751520448 195 | 2017-05-02 17:11:50,17.42613795024508 196 | 2017-05-02 17:11:51,17.35575406556324 197 | 2017-05-02 17:11:52,17.581708102622727 198 | 2017-05-02 17:11:53,17.752207974920502 199 | 2017-05-02 17:11:54,17.568173508445064 200 | 2017-05-02 17:11:55,17.77828498393766 201 | 2017-05-02 17:11:56,17.675318783481277 202 | -------------------------------------------------------------------------------- /docs/notebooks/data/price_short.csv: -------------------------------------------------------------------------------- 1 | Time,Price ($) 2 | 2017-05-02 00:00:00,21.33 3 | 2017-05-02 01:00:00,22.05 4 | 2017-05-02 02:00:00,20.5 5 | 2017-05-02 03:00:00,20.49 6 | 2017-05-02 04:00:00,21.11 7 | 2017-05-02 05:00:00,20.39 8 | 2017-05-02 06:00:00,20.66 9 | 2017-05-02 07:00:00,20.77 10 | 2017-05-02 08:00:00,20.77 11 | 2017-05-02 09:00:00,20.6 12 | 2017-05-02 10:00:00,21.03 13 | 2017-05-02 11:00:00,22.23 14 | 2017-05-02 12:00:00,21.27 15 | 2017-05-02 13:00:00,22.3 16 | 2017-05-02 14:00:00,22.53 17 | 2017-05-02 15:00:00,22.97 18 | 2017-05-02 16:00:00,21.83 19 | 2017-05-02 17:00:00,21.97 20 | 2017-05-02 18:00:00,23.45 21 | 2017-05-02 19:00:00,22.37 22 | 2017-05-02 20:00:00,20.4 23 | 2017-05-02 21:00:00,18.65 24 | 2017-05-02 22:00:00,18.92 25 | 2017-05-02 23:00:00,21.3 26 | 2017-05-03 00:00:00,22.43 27 | 2017-05-03 01:00:00,24.1 28 | 2017-05-03 02:00:00,24.2 29 | 2017-05-03 03:00:00,25.6 30 | 2017-05-03 04:00:00,25.33 31 | 2017-05-03 05:00:00,25.94 32 | 2017-05-03 06:00:00,25.67 33 | 
2017-05-03 07:00:00,25.12 34 | 2017-05-03 08:00:00,25.26 35 | 2017-05-03 09:00:00,24.78 36 | 2017-05-03 10:00:00,26.09 37 | 2017-05-03 11:00:00,26.28 38 | 2017-05-03 12:00:00,26.68 39 | 2017-05-03 13:00:00,26.35 40 | 2017-05-03 14:00:00,27.6 41 | 2017-05-03 15:00:00,26.87 42 | 2017-05-03 16:00:00,27.53 43 | 2017-05-03 17:00:00,27.18 44 | 2017-05-03 18:00:00,26.24 45 | 2017-05-03 19:00:00,25.75 46 | 2017-05-03 20:00:00,24.95 47 | 2017-05-03 21:00:00,24.73 48 | 2017-05-03 22:00:00,24.39 49 | 2017-05-03 23:00:00,24.71 50 | 2017-05-04 00:00:00,25.27 51 | 2017-05-04 01:00:00,25.12 52 | 2017-05-04 02:00:00,25.1 53 | 2017-05-04 03:00:00,25.39 54 | 2017-05-04 04:00:00,24.85 55 | 2017-05-04 05:00:00,25.56 56 | 2017-05-04 06:00:00,26.4 57 | 2017-05-04 07:00:00,26.6 58 | 2017-05-04 08:00:00,29.0 59 | 2017-05-04 09:00:00,29.91 60 | 2017-05-04 10:00:00,29.8 61 | 2017-05-04 11:00:00,29.44 62 | 2017-05-04 12:00:00,29.21 63 | 2017-05-04 13:00:00,28.71 64 | 2017-05-04 14:00:00,29.83 65 | 2017-05-04 15:00:00,29.14 66 | 2017-05-04 16:00:00,29.06 67 | 2017-05-04 17:00:00,28.53 68 | 2017-05-04 18:00:00,29.57 69 | 2017-05-04 19:00:00,28.15 70 | 2017-05-04 20:00:00,27.79 71 | 2017-05-04 21:00:00,27.67 72 | 2017-05-04 22:00:00,27.99 73 | 2017-05-04 23:00:00,28.45 74 | 2017-05-05 00:00:00,28.23 75 | 2017-05-05 01:00:00,29.22 76 | 2017-05-05 02:00:00,29.54 77 | 2017-05-05 03:00:00,32.01 78 | 2017-05-05 04:00:00,30.5 79 | 2017-05-05 05:00:00,31.12 80 | 2017-05-05 06:00:00,30.07 81 | 2017-05-05 07:00:00,29.27 82 | 2017-05-05 08:00:00,31.26 83 | 2017-05-05 09:00:00,33.0 84 | 2017-05-05 10:00:00,31.15 85 | 2017-05-05 11:00:00,45.93 86 | 2017-05-05 12:00:00,45.86 87 | 2017-05-05 13:00:00,43.730000000000004 88 | 2017-05-05 14:00:00,43.68 89 | 2017-05-05 15:00:00,44.07 90 | 2017-05-05 16:00:00,44.29 91 | 2017-05-05 17:00:00,42.3 92 | 2017-05-05 18:00:00,43.4 93 | 2017-05-05 19:00:00,43.65 94 | 2017-05-05 20:00:00,43.59 95 | 2017-05-05 21:00:00,42.83 96 | 2017-05-05 22:00:00,43.54 97 | 2017-05-05 23:00:00,44.46 98 | 2017-05-06 00:00:00,43.980000000000004 99 | 2017-05-06 01:00:00,44.07 100 | 2017-05-06 02:00:00,44.9 101 | 2017-05-06 03:00:00,42.94 102 | 2017-05-06 04:00:00,43.06 103 | 2017-05-06 05:00:00,41.15 104 | 2017-05-06 06:00:00,40.230000000000004 105 | 2017-05-06 07:00:00,40.7 106 | 2017-05-06 08:00:00,40.55 107 | 2017-05-06 09:00:00,40.15 108 | 2017-05-06 10:00:00,39.86 109 | 2017-05-06 11:00:00,40.71 110 | 2017-05-06 12:00:00,41.41 111 | 2017-05-06 13:00:00,40.629999999999995 112 | 2017-05-06 14:00:00,40.92 113 | 2017-05-06 15:00:00,40.45 114 | 2017-05-06 16:00:00,42.85 115 | 2017-05-06 17:00:00,42.11 116 | 2017-05-06 18:00:00,41.8 117 | 2017-05-06 19:00:00,41.45 118 | 2017-05-06 20:00:00,41.010000000000005 119 | 2017-05-06 21:00:00,41.15 120 | 2017-05-06 22:00:00,41.43 121 | 2017-05-06 23:00:00,39.81 122 | 2017-05-07 00:00:00,39.230000000000004 123 | 2017-05-07 01:00:00,37.91 124 | 2017-05-07 02:00:00,39.15 125 | 2017-05-07 03:00:00,41.620000000000005 126 | 2017-05-07 04:00:00,43.0 127 | 2017-05-07 05:00:00,43.35 128 | 2017-05-07 06:00:00,44.370000000000005 129 | 2017-05-07 07:00:00,44.53 130 | 2017-05-07 08:00:00,46.19 131 | 2017-05-07 09:00:00,46.86 132 | 2017-05-07 10:00:00,46.629999999999995 133 | 2017-05-07 11:00:00,45.5 134 | 2017-05-07 12:00:00,44.86 135 | 2017-05-07 13:00:00,45.17 136 | 2017-05-07 14:00:00,43.95 137 | 2017-05-07 15:00:00,43.730000000000004 138 | 2017-05-07 16:00:00,45.07 139 | 2017-05-07 17:00:00,45.1 140 | 2017-05-07 18:00:00,47.08 141 | 2017-05-07 19:00:00,48.53 142 | 2017-05-07 
20:00:00,48.24 143 | 2017-05-07 21:00:00,46.879999999999995 144 | 2017-05-07 22:00:00,46.83 145 | 2017-05-07 23:00:00,46.35 146 | 2017-05-08 00:00:00,46.730000000000004 147 | 2017-05-08 01:00:00,48.34 148 | 2017-05-08 02:00:00,47.22 149 | 2017-05-08 03:00:00,46.83 150 | 2017-05-08 04:00:00,47.16 151 | 2017-05-08 05:00:00,48.3 152 | 2017-05-08 06:00:00,48.81 153 | 2017-05-08 07:00:00,49.22 154 | 2017-05-08 08:00:00,50.57 155 | 2017-05-08 09:00:00,51.06 156 | 2017-05-08 10:00:00,50.83 157 | 2017-05-08 11:00:00,50.88 158 | 2017-05-08 12:00:00,50.18 159 | 2017-05-08 13:00:00,50.72 160 | 2017-05-08 14:00:00,51.66 161 | 2017-05-08 15:00:00,48.69 162 | 2017-05-08 16:00:00,49.9 163 | 2017-05-08 17:00:00,49.85 164 | 2017-05-08 18:00:00,50.31 165 | 2017-05-08 19:00:00,50.97 166 | 2017-05-08 20:00:00,48.68 167 | 2017-05-08 21:00:00,47.48 168 | 2017-05-08 22:00:00,47.15 169 | 2017-05-08 23:00:00,46.35 170 | 2017-05-09 00:00:00,46.629999999999995 171 | 2017-05-09 01:00:00,46.11 172 | 2017-05-09 02:00:00,44.4 173 | 2017-05-09 03:00:00,45.019999999999996 174 | 2017-05-09 04:00:00,46.120000000000005 175 | 2017-05-09 05:00:00,46.68 176 | 2017-05-09 06:00:00,45.97 177 | 2017-05-09 07:00:00,45.71 178 | 2017-05-09 08:00:00,46.25 179 | 2017-05-09 09:00:00,46.86 180 | 2017-05-09 10:00:00,47.86 181 | 2017-05-09 11:00:00,47.57 182 | 2017-05-09 12:00:00,46.15 183 | 2017-05-09 13:00:00,45.47 184 | 2017-05-09 14:00:00,46.0 185 | 2017-05-09 15:00:00,46.75 186 | 2017-05-09 16:00:00,48.97 187 | 2017-05-09 17:00:00,49.09 188 | 2017-05-09 18:00:00,49.34 189 | 2017-05-09 19:00:00,49.16 190 | 2017-05-09 20:00:00,48.75 191 | 2017-05-09 21:00:00,49.53 192 | 2017-05-09 22:00:00,49.89 193 | 2017-05-09 23:00:00,49.68 194 | 2017-05-10 00:00:00,48.6 195 | 2017-05-10 01:00:00,48.48 196 | 2017-05-10 02:00:00,48.09 197 | 2017-05-10 03:00:00,49.34 198 | 2017-05-10 04:00:00,50.29 199 | 2017-05-10 05:00:00,49.27 200 | 2017-05-10 06:00:00,50.43 201 | 2017-05-10 07:00:00,49.86 202 | -------------------------------------------------------------------------------- /docs/notebooks/data/seasonal+trend.csv: -------------------------------------------------------------------------------- 1 | Time,Value 2 | 2017-05-07,0.0 3 | 2017-05-14,1.2 4 | 2017-05-21,2.4 5 | 2017-05-28,3.6 6 | 2017-06-04,4.8 7 | 2017-06-11,6.0 8 | 2017-06-18,7.2 9 | 2017-06-25,1.4 10 | 2017-07-02,2.6 11 | 2017-07-09,3.8 12 | 2017-07-16,5.0 13 | 2017-07-23,6.2 14 | 2017-07-30,7.4 15 | 2017-08-06,8.6 16 | 2017-08-13,2.8 17 | 2017-08-20,4.0 18 | 2017-08-27,5.2 19 | 2017-09-03,6.4 20 | 2017-09-10,7.6 21 | 2017-09-17,8.8 22 | 2017-09-24,10.0 23 | 2017-10-01,4.2 24 | 2017-10-08,5.4 25 | 2017-10-15,6.6 26 | 2017-10-22,7.8 27 | 2017-10-29,9.0 28 | 2017-11-05,10.2 29 | 2017-11-12,11.4 30 | 2017-11-19,5.6 31 | 2017-11-26,6.8 32 | 2017-12-03,13.0 33 | 2017-12-10,9.2 34 | 2017-12-17,10.4 35 | 2017-12-24,11.6 36 | 2017-12-31,12.8 37 | 2018-01-07,7.0 38 | 2018-01-14,8.2 39 | 2018-01-21,9.4 40 | 2018-01-28,10.6 41 | 2018-02-04,11.8 42 | 2018-02-11,13.0 43 | 2018-02-18,14.2 44 | 2018-02-25,8.4 45 | 2018-03-04,9.6 46 | 2018-03-11,10.8 47 | 2018-03-18,12.0 48 | 2018-03-25,13.2 49 | 2018-04-01,14.4 50 | 2018-04-08,15.6 51 | 2018-04-15,9.8 52 | 2018-04-22,11.0 53 | 2018-04-29,12.2 54 | 2018-05-06,13.4 55 | 2018-05-13,14.6 56 | 2018-05-20,15.8 57 | 2018-05-27,17.0 58 | 2018-06-03,11.2 59 | 2018-06-10,12.4 60 | 2018-06-17,13.6 61 | 2018-06-24,14.8 62 | 2018-07-01,16.0 63 | 2018-07-08,17.2 64 | 2018-07-15,18.4 65 | 2018-07-22,12.6 66 | 2018-07-29,13.8 67 | 2018-08-05,15.0 68 | 2018-08-12,16.2 69 
| 2018-08-19,17.4 70 | 2018-08-26,18.6 71 | 2018-09-02,19.8 72 | 2018-09-09,14.0 73 | 2018-09-16,15.2 74 | 2018-09-23,16.4 75 | 2018-09-30,17.6 76 | 2018-10-07,18.8 77 | 2018-10-14,20.0 78 | 2018-10-21,21.2 79 | 2018-10-28,15.4 80 | 2018-11-04,16.6 81 | 2018-11-11,17.8 82 | 2018-11-18,19.0 83 | 2018-11-25,20.2 84 | 2018-12-02,21.4 85 | 2018-12-09,22.6 86 | 2018-12-16,16.8 87 | 2018-12-23,18.0 88 | 2018-12-30,19.2 89 | 2019-01-06,20.4 90 | 2019-01-13,21.6 91 | 2019-01-20,22.8 92 | 2019-01-27,24.0 93 | 2019-02-03,18.2 94 | 2019-02-10,19.4 95 | 2019-02-17,20.6 96 | 2019-02-24,21.8 97 | 2019-03-03,23.0 98 | 2019-03-10,24.2 99 | 2019-03-17,25.4 100 | 2019-03-24,19.6 101 | 2019-03-31,20.8 102 | -------------------------------------------------------------------------------- /docs/notebooks/data/temperature.csv: -------------------------------------------------------------------------------- 1 | Time,Temperature (C) 2 | 2017-05-02 00:00:00,18.91 3 | 2017-05-02 01:00:00,19.91 4 | 2017-05-02 02:00:00,20.19 5 | 2017-05-02 03:00:00,18.69 6 | 2017-05-02 04:00:00,18.11 7 | 2017-05-02 05:00:00,19.76 8 | 2017-05-02 06:00:00,17.33 9 | 2017-05-02 07:00:00,16.91 10 | 2017-05-02 08:00:00,18.17 11 | 2017-05-02 09:00:00,17.3 12 | 2017-05-02 10:00:00,16.63 13 | 2017-05-02 11:00:00,16.53 14 | 2017-05-02 12:00:00,18.02 15 | 2017-05-02 13:00:00,17.38 16 | 2017-05-02 14:00:00,16.94 17 | 2017-05-02 15:00:00,16.51 18 | 2017-05-02 16:00:00,18.71 19 | 2017-05-02 17:00:00,20.9 20 | 2017-05-02 18:00:00,21.9 21 | 2017-05-02 19:00:00,22.29 22 | 2017-05-02 20:00:00,23.03 23 | 2017-05-02 21:00:00,24.52 24 | 2017-05-02 22:00:00,23.58 25 | 2017-05-02 23:00:00,24.76 26 | 2017-05-03 00:00:00,23.5 27 | 2017-05-03 01:00:00,22.86 28 | 2017-05-03 02:00:00,23.77 29 | 2017-05-03 03:00:00,22.34 30 | 2017-05-03 04:00:00,22.2 31 | 2017-05-03 05:00:00,21.34 32 | 2017-05-03 06:00:00,21.09 33 | 2017-05-03 07:00:00,18.29 34 | 2017-05-03 08:00:00,16.52 35 | 2017-05-03 09:00:00,15.82 36 | 2017-05-03 10:00:00,16.74 37 | 2017-05-03 11:00:00,16.57 38 | 2017-05-03 12:00:00,16.57 39 | 2017-05-03 13:00:00,17.26 40 | 2017-05-03 14:00:00,14.38 41 | 2017-05-03 15:00:00,14.66 42 | 2017-05-03 16:00:00,14.86 43 | 2017-05-03 17:00:00,14.13 44 | 2017-05-03 18:00:00,14.74 45 | 2017-05-03 19:00:00,10.31 46 | 2017-05-03 20:00:00,8.65 47 | 2017-05-03 21:00:00,9.64 48 | 2017-05-03 22:00:00,11.03 49 | 2017-05-03 23:00:00,13.45 50 | 2017-05-04 00:00:00,17.43 51 | 2017-05-04 01:00:00,20.66 52 | 2017-05-04 02:00:00,19.37 53 | 2017-05-04 03:00:00,18.33 54 | 2017-05-04 04:00:00,20.07 55 | 2017-05-04 05:00:00,19.28 56 | 2017-05-04 06:00:00,19.31 57 | 2017-05-04 07:00:00,20.38 58 | 2017-05-04 08:00:00,21.27 59 | 2017-05-04 09:00:00,23.02 60 | 2017-05-04 10:00:00,24.52 61 | 2017-05-04 11:00:00,25.59 62 | 2017-05-04 12:00:00,24.81 63 | 2017-05-04 13:00:00,25.61 64 | 2017-05-04 14:00:00,25.92 65 | 2017-05-04 15:00:00,24.6 66 | 2017-05-04 16:00:00,26.01 67 | 2017-05-04 17:00:00,26.82 68 | 2017-05-04 18:00:00,26.87 69 | 2017-05-04 19:00:00,26.63 70 | 2017-05-04 20:00:00,25.43 71 | 2017-05-04 21:00:00,25.63 72 | 2017-05-04 22:00:00,26.1 73 | 2017-05-04 23:00:00,25.27 74 | 2017-05-05 00:00:00,26.43 75 | 2017-05-05 01:00:00,25.34 76 | 2017-05-05 02:00:00,23.21 77 | 2017-05-05 03:00:00,24.25 78 | 2017-05-05 04:00:00,23.85 79 | 2017-05-05 05:00:00,23.72 80 | 2017-05-05 06:00:00,22.89 81 | 2017-05-05 07:00:00,21.28 82 | 2017-05-05 08:00:00,22.54 83 | 2017-05-05 09:00:00,21.85 84 | 2017-05-05 10:00:00,23.51 85 | 2017-05-05 11:00:00,24.31 86 | 2017-05-05 12:00:00,24.0 87 | 
2017-05-05 13:00:00,22.91 88 | 2017-05-05 14:00:00,22.18 89 | 2017-05-05 15:00:00,20.97 90 | 2017-05-05 16:00:00,23.06 91 | 2017-05-05 17:00:00,23.22 92 | 2017-05-05 18:00:00,24.37 93 | 2017-05-05 19:00:00,23.1 94 | 2017-05-05 20:00:00,23.28 95 | 2017-05-05 21:00:00,24.46 96 | 2017-05-05 22:00:00,24.13 97 | 2017-05-05 23:00:00,25.16 98 | 2017-05-06 00:00:00,24.07 99 | 2017-05-06 01:00:00,22.71 100 | 2017-05-06 02:00:00,23.09 101 | 2017-05-06 03:00:00,22.71 102 | 2017-05-06 04:00:00,23.35 103 | 2017-05-06 05:00:00,21.38 104 | 2017-05-06 06:00:00,22.09 105 | 2017-05-06 07:00:00,24.69 106 | 2017-05-06 08:00:00,24.66 107 | 2017-05-06 09:00:00,24.7 108 | 2017-05-06 10:00:00,24.87 109 | 2017-05-06 11:00:00,23.01 110 | 2017-05-06 12:00:00,23.44 111 | 2017-05-06 13:00:00,21.83 112 | 2017-05-06 14:00:00,21.41 113 | 2017-05-06 15:00:00,22.65 114 | 2017-05-06 16:00:00,21.91 115 | 2017-05-06 17:00:00,22.41 116 | 2017-05-06 18:00:00,23.43 117 | 2017-05-06 19:00:00,23.71 118 | 2017-05-06 20:00:00,22.34 119 | 2017-05-06 21:00:00,22.0 120 | 2017-05-06 22:00:00,23.96 121 | 2017-05-06 23:00:00,21.94 122 | 2017-05-07 00:00:00,21.66 123 | 2017-05-07 01:00:00,21.11 124 | 2017-05-07 02:00:00,21.23 125 | 2017-05-07 03:00:00,21.98 126 | 2017-05-07 04:00:00,23.59 127 | 2017-05-07 05:00:00,23.32 128 | 2017-05-07 06:00:00,24.13 129 | 2017-05-07 07:00:00,24.63 130 | 2017-05-07 08:00:00,25.1 131 | 2017-05-07 09:00:00,24.54 132 | 2017-05-07 10:00:00,23.54 133 | 2017-05-07 11:00:00,22.44 134 | 2017-05-07 12:00:00,21.69 135 | 2017-05-07 13:00:00,22.01 136 | 2017-05-07 14:00:00,22.77 137 | 2017-05-07 15:00:00,23.09 138 | 2017-05-07 16:00:00,22.54 139 | 2017-05-07 17:00:00,24.35 140 | 2017-05-07 18:00:00,25.87 141 | 2017-05-07 19:00:00,25.51 142 | 2017-05-07 20:00:00,24.69 143 | 2017-05-07 21:00:00,24.82 144 | 2017-05-07 22:00:00,26.09 145 | 2017-05-07 23:00:00,26.42 146 | 2017-05-08 00:00:00,26.98 147 | 2017-05-08 01:00:00,26.76 148 | 2017-05-08 02:00:00,27.22 149 | 2017-05-08 03:00:00,28.77 150 | 2017-05-08 04:00:00,28.53 151 | 2017-05-08 05:00:00,28.67 152 | 2017-05-08 06:00:00,28.92 153 | 2017-05-08 07:00:00,29.21 154 | 2017-05-08 08:00:00,27.79 155 | 2017-05-08 09:00:00,25.92 156 | 2017-05-08 10:00:00,24.9 157 | 2017-05-08 11:00:00,25.07 158 | 2017-05-08 12:00:00,25.62 159 | 2017-05-08 13:00:00,25.09 160 | 2017-05-08 14:00:00,26.47 161 | 2017-05-08 15:00:00,26.32 162 | 2017-05-08 16:00:00,26.34 163 | 2017-05-08 17:00:00,26.15 164 | 2017-05-08 18:00:00,26.28 165 | 2017-05-08 19:00:00,27.99 166 | 2017-05-08 20:00:00,27.65 167 | 2017-05-08 21:00:00,28.76 168 | 2017-05-08 22:00:00,29.28 169 | 2017-05-08 23:00:00,31.18 170 | 2017-05-09 00:00:00,31.26 171 | 2017-05-09 01:00:00,32.99 172 | 2017-05-09 02:00:00,35.94 173 | 2017-05-09 03:00:00,35.86 174 | 2017-05-09 04:00:00,32.12 175 | 2017-05-09 05:00:00,29.2 176 | 2017-05-09 06:00:00,25.6 177 | 2017-05-09 07:00:00,27.07 178 | 2017-05-09 08:00:00,27.38 179 | 2017-05-09 09:00:00,26.77 180 | 2017-05-09 10:00:00,26.38 181 | 2017-05-09 11:00:00,26.52 182 | 2017-05-09 12:00:00,26.61 183 | 2017-05-09 13:00:00,28.07 184 | 2017-05-09 14:00:00,29.46 185 | 2017-05-09 15:00:00,29.11 186 | 2017-05-09 16:00:00,28.56 187 | 2017-05-09 17:00:00,26.0 188 | 2017-05-09 18:00:00,25.45 189 | 2017-05-09 19:00:00,24.47 190 | 2017-05-09 20:00:00,24.12 191 | 2017-05-09 21:00:00,24.51 192 | 2017-05-09 22:00:00,24.69 193 | 2017-05-09 23:00:00,24.66 194 | 2017-05-10 00:00:00,24.86 195 | 2017-05-10 01:00:00,24.73 196 | 2017-05-10 02:00:00,24.93 197 | 2017-05-10 03:00:00,21.7 198 | 2017-05-10 
04:00:00,21.43 199 | 2017-05-10 05:00:00,21.32 200 | 2017-05-10 06:00:00,20.98 201 | 2017-05-10 07:00:00,20.76 202 | -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | *********** 2 | Quick Start 3 | *********** 4 | 5 | In this example, we build a model to detect violation of seasonal (weekly and 6 | daily) traffic pattern. The data used here is the NYC taxi traffic dataset from 7 | `Numenta Anomaly Benchmark `_. 8 | 9 | 1. Load and validate time series for training. 10 | 11 | .. code-block:: python 12 | 13 | >>> import pandas as pd 14 | >>> s_train = pd.read_csv("./training.csv", index_col="Datetime", parse_dates=True, squeeze=True) 15 | >>> from adtk.data import validate_series 16 | >>> s_train = validate_series(s_train) 17 | >>> print(s_train) 18 | Time 19 | 2014-07-01 00:00:00 10844 20 | 2014-07-01 00:30:00 8127 21 | 2014-07-01 01:00:00 6210 22 | 2014-07-01 01:30:00 4656 23 | 2014-07-01 02:00:00 3820 24 | ... 25 | 2015-01-04 09:30:00 9284 26 | 2015-01-04 10:00:00 10955 27 | 2015-01-04 10:30:00 13348 28 | 2015-01-04 11:00:00 13517 29 | 2015-01-04 11:30:00 14443 30 | Freq: 30T, Name: Traffic, Length: 9000, dtype: int64 31 | 32 | 2. Visualize training time series. 33 | 34 | .. code-block:: python 35 | 36 | >>> from adtk.visualization import plot 37 | >>> plot(s_train) 38 | 39 | .. figure:: images/quickstart0.png 40 | :width: 800px 41 | :align: center 42 | :height: 150 43 | :alt: quickstart0 44 | 45 | 3. Detect violation of seasonal pattern. 46 | 47 | .. code-block:: python 48 | 49 | >>> from adtk.detector import SeasonalAD 50 | >>> seasonal_ad = SeasonalAD() 51 | >>> anomalies = seasonal_ad.fit_detect(s_train) 52 | >>> plot(s_train, anomaly=anomalies, anomaly_color="red", anomaly_tag="marker") 53 | 54 | .. figure:: images/quickstart1.png 55 | :width: 800px 56 | :align: center 57 | :height: 150 58 | :alt: quickstart1 59 | 60 | 4. If known anomalies are available, cross check with detection results. 61 | 62 | .. code-block:: python 63 | 64 | >>> known_anomalies = pd.read_csv("./known_anomalies.csv", index_col="Datetime", parse_dates=True, squeeze=True) 65 | >>> from adtk.data import to_events 66 | >>> known_anomalies = to_events(known_anomalies) 67 | >>> print(known_anomalies) 68 | [(Timestamp('2014-07-03 07:00:00', freq='30T'), 69 | Timestamp('2014-07-06 14:59:59.999999999', freq='30T')), 70 | (Timestamp('2014-08-31 18:30:00', freq='30T'), 71 | Timestamp('2014-09-01 21:59:59.999999999', freq='30T')), 72 | (Timestamp('2014-10-31 14:30:00', freq='30T'), 73 | Timestamp('2014-11-02 13:59:59.999999999', freq='30T')), 74 | (Timestamp('2014-11-26 19:00:00', freq='30T'), 75 | Timestamp('2014-11-29 14:29:59.999999999', freq='30T')), 76 | (Timestamp('2014-12-23 19:00:00', freq='30T'), 77 | Timestamp('2014-12-28 13:59:59.999999999', freq='30T')), 78 | (Timestamp('2014-12-28 19:30:00', freq='30T'), 79 | Timestamp('2015-01-02 21:29:59.999999999', freq='30T'))] 80 | >>> plot(s_train, 81 | anomaly={"Known": known_anomalies, "Model": anomalies}, 82 | anomaly_tag={"Known": "span", "Model": "marker"}, 83 | anomaly_color={"Known": "orange", "Model": "red"}) 84 | 85 | .. figure:: images/quickstart2.png 86 | :width: 800px 87 | :align: center 88 | :height: 150 89 | :alt: quickstart2 90 | 91 | 92 | 5. Apply the trained model to new data. 93 | 94 | .. 
code-block:: python 95 | 96 | >>> s_test = pd.read_csv("./testing.csv", index_col="Datetime", parse_dates=True, squeeze=True) 97 | >>> s_test = validate_series(s_test) 98 | >>> print(s_test) 99 | Datetime 100 | 2015-01-04 12:00:00 15285 101 | 2015-01-04 12:30:00 16028 102 | 2015-01-04 13:00:00 16329 103 | 2015-01-04 13:30:00 15891 104 | 2015-01-04 14:00:00 15960 105 | ... 106 | 2015-01-31 21:30:00 24670 107 | 2015-01-31 22:00:00 25721 108 | 2015-01-31 22:30:00 27309 109 | 2015-01-31 23:00:00 26591 110 | 2015-01-31 23:30:00 26288 111 | Freq: 30T, Name: Traffic, Length: 1320, dtype: int64 112 | >>> anomalies_pred = seasonal_ad.detect(s_test) 113 | >>> plot(s_test, anomaly=anomalies_pred, 114 | ts_linewidth=1, anomaly_color='red', anomaly_tag="marker") 115 | 116 | .. figure:: images/quickstart3.png 117 | :width: 800px 118 | :align: center 119 | :height: 150 120 | :alt: quickstart3 121 | 122 | For more examples, please check :ref:`examples`. But before that, we recommend 123 | you to read :ref:`userguide` first. 124 | -------------------------------------------------------------------------------- /docs/releasehistory.rst: -------------------------------------------------------------------------------- 1 | *************** 2 | Release History 3 | *************** 4 | 5 | Version 0.6.2 (Apr 16, 2020) 6 | =================================== 7 | - Hot fix of wrong documentation url 8 | 9 | Version 0.6.1 (Apr 16, 2020) 10 | =================================== 11 | - Migrated the documentation to a new host 12 | - Fixed minor typos in the documentation 13 | - Fixed a minor type hinting bug 14 | 15 | Version 0.6.0 (Mar 10, 2020) 16 | =================================== 17 | - Re-designed the API of :py:mod:`adtk.visualization.plot` 18 | - Removed :py:mod:`adtk.data.resample` because its functionality is highly overlapped with pandas resampler module 19 | - Made :py:mod:`adtk.data.expand_events` accept events in the form of pandas Series/DataFrame 20 | - Made :py:mod:`adtk.data.expand_events` accept time delta in the form of `str` or `int` 21 | - Changed the output type of :py:mod:`adtk.data.split_train_test` from a 2-tuple of lists to a list of 2-tuples 22 | - Turned the following model parameters required from optional 23 | 24 | - `window` in :py:mod:`adtk.detector.LevelShiftAD` 25 | - `window` in :py:mod:`adtk.detector.VolatilityShiftAD` 26 | - `window` in :py:mod:`adtk.transformer.RollingAggregate` 27 | - `window` in :py:mod:`adtk.transformer.DoubleRollingAggregate` 28 | - `model` in :py:mod:`adtk.detector.MinClusterDetector` 29 | - `model` in :py:mod:`adtk.detector.OutlierDetector` 30 | - `target` and `regressor` in :py:mod:`adtk.detector.RegressionAD` 31 | - `target` and `regressor` in :py:mod:`adtk.transformer.RegressionResidual` 32 | - `aggregate_func` in :py:mod:`adtk.aggregator.CustomizedAggregator` 33 | - `detect_func` in :py:mod:`adtk.detector.CustomizedDetector1D` 34 | - `detect_func` in :py:mod:`adtk.detector.CustomizedDetectorHD` 35 | - `transform_func` in :py:mod:`adtk.transformer.CustomizedTransformer1D` 36 | - `transform_func` in :py:mod:`adtk.detector.CustomizedTransformer1D` 37 | - `steps` in :py:mod:`adtk.pipe.Pipeline` 38 | 39 | - Added consistency check between training and testing inputs in multivariate models 40 | - Improved time index check in time-dependent models 41 | - Turned all second-order sub-modules private, and a user now can only import from the following first-order modules 42 | 43 | - :py:mod:`adtk.detector` 44 | - :py:mod:`adtk.transformer` 45 | - 
:py:mod:`adtk.aggregator` 46 | - :py:mod:`adtk.pipe` 47 | - :py:mod:`adtk.data` 48 | - :py:mod:`adtk.metrics` 49 | - :py:mod:`adtk.visualization` 50 | 51 | - Refactored the inheritance structure of model components (see :ref:`inheritance`) 52 | - Added Python 3.8 support 53 | - Fixed compatibility issues with statsmodels v0.11 54 | - Fixed compatibility issues with pandas v1.0 55 | - Created an interactive demo notebook in Binder 56 | - Added type hints, and added type checking to the CI/CD tests 57 | - Added `Black` and `isort` to developer requirements and CI/CD checks 58 | - Optimized the release process by publishing the package to PyPI through GitHub Actions 59 | - Improved docstrings and API documentation 60 | - Fixed many minor bugs and typos 61 | 62 | Version 0.5.5 (Feb 24, 2020) 63 | =================================== 64 | - Fixed a bug where empty lists were ignored by AndAggregator 65 | - Fixed some typos in the documentation 66 | 67 | Version 0.5.4 (Feb 18, 2020) 68 | =================================== 69 | - Optimized the workflow of how a univariate model is applied to a pandas DataFrame 70 | 71 | - Added more informative error messages 72 | - Fixed some bugs resulting in model-column matching errors due to inconsistency between output Series names and DataFrame columns 73 | - Clarified the workflow in the documentation 74 | 75 | Version 0.5.3 (Feb 12, 2020) 76 | =================================== 77 | - Quick hotfix to avoid errors caused by statsmodels v0.11 by requiring the statsmodels dependency to be <0.11 78 | 79 | Version 0.5.2 (Jan 14, 2020) 80 | =================================== 81 | - Formalized the management of releases and pre-releases, including rules for branches and versioning 82 | - Added more rules for developers to the documentation 83 | 84 | Version 0.5.1 (Jan 2, 2020) 85 | =================================== 86 | - Added many new unit tests, and modified some old unit tests 87 | - Removed seaborn from dependencies (use matplotlib built-in style now) 88 | - Fixed a bug in the metric module when dict objects are given as input 89 | - Fixed a bug in the detector OutlierDetector where the output series has dtype object if NaN is present 90 | - Fixed a bug in the transformer pipeline where the detect and transform methods were confused 91 | - Fixed a bug in pipenet where an aggregator node may crash if its input is from a node whose subset contains a single item 92 | - Fixed a bug in the pipenet summary where the subset column was always "all" even when it should not be 93 | - Some minor code optimization 94 | 95 | Version 0.5.0 (Dec 18, 2019) 96 | =================================== 97 | - Changed the parameter `steps` of pipenet from list to dict 98 | - Added method `summary` to pipenet 99 | - Corrected some major algorithmic issues in seasonal decomposition 100 | 101 | - Removed the STL decomposition transformer, and hence the corresponding option in the SeasonalAD detector 102 | - Recreated the classic seasonal decomposition transformer 103 | 104 | - Updated the demo notebook in the documentation 105 | - Added an option to hide the legend in the plotting function 106 | - Added some package setup options for developers 107 | - Fixed an issue with tracking Travis and Coveralls status 108 | - Some minor internal code optimization 109 | - Fixed some formatting issues and typos in the documentation 110 | 111 | Version 0.4.1 (Nov 21, 2019) 112 | =================================== 113 | - Fixed an issue with tox environments 114 | - Minor spelling/grammar fixes in the documentation 115 | 116 | Version 0.4.0 (Nov 18, 2019) 117 |
=================================== 118 | - Added support for Python 3.5 119 | - Better unit tests on dependencies 120 | - Minor typo fixes in the documentation 121 | - Minor code optimization 122 | - Added download statistics to README 123 | - Added coverage test 124 | 125 | Version 0.3.0 (Sep 27, 2019) 126 | =================================== 127 | - Initial release -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx>=2.0 2 | sphinx_rtd_theme<0.5 3 | nbsphinx>=0.4 4 | python-dateutil>=2.5 5 | jupyter>=1 6 | -------------------------------------------------------------------------------- /docs/userguide.rst: -------------------------------------------------------------------------------- 1 | .. _userguide: 2 | 3 | ********** 4 | User Guide 5 | ********** 6 | 7 | This is a brief guide on how to build an anomaly detection model for time series with ADTK. We recommend that all users read through this guide before starting to use ADTK. 8 | 9 | 10 | - `Unsupervised vs. Supervised`_ 11 | - `Anomaly Types`_ 12 | - `Univariate vs. Multivariate`_ 13 | - `Detector, Transformer, Aggregator, and Pipe`_ 14 | 15 | ---------- 16 | 17 | Unsupervised vs. Supervised 18 | =================================== 19 | 20 | The first thing a user needs to decide before building a model is whether to formulate the problem as a supervised learning problem or an unsupervised problem. Supervised learning methods train models based on time series and normal/anomalous labels in the training set, while unsupervised methods build models based only on time series and domain knowledge, and do not require labeled data. 21 | 22 | Real-world anomaly detection problems usually suffer from a lack of labeled historical anomalies, which may prevent users from building a robust supervised model. In this case, an unsupervised/rule-based method is a better choice. ADTK is a package for unsupervised/rule-based models of time series anomaly detection. If a user formulates a task as a supervised learning problem, alternative tools will be needed. 23 | 24 | Anomaly Types 25 | ===================== 26 | 27 | Anomaly is a broad concept that may refer to many different types of events in time series. A spike in value, a shift in volatility, a violation of a seasonal pattern, etc. could all be anomalous or normal, depending on the specific context. ADTK offers a set of common components that can be combined into various types of anomaly detection models for different scenarios. However, ADTK does not select or build a model for a user automatically. A user should know what type of anomaly to detect, and can therefore build a model accordingly. 28 | 29 | Outlier 30 | ``````` 31 | 32 | An *outlier* is a data point whose value is significantly different from others. An outlier point in a time series exceeds the normal range of the series, without considering the temporal relationship between data points. In other words, even regarding all data points as time-independent, an outlier point still stands out. 33 | 34 | .. figure:: images/spike.png 35 | :width: 600px 36 | :align: center 37 | :height: 200px 38 | :alt: spike 39 | 40 | Outlier 41 | 42 | To detect outliers, the *normal range* of time series values is what a detector needs to learn. It can be defined with user-given absolute thresholds (:py:mod:`adtk.detector.ThresholdAD`).
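For instance, a minimal sketch of the threshold-based approach looks like the following (here ``s`` stands for a validated pandas Series as returned by :py:mod:`adtk.data.validate_series`, and the threshold values are illustrative placeholders rather than recommended settings):

.. code-block:: python

    >>> from adtk.detector import ThresholdAD
    >>> # flag any value above `high` or below `low` as anomalous;
    >>> # the limits come from domain knowledge, not from the data
    >>> threshold_ad = ThresholdAD(low=15.0, high=30.0)
    >>> anomalies = threshold_ad.detect(s)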
Alternatively, a user may create a detector to learn the normal range from historical data (:py:mod:`adtk.detector.QuantileAD`, :py:mod:`adtk.detector.InterQuartileRangeAD`, and :py:mod:`adtk.detector.GeneralizedESDTestAD`). 43 | 44 | 45 | **Outlier is the most basic type of anomaly. Anomaly detection methods targeting other types often transform a time series into a new one to which outlier detection is applied. Most advanced detectors in ADTK follow this strategy.** 46 | 47 | Spike and Level Shift 48 | ````````````````````` 49 | 50 | In some situations, whether a time point is normal depends on whether its value is aligned with its near past. An abrupt increase or decrease in value is called a *spike* if the change is temporary, or a *level shift* if the change is permanent. Please note that, although a spike appears similar to an outlier, it is time-dependent, while an outlier is time-independent. The value of a spike could be normal if examined against all data points without considering temporal order (see figure below). 51 | 52 | .. figure:: images/local_spike.png 53 | :width: 600px 54 | :align: center 55 | :height: 200px 56 | :alt: local_spike 57 | 58 | Spike 59 | 60 | .. figure:: images/level_shift.png 61 | :width: 600px 62 | :align: center 63 | :height: 200px 64 | :alt: level_shift 65 | 66 | Level shift 67 | 68 | We may slide two time windows side-by-side and keep tracking the difference between their mean or median values. This difference over time, which is a new time series, is examined by an outlier detector. Whenever the statistics in the left and right windows are significantly different, it indicates an abrupt change around this time point. The length of the time windows controls the time scale of changes to detect: for spikes, the left window is longer than the right one to capture representative information of the near past; on the other hand, for level shifts, both windows should be long enough to capture a stable state. 69 | 70 | :py:mod:`adtk.detector.PersistAD` and :py:mod:`adtk.detector.LevelShiftAD` are detectors of spikes and level shifts respectively. Both are implemented with the transformer :py:mod:`adtk.transformer.DoubleRollingAggregate`, which transforms a time series into a new series using two time windows as mentioned above. 71 | 72 | .. figure:: images/level_shift_double_rolling.png 73 | :width: 600px 74 | :align: center 75 | :height: 400px 76 | :alt: level_shift_double_rolling 77 | 78 | Transform a time series with a level shift using `DoubleRollingAggregate` with mean as the time window statistic. 79 | 80 | Pattern Change 81 | `````````````` 82 | The strategy mentioned above can be generalized to detect shifts of patterns other than value. For example, if shifts of volatility are of interest, the statistic to track in the time windows can be the standard deviation instead of the mean/median. :py:mod:`adtk.transformer.DoubleRollingAggregate` supports 16 common statistics that can be used to quantify the pattern of interest. 83 | 84 | .. figure:: images/volatility_shift_double_rolling.png 85 | :width: 600px 86 | :align: center 87 | :height: 400px 88 | :alt: volatility_shift_double_rolling 89 | 90 | Transform a time series with a volatility level shift using `DoubleRollingAggregate` with standard deviation as the metric. 91 | 92 | For detecting temporal changes of pattern, :py:mod:`adtk.transformer.RollingAggregate` could also be a good choice. It slides a time window and returns a statistic measured inside the window that quantifies a temporal pattern.
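A minimal sketch of this transformer is shown below (``s`` is assumed to be a validated pandas Series, e.g. per-minute request counts; both the window size and the choice of the non-zero-count aggregation are illustrative assumptions, not prescribed defaults):

.. code-block:: python

    >>> from adtk.transformer import RollingAggregate
    >>> # track the number of non-zero values inside a sliding window of 10 time steps;
    >>> # unusually high values of the resulting series can then be flagged by an outlier detector
    >>> s_rolling_count = RollingAggregate(agg="nonzero_count", window=10).transform(s)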
For example, if a user wants to detect a temporarily anomalously high number of visits to a system, tracking the number of visits in a sliding window is an effective approach. 93 | 94 | .. figure:: images/non_zeros_count.png 95 | :width: 600px 96 | :align: center 97 | :height: 400px 98 | :alt: non_zeros_count 99 | 100 | Transform a time series with a temporarily high frequency of requests using `RollingAggregate` with the number of non-zero values as the metric. 101 | 102 | Seasonality 103 | ``````````` 104 | A seasonal pattern exists when a time series is influenced by seasonal factors (e.g. the hour of the day, the day of the week, the month of the year). Detector :py:mod:`adtk.detector.SeasonalAD` uses transformer :py:mod:`adtk.transformer.ClassicSeasonalDecomposition` to remove the seasonal pattern from the original time series, and highlights time periods when the time series does not follow the seasonal pattern normally, by examining the residual series. 105 | 106 | .. figure:: images/seasonal.png 107 | :width: 600px 108 | :align: center 109 | :height: 400px 110 | :alt: seasonal 111 | 112 | Remove the seasonal pattern from a time series of NYC traffic using `ClassicSeasonalDecomposition` with the period as a week (data from `Numenta Anomaly Benchmark `_) 113 | 114 | A user needs to be careful about distinguishing between seasonal series and cyclic series. A seasonal series always has a fixed, usually interpretable and known, period because of its seasonal nature. A cyclic time series does not follow a fixed periodic pattern because of its physical nature, even if it appears to repeat similar subseries. For example, the trajectory of a moving part in rotating equipment is a 3-D cyclic time series, whose cycle length depends on rotation speed and is not necessarily fixed. Applying seasonality decomposition to it would be problematic, because every cycle may last a slightly different length, and decomposition residuals will be misleading for anomaly detection purposes. 115 | 116 | .. figure:: images/cyclic.png 117 | :width: 600px 118 | :align: center 119 | :height: 400px 120 | :alt: cyclic 121 | 122 | Applying `ClassicSeasonalDecomposition` to a cyclic series fails to detect anomalous behavior. 123 | 124 | Currently, ADTK does not provide a transformer that removes cyclic patterns from cyclic (but not seasonal) time series. However, :py:mod:`adtk.detector.AutoregressionAD` can capture changes in the autoregressive relationship (the relationship between a data point and points in its near past) and could be used for cyclic (but not seasonal) series in some situations. 125 | 126 | 127 | Univariate vs. Multivariate 128 | =========================== 129 | 130 | If the time series to detect anomalies from is univariate, anomaly detection models should use univariate transformers in :py:mod:`adtk.transformer` and univariate detectors in :py:mod:`adtk.detector`. 131 | 132 | If the time series is multivariate, a user should understand whether the anomaly detection task is *separable* over series or not. In many cases, detecting anomalies along each series in parallel satisfies the need. For example, if a user has a two-dimensional time series, temperature and humidity, and is trying to detect anomalous temperature or humidity, then applying a univariate detector to temperature and humidity respectively and then aggregating the results will satisfy the need. For users' convenience, when a univariate detector or univariate transformer is applied to a multivariate series (i.e.
pandas DataFrame), it applies to every series automatically. 133 | 134 | Sometimes, a user needs to use intrinsically multivariate algorithms, if the type of anomalies to detect cannot be represented by single dimensions separately. Continuing the previous example, if the user tries to detect an anomalous `heat index `_ (a hybrid metric of temperature and humidity), multivariate transformers and detectors should be considered, because anomalies must be detected based on temperature and humidity simultaneously. 135 | 136 | Detector, Transformer, Aggregator, and Pipe 137 | =========================================== 138 | 139 | ADTK provides three types of components to be combined into a model. 140 | A detector is a component that scans a time series and returns anomalous time points. All detectors are included in module :py:mod:`adtk.detector`. 141 | A transformer is a component that transforms a time series such that useful information is extracted. It can also be interpreted as a feature engineering component. All transformers are included in module :py:mod:`adtk.transformer`. 142 | An aggregator is a component that combines different detection results (anomaly lists). It is an ensemble component. All aggregators are included in module :py:mod:`adtk.aggregator`. 143 | 144 | A model can be a single detector or a combination of multiple components. If the combination is sequential, i.e. one or several transformers connected with a detector sequentially, it can be connected by an :py:mod:`adtk.pipe.Pipeline` object. If the combination is more complicated and not sequential, it can be connected by an :py:mod:`adtk.pipe.Pipenet` object. 145 | Many detectors in :py:mod:`adtk.detector` are internally implemented as a Pipeline or Pipenet object, but are listed in module :py:mod:`adtk.detector` for users' convenience. 146 | 147 | For any component that has not yet been implemented, a user may implement it as a function and use components :py:mod:`adtk.detector.CustomizedDetector1D`, :py:mod:`adtk.detector.CustomizedDetectorHD`, :py:mod:`adtk.transformer.CustomizedTransformer1D`, :py:mod:`adtk.transformer.CustomizedTransformerHD`, or :py:mod:`adtk.aggregator.CustomizedAggregator` to convert the function into an ADTK component. The component then has the unified APIs and can be used as a normal ADTK component (for example, to be connected with other components using Pipeline or Pipenet). Users are always welcome to contribute their algorithms to the package permanently. More information for contributors can be found in :ref:`developer`. 148 | 149 | A user may check :ref:`examples` for examples of ADTK components.
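As a closing illustration, a sequential combination of a transformer and a detector, as described in this section, could be assembled as follows (a minimal sketch: ``s`` stands for a validated pandas Series, and the step names, window size, and quantile cutoff are arbitrary placeholders):

.. code-block:: python

    >>> from adtk.detector import QuantileAD
    >>> from adtk.pipe import Pipeline
    >>> from adtk.transformer import DoubleRollingAggregate
    >>> # transform the series with two sliding windows, then flag
    >>> # extreme values of the transformed series as anomalies
    >>> steps = [
    ...     ("level_change", DoubleRollingAggregate(agg="mean", window=5, diff="l1")),
    ...     ("quantile_ad", QuantileAD(high=0.99)),
    ... ]
    >>> pipeline = Pipeline(steps)
    >>> anomalies = pipeline.fit_detect(s)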
150 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | # mypy.ini 2 | [mypy] 3 | disallow_untyped_defs = True 4 | disallow_untyped_calls = True 5 | 6 | [mypy-scipy.stats] 7 | ignore_missing_imports = True 8 | 9 | [mypy-sklearn.decomposition] 10 | ignore_missing_imports = True 11 | 12 | [mypy-sklearn.linear_model] 13 | ignore_missing_imports = True 14 | 15 | [mypy-numpy] 16 | ignore_missing_imports = True 17 | 18 | [mypy-pandas] 19 | ignore_missing_imports = True 20 | 21 | [mypy-matplotlib] 22 | ignore_missing_imports = True 23 | 24 | [mypy-matplotlib.pyplot] 25 | ignore_missing_imports = True 26 | 27 | [mypy-matplotlib.collections] 28 | ignore_missing_imports = True 29 | 30 | [mypy-matplotlib.lines] 31 | ignore_missing_imports = True 32 | 33 | [mypy-matplotlib.patches] 34 | ignore_missing_imports = True 35 | 36 | [mypy-statsmodels.tsa.seasonal] 37 | ignore_missing_imports = True 38 | 39 | [mypy-statsmodels.tsa.stattools] 40 | ignore_missing_imports = True 41 | 42 | [mypy-pandas.plotting] 43 | ignore_missing_imports = True 44 | 45 | [mypy-statsmodels] 46 | ignore_missing_imports = True 47 | 48 | ; we didn't typing the visualization module because there are a lot recursion 49 | ; on nested tree structure which would be messy if we type rigorously 50 | [mypy-adtk.visualization.*] 51 | ignore_errors = True 52 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 79 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = adtk 3 | version = 0.6.2 4 | author = Arundo Analytics, Inc. 
5 | maintainer = Tailai Wen 6 | maintainer_email = tailai.wen@arundo.com 7 | url = https://github.com/arundo/adtk 8 | description = A package for unsupervised time series anomaly detection 9 | long_description = file: README.md 10 | long_description_content_type= text/markdown 11 | keywords = anomaly detection, time series 12 | classifiers = 13 | Development Status :: 5 - Production/Stable 14 | License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0) 15 | Topic :: Scientific/Engineering 16 | Programming Language :: Python :: 3.5 17 | Programming Language :: Python :: 3.6 18 | Programming Language :: Python :: 3.7 19 | Programming Language :: Python :: 3.8 20 | Operating System :: POSIX :: Linux 21 | Operating System :: Unix 22 | Operating System :: MacOS 23 | Operating System :: Microsoft :: Windows 24 | license = Mozilla Public License 2.0 (MPL 2.0) 25 | 26 | [options] 27 | zip_safe = False 28 | python_requires = >=3.5 29 | package_dir = 30 | =src 31 | packages = find: 32 | install_requires = 33 | numpy>=1.15 34 | pandas>=0.23 35 | matplotlib>=3.0 36 | scikit-learn>=0.20 37 | statsmodels>=0.9 38 | packaging>=17.0 39 | tabulate>=0.8 40 | 41 | [options.packages.find] 42 | where = src 43 | exclude = 44 | tests 45 | docs 46 | 47 | [options.extras_require] 48 | test = 49 | pytest>=4 50 | tox>=3 51 | coverage>3.6,<5 52 | pytest-cov>=2.7 53 | coveralls>=1.7 54 | mypy>=0.641 55 | doc = 56 | sphinx>=2.4,<3 57 | sphinx_rtd_theme<0.5 58 | nbsphinx>=0.4 59 | python-dateutil>=2.5 60 | jupyter>=1 61 | dev = 62 | black==19.3b0 63 | isort==4.3.21 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup() 4 | -------------------------------------------------------------------------------- /src/adtk/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Anomaly Detection Toolkit 3 | ========================= 4 | 5 | Anomaly Detection Toolkit (ADTK) is a Python package for unsupervised / 6 | rule-based time series anomaly detection. 7 | 8 | As the nature of anomaly varies over different cases, a model may not work 9 | universally for all anomaly detection problems. Choosing and combining 10 | detection algorithms (detectors), feature engineering methods (transformers), 11 | and ensemble methods (aggregators) properly is the key to build an effective 12 | anomaly detection model. 13 | 14 | This package offers a set of common detectors, transformers and aggregators 15 | with unified APIs, as well as pipe classes that connect them together into 16 | models. It also provides some functions to process and visualize time series 17 | and anomaly events. 18 | 19 | See https://adtk.readthedocs.io for complete documentation. 
20 | 21 | """ 22 | 23 | __version__ = "0.6.2" 24 | -------------------------------------------------------------------------------- /src/adtk/_aggregator_base.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple, Union 2 | 3 | import pandas as pd 4 | 5 | from ._base import _NonTrainableModel 6 | 7 | 8 | class _Aggregator(_NonTrainableModel): 9 | def _predict( 10 | self, 11 | lists: Union[ 12 | pd.DataFrame, 13 | Dict[str, Union[pd.Series, pd.DataFrame]], 14 | Dict[ 15 | str, 16 | List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]], 17 | ], 18 | ], 19 | ) -> Union[ 20 | pd.Series, List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]] 21 | ]: 22 | if isinstance(lists, dict): 23 | if not ( 24 | all([isinstance(lst, list) for lst in lists.values()]) 25 | or all( 26 | [ 27 | isinstance(lst, (pd.Series, pd.DataFrame)) 28 | for lst in lists.values() 29 | ] 30 | ) 31 | ): 32 | raise TypeError( 33 | "Input must be a pandas DataFrame, a dict of lists, or a " 34 | "dict of pandas Series/DataFrame." 35 | ) 36 | elif isinstance(lists, pd.DataFrame): 37 | pass 38 | else: 39 | raise TypeError( 40 | "Input must be a pandas DataFrame, a dict of lists, or a dict " 41 | "of pandas Series/DataFrame." 42 | ) 43 | return self._predict_core(lists) 44 | 45 | def predict( 46 | self, 47 | lists: Union[ 48 | pd.DataFrame, 49 | Dict[str, Union[pd.Series, pd.DataFrame]], 50 | Dict[ 51 | str, 52 | List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]], 53 | ], 54 | ], 55 | ) -> Union[ 56 | pd.Series, List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]] 57 | ]: 58 | """Aggregate multiple lists of anomalies into one. 59 | 60 | Parameters 61 | ---------- 62 | lists: pandas.DataFrame, a dict of Series/DataFrame, or a dict of lists 63 | Anomaly lists to be aggregated. 64 | 65 | - If a pandas DataFrame, every column is a binary Series 66 | representing a type of anomaly. 67 | - If a dict of pandas Series/DataFrame, every value of the dict is 68 | a binary Series/DataFrame representing a type or some types of 69 | anomaly; 70 | - If a dict of list, every value of the dict is a type of anomaly 71 | as a list of events, where each event is represented as a pandas 72 | Timestamp if it is instantaneous or a 2-tuple of pandas 73 | Timestamps if it is a closed time interval. 74 | 75 | Returns 76 | ------- 77 | list or a binary pandas Series 78 | Aggregated list of anomalies. 79 | 80 | - If input is a pandas DataFrame or a dict of Series/DataFrame, 81 | return a single binary pandas Series; 82 | - If input is a dict of lists, return a single list of events. 83 | 84 | """ 85 | return self._predict(lists) 86 | 87 | aggregate = predict 88 | -------------------------------------------------------------------------------- /src/adtk/_base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from copy import deepcopy 3 | from typing import Any, Dict, List, Tuple, Union 4 | 5 | import pandas as pd 6 | 7 | 8 | class _Model(ABC): 9 | "Base class for all models (detectors, transformers, and aggregators)." 10 | 11 | def __init__(self, *args: Any, **kwargs: Any) -> None: 12 | pass 13 | 14 | def get_params(self) -> Dict[str, Any]: 15 | """Get the parameters of this model. 16 | 17 | Returns 18 | ------- 19 | dict 20 | Model parameters. 
21 | 22 | """ 23 | return {key: getattr(self, key) for key in self._param_names} 24 | 25 | def set_params(self, **params: Any) -> None: 26 | """Set the parameters of this model. 27 | 28 | Parameters 29 | ---------- 30 | **params 31 | Model parameters to set. 32 | 33 | """ 34 | for key in params.keys(): 35 | if key not in self._param_names: 36 | raise KeyError( 37 | "'{}' is not a valid parameter name.".format(key) 38 | ) 39 | for key, value in params.items(): 40 | setattr(self, key, value) 41 | 42 | @property 43 | @abstractmethod 44 | def _param_names(self) -> Tuple[str, ...]: 45 | return tuple() 46 | 47 | 48 | class _NonTrainableModel(_Model): 49 | "Base class of models that do not need training." 50 | 51 | @abstractmethod 52 | def _predict(self, input: Any) -> Any: 53 | pass 54 | 55 | @abstractmethod 56 | def _predict_core(self, input: Any) -> Any: 57 | pass 58 | 59 | @abstractmethod 60 | def predict(self, input: Any) -> Any: 61 | pass 62 | 63 | 64 | class _TrainableModel(_Model): 65 | "Base class of models that need training." 66 | 67 | def __init__(self, *args: Any, **kwargs: Any) -> None: 68 | super().__init__(*args, **kwargs) 69 | # 0 for not fitted, 1 for fitted, 2 for univariate model fitted by DF 70 | self._fitted = 0 # type: int 71 | 72 | @abstractmethod 73 | def _fit(self, input: Any) -> None: 74 | pass 75 | 76 | @abstractmethod 77 | def _fit_core(self, input: Any) -> None: 78 | pass 79 | 80 | @abstractmethod 81 | def fit(self, input: Any) -> None: 82 | pass 83 | 84 | @abstractmethod 85 | def _predict(self, input: Any) -> Any: 86 | pass 87 | 88 | @abstractmethod 89 | def _predict_core(self, input: Any) -> Any: 90 | pass 91 | 92 | @abstractmethod 93 | def predict(self, input: Any) -> Any: 94 | pass 95 | 96 | @abstractmethod 97 | def fit_predict(self, input: Any) -> Any: 98 | pass 99 | 100 | 101 | class _NonTrainableUnivariateModel(_NonTrainableModel): 102 | "Base class of univariate detectors and transformers." 103 | 104 | def _predict( 105 | self, ts: Union[pd.Series, pd.DataFrame] 106 | ) -> Union[pd.Series, pd.DataFrame]: 107 | if isinstance(ts, pd.Series): 108 | s = ts.copy() # type: pd.Series 109 | if not isinstance(s.index, pd.DatetimeIndex): 110 | raise TypeError( 111 | "Index of the input time series must be a pandas " 112 | "DatetimeIndex object." 113 | ) 114 | predicted = self._predict_core(s) 115 | # if a Series-to-Series operation, make sure Series name keeps 116 | if isinstance(predicted, pd.Series): 117 | predicted.name = ts.name 118 | elif isinstance(ts, pd.DataFrame): 119 | df = ts.copy() # type: pd.DataFrame 120 | if df.columns.duplicated().any(): 121 | raise ValueError( 122 | "Input DataFrame must have unique column names." 123 | ) 124 | # apply the model to each column 125 | predicted_all_cols = [] 126 | for col in df.columns: 127 | predicted_this_col = self._predict(df[col]) 128 | # if a Series-to-DF operation, update column name 129 | if isinstance(predicted_this_col, pd.DataFrame): 130 | predicted_this_col = predicted_this_col.rename( 131 | columns={ 132 | col1: "{}_{}".format(col, col1) 133 | for col1 in predicted_this_col.columns 134 | } 135 | ) 136 | predicted_all_cols.append(predicted_this_col) 137 | predicted = pd.concat(predicted_all_cols, axis=1) 138 | else: 139 | raise TypeError("Input must be a pandas Series or DataFrame.") 140 | # make sure index freq is the same (because pandas has a bug that some 141 | # operation, e.g. 
concat, may change freq) 142 | predicted.index.freq = ts.index.freq 143 | return predicted 144 | 145 | 146 | class _TrainableUnivariateModel(_TrainableModel): 147 | def __init__(self, *args: Any, **kwargs: Any) -> None: 148 | super().__init__(*args, **kwargs) 149 | self._models = dict() # type: Dict[str, _TrainableUnivariateModel] 150 | 151 | def _fit(self, ts: Union[pd.Series, pd.DataFrame]) -> None: 152 | if isinstance(ts, pd.Series): 153 | s = ts.copy() # type: pd.Series 154 | self._fit_core(s) 155 | self._fitted = 1 156 | elif isinstance(ts, pd.DataFrame): 157 | df = ts.copy() 158 | if not isinstance(df.index, pd.DatetimeIndex): 159 | raise TypeError( 160 | "Index of the input time series must be a pandas " 161 | "DatetimeIndex object." 162 | ) 163 | if df.columns.duplicated().any(): 164 | raise ValueError( 165 | "Input DataFrame must have unique column names." 166 | ) 167 | # create internal models 168 | self._models = { 169 | col: self.__class__(**deepcopy(self.get_params())) 170 | for col in df.columns 171 | } 172 | # fit model for each column 173 | for col in df.columns: 174 | self._models[col].fit(df[col]) 175 | self._fitted = 2 176 | else: 177 | raise TypeError("Input must be a pandas Series or DataFrame.") 178 | 179 | def _predict( 180 | self, ts: Union[pd.Series, pd.DataFrame] 181 | ) -> Union[pd.Series, pd.DataFrame]: 182 | if self._fitted == 0: 183 | raise RuntimeError("The model must be trained first.") 184 | 185 | if isinstance(ts, pd.Series): 186 | if self._fitted == 2: 187 | raise RuntimeError( 188 | "The model was trained by a pandas DataFrame object, " 189 | "it can only be applied to a pandas DataFrame object with " 190 | "the same column names as the one used for training." 191 | ) 192 | s = ts.copy() 193 | if not isinstance(s.index, pd.DatetimeIndex): 194 | raise TypeError( 195 | "Index of the input time series must be a pandas " 196 | "DatetimeIndex object." 197 | ) 198 | predicted = self._predict_core(s) 199 | # if a Series-to-Series operation, make sure Series name keeps 200 | if isinstance(predicted, pd.Series): 201 | predicted.name = ts.name 202 | elif isinstance(ts, pd.DataFrame): 203 | df = ts.copy() 204 | if not isinstance(df.index, pd.DatetimeIndex): 205 | raise TypeError( 206 | "Index of the input time series must be a pandas " 207 | "DatetimeIndex object." 208 | ) 209 | if df.columns.duplicated().any(): 210 | raise ValueError( 211 | "Input DataFrame must have unique column names." 
212 | ) 213 | if self._fitted == 1: 214 | # apply the model to each column 215 | predicted_all_cols = [] 216 | for col in df.columns: 217 | predicted_this_col = self._predict(df[col]) 218 | if isinstance(predicted_this_col, pd.DataFrame): 219 | predicted_this_col = predicted_this_col.rename( 220 | columns={ 221 | col1: "{}_{}".format(col, col1) 222 | for col1 in predicted_this_col.columns 223 | } 224 | ) 225 | predicted_all_cols.append(predicted_this_col) 226 | predicted = pd.concat(predicted_all_cols, axis=1) 227 | else: 228 | # predict for each column 229 | if not (set(self._models.keys()) >= set(df.columns)): 230 | raise ValueError( 231 | "The model was trained by a pandas DataFrame with " 232 | "columns {}, but the input DataFrame contains columns " 233 | "{} which are unknown to the model.".format( 234 | list(set(self._models.keys())), 235 | list(set(df.columns) - set(self._models.keys())), 236 | ) 237 | ) 238 | predicted = pd.concat( 239 | [ 240 | self._models[col]._predict(df[col]) 241 | for col in df.columns 242 | ], 243 | axis=1, 244 | ) 245 | else: 246 | raise TypeError("Input must be a pandas Series or DataFrame.") 247 | # make sure index freq is the same (because pandas has a bug that some 248 | # operation, e.g. concat, may change freq) 249 | predicted.index.freq = ts.index.freq 250 | return predicted 251 | 252 | 253 | class _NonTrainableMultivariateModel(_NonTrainableModel): 254 | def _predict(self, df: pd.DataFrame) -> Union[pd.Series, pd.DataFrame]: 255 | if isinstance(df, pd.DataFrame): 256 | if df.columns.duplicated().any(): 257 | raise ValueError( 258 | "Input DataFrame must have unique column names." 259 | ) 260 | df_copy = df.copy() 261 | predicted = self._predict_core(df_copy) 262 | else: 263 | raise TypeError("Input must be a pandas DataFrame.") 264 | # make sure index freq is the same (because pandas has a bug that some 265 | # operation, e.g. concat, may change freq) 266 | predicted.index.freq = df.index.freq 267 | return predicted 268 | 269 | 270 | class _TrainableMultivariateModel(_TrainableModel): 271 | def __init__(self, *args: Any, **kwargs: Any) -> None: 272 | super().__init__(*args, **kwargs) 273 | self._cols = [] # type: List[str] 274 | 275 | def _fit(self, df: pd.DataFrame) -> None: 276 | if isinstance(df, pd.DataFrame): 277 | if df.columns.duplicated().any(): 278 | raise ValueError( 279 | "Input DataFrame must have unique column names." 280 | ) 281 | df_copy = df.copy() 282 | self._fit_core(df_copy) 283 | else: 284 | raise TypeError("Input must be a pandas DataFrame.") 285 | self._cols = list(df.columns) 286 | self._fitted = 1 287 | 288 | def _predict(self, df: pd.DataFrame) -> Union[pd.Series, pd.DataFrame]: 289 | if self._fitted == 0: 290 | raise RuntimeError("The model must be trained first.") 291 | if isinstance(df, pd.DataFrame): 292 | if df.columns.duplicated().any(): 293 | raise ValueError( 294 | "Input DataFrame must have unique column names." 
295 | ) 296 | if not (set(df.columns) >= set(self._cols)): 297 | raise ValueError( 298 | "The model was trained by a pandas DataFrame with columns " 299 | "{}, but the input DataFrame does not contain columns {}.".format( 300 | self._cols, list(set(self._cols) - set(df.columns)) 301 | ) 302 | ) 303 | df_copy = ( 304 | df.loc[:, self._cols].copy() if self._cols else df.copy() 305 | ) # in a customized hd model that doesn't need fit, self._cols is empty 306 | predicted = self._predict_core(df_copy) 307 | else: 308 | raise TypeError("Input must be a pandas DataFrame.") 309 | # make sure index freq is the same (because pandas has a bug that some 310 | # operation, e.g. concat, may change freq) 311 | predicted.index.freq = df.index.freq 312 | return predicted 313 | -------------------------------------------------------------------------------- /src/adtk/_transformer_base.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import pandas as pd 4 | 5 | from ._base import ( 6 | _NonTrainableMultivariateModel, 7 | _NonTrainableUnivariateModel, 8 | _TrainableMultivariateModel, 9 | _TrainableUnivariateModel, 10 | ) 11 | 12 | 13 | class _NonTrainableUnivariateTransformer(_NonTrainableUnivariateModel): 14 | def predict( 15 | self, ts: Union[pd.Series, pd.DataFrame] 16 | ) -> Union[pd.Series, pd.DataFrame]: 17 | """Transform time series. 18 | 19 | Parameters 20 | ---------- 21 | ts: pandas.Series or pandas.DataFrame 22 | Time series to be transformed. If a DataFrame with k columns, it is 23 | treated as k independent univariate time series and the transformer 24 | will be applied to each univariate series independently. 25 | 26 | Returns 27 | ------- 28 | pandas.Series or pandas.DataFrame 29 | Transformed time series. 30 | 31 | """ 32 | return self._predict(ts) 33 | 34 | transform = predict 35 | 36 | 37 | class _TrainableUnivariateTransformer(_TrainableUnivariateModel): 38 | def fit(self, ts: Union[pd.Series, pd.DataFrame]) -> None: 39 | """Train the transformer with given time series. 40 | 41 | Parameters 42 | ---------- 43 | ts: pandas.Series or pandas.DataFrame 44 | Time series to be used to train the transformer. 45 | If a DataFrame with k columns, k univariate transformers will be 46 | trained independently. 47 | 48 | """ 49 | self._fit(ts) 50 | 51 | def predict( 52 | self, ts: Union[pd.Series, pd.DataFrame] 53 | ) -> Union[pd.Series, pd.DataFrame]: 54 | """Transform time series. 55 | 56 | Parameters 57 | ---------- 58 | ts: pandas.Series or pandas.DataFrame 59 | Time series to be transformed. If a DataFrame with k columns, it is 60 | treated as k independent univariate time series. 61 | 62 | - If the transformer was trained with a Series, the transformer 63 | will be applied to each univariate series independently; 64 | - If the transformer was trained with a DataFrame, i.e. the 65 | transformer is essentially k transformers, those transformers 66 | will be applied to each univariate series respectively. 67 | 68 | Returns 69 | ------- 70 | pandas.Series or pandas.DataFrame 71 | Transformed time series. 72 | 73 | """ 74 | return self._predict(ts) 75 | 76 | def fit_predict( 77 | self, ts: Union[pd.Series, pd.DataFrame] 78 | ) -> Union[pd.Series, pd.DataFrame]: 79 | """Train the transformer, and tranform the time series used for 80 | training. 81 | 82 | Parameters 83 | ---------- 84 | ts: pandas.Series or pandas.DataFrame 85 | Time series to be used for training and be transformed. 
86 | If a DataFrame with k columns, it is treated as k independent 87 | univariate time series, and k univariate transformers will be 88 | trained and applied to each series independently. 89 | 90 | Returns 91 | ------- 92 | pandas.Series or pandas.DataFrame 93 | Transformed time series. 94 | 95 | """ 96 | self.fit(ts) 97 | return self.predict(ts) 98 | 99 | transform = predict 100 | fit_transform = fit_predict 101 | 102 | 103 | class _NonTrainableMultivariateTransformer(_NonTrainableMultivariateModel): 104 | def predict(self, df: pd.DataFrame) -> Union[pd.Series, pd.DataFrame]: 105 | """Transform time series. 106 | 107 | Parameters 108 | ---------- 109 | df: pandas.DataFrame 110 | Time series to be transformed. 111 | 112 | Returns 113 | ------- 114 | pandas.Series or pandas.DataFrame 115 | Transformed time series. 116 | 117 | """ 118 | return self._predict(df) 119 | 120 | transform = predict 121 | 122 | 123 | class _TrainableMultivariateTransformer(_TrainableMultivariateModel): 124 | def fit(self, df: pd.DataFrame) -> None: 125 | """Train the transformer with given time series. 126 | 127 | Parameters 128 | ---------- 129 | df: pandas.DataFrame 130 | Time series to be used to train the transformer. 131 | 132 | """ 133 | self._fit(df) 134 | 135 | def predict(self, df: pd.DataFrame) -> Union[pd.Series, pd.DataFrame]: 136 | """Transform time series. 137 | 138 | Parameters 139 | ---------- 140 | df: pandas.DataFrame 141 | Time series to be transformed. 142 | 143 | Returns 144 | ------- 145 | pandas.Series or pandas.DataFrame 146 | Transformed time series. 147 | 148 | """ 149 | return self._predict(df) 150 | 151 | def fit_predict(self, df: pd.DataFrame) -> Union[pd.Series, pd.DataFrame]: 152 | """Train the transformer, and tranform the time series used for 153 | training. 154 | 155 | Parameters 156 | ---------- 157 | df: pandas.DataFrame 158 | Time series to be used for training and be transformed. 159 | 160 | Returns 161 | ------- 162 | pandas.Series or pandas.DataFrame 163 | Transformed time series. 164 | 165 | """ 166 | self.fit(df) 167 | return self.predict(df) 168 | 169 | transform = predict 170 | fit_transform = fit_predict 171 | -------------------------------------------------------------------------------- /src/adtk/_utils.py: -------------------------------------------------------------------------------- 1 | """Module for all utility functions. 2 | 3 | """ 4 | 5 | from typing import Dict, Optional, Type 6 | 7 | 8 | def _get_all_subclasses_from_superclass( 9 | superclass: Type 10 | ) -> Dict[str, Optional[str]]: 11 | result = dict() 12 | for sb in superclass.__subclasses__(): 13 | if sb.__name__[0] != "_": 14 | result.update({sb.__name__: sb.__doc__}) 15 | else: 16 | result.update(_get_all_subclasses_from_superclass(sb)) 17 | return result 18 | 19 | 20 | class PandasBugError(Exception): 21 | def __init__(self) -> None: 22 | msg = ( 23 | """Pandas before v0.25 has a known bug in method `rolling` when """ 24 | """parameter `window` is offset and `closed` is 'left'. Your """ 25 | """current execution is impacted by this bug. If you are using """ 26 | """Python 3.5.3 or later, please upgrade pandas to v0.25 or """ 27 | """later. 
If you are using Python 3.5.2 or earlier, please """ 28 | """consider using integer instead of offset to define the left """ 29 | """rolling window.""" 30 | ) 31 | super().__init__(msg) 32 | -------------------------------------------------------------------------------- /src/adtk/aggregator/__init__.py: -------------------------------------------------------------------------------- 1 | """Module of aggregators. 2 | 3 | An aggregator combines multiple lists of anomalies into one. 4 | 5 | """ 6 | from typing import Dict, Optional 7 | 8 | from .._aggregator_base import _Aggregator 9 | from .._utils import _get_all_subclasses_from_superclass 10 | from ._aggregator import AndAggregator, CustomizedAggregator, OrAggregator 11 | 12 | 13 | def print_all_models() -> None: 14 | """ 15 | Print description of every model in this module. 16 | """ 17 | model_desc = _get_all_subclasses_from_superclass( 18 | _Aggregator 19 | ) # type: Dict[str, Optional[str]] 20 | for key, value in model_desc.items(): 21 | print("-" * 80) 22 | print(key) 23 | print(value) 24 | 25 | 26 | __all__ = [ 27 | "OrAggregator", 28 | "AndAggregator", 29 | "CustomizedAggregator", 30 | "print_all_models", 31 | ] 32 | -------------------------------------------------------------------------------- /src/adtk/aggregator/_aggregator.py: -------------------------------------------------------------------------------- 1 | """Module for aggregators. 2 | 3 | An aggregator combines multiple lists of anomalies into one. 4 | 5 | """ 6 | 7 | from typing import Any, Callable, Dict, List, Optional, Tuple, Union 8 | 9 | import pandas as pd 10 | 11 | from .._aggregator_base import _Aggregator 12 | from ..data import validate_events 13 | 14 | 15 | class CustomizedAggregator(_Aggregator): 16 | """Aggregator derived from a user-given function and parameters. 17 | 18 | Parameters 19 | ---------- 20 | aggregate_func: function 21 | A function aggregating multiple types of anomaly. 22 | 23 | The first input argument must be a pandas DataFrame, a dict of pandas 24 | Series/DataFrame, or a dict of event lists. 25 | 26 | - If a pandas DataFrame, every column is a binary Series representing a 27 | type of anomaly. 28 | - If a dict of pandas Series/DataFrame, every value of the dict is a 29 | binary Series/DataFrame representing a type or some types of anomaly; 30 | - If a dict of list, every value of the dict is a type of anomaly as a 31 | list of events, where each event is represented as a pandas Timestamp 32 | if it is instantaneous or a 2-tuple of pandas Timestamps if it is a 33 | closed time interval. 34 | 35 | Optional input argument may be accepted through parameter 36 | `aggregate_func_params`. 37 | 38 | The output must be a list of pandas Timestamps. 39 | 40 | - If input is a pandas DataFrame or a dict of Series/DataFrame, return 41 | a single binary pandas Series; 42 | - If input is a dict of lists, return a single list of events. 43 | 44 | aggregate_func_params: dict, optional 45 | Parameters of `aggregate_func`. Default: None. 
46 | 47 | """ 48 | 49 | def __init__( 50 | self, 51 | aggregate_func: Callable, 52 | aggregate_func_params: Optional[Dict[str, Any]] = None, 53 | ) -> None: 54 | super().__init__() 55 | self.aggregate_func = aggregate_func 56 | self.aggregate_func_params = aggregate_func_params 57 | 58 | @property 59 | def _param_names(self) -> Tuple[str, ...]: 60 | return ("aggregate_func", "aggregate_func_params") 61 | 62 | def _predict_core( 63 | self, 64 | lists: Union[ 65 | pd.DataFrame, 66 | Dict[str, Union[pd.Series, pd.DataFrame]], 67 | Dict[ 68 | str, 69 | List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]], 70 | ], 71 | ], 72 | ) -> Union[ 73 | pd.Series, List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]] 74 | ]: 75 | if self.aggregate_func_params is None: 76 | aggregate_func_params = {} 77 | else: 78 | aggregate_func_params = self.aggregate_func_params 79 | return self.aggregate_func(lists, **aggregate_func_params) 80 | 81 | 82 | class OrAggregator(_Aggregator): 83 | """Aggregator that identifies a time point as anomalous as long as it is 84 | included in one of the input anomaly lists. 85 | """ 86 | 87 | def __init__(self) -> None: 88 | super().__init__() 89 | 90 | @property 91 | def _param_names(self) -> Tuple[str, ...]: 92 | return tuple() 93 | 94 | def _predict_core( 95 | self, 96 | lists: Union[ 97 | pd.DataFrame, 98 | Dict[str, Union[pd.Series, pd.DataFrame]], 99 | Dict[ 100 | str, 101 | List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]], 102 | ], 103 | ], 104 | ) -> Union[ 105 | pd.Series, List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]] 106 | ]: 107 | if isinstance(lists, dict): 108 | if isinstance(next(iter(lists.values())), list): 109 | clean_lists = { 110 | key: validate_events(value) for key, value in lists.items() 111 | } 112 | return validate_events( 113 | [ 114 | window 115 | for clean_predict in clean_lists.values() 116 | for window in clean_predict 117 | ] 118 | ) 119 | else: # a dict of pandas Series/DataFrame 120 | return self._predict_core( 121 | pd.concat(lists, join="outer", axis=1) 122 | ) 123 | else: # pandas DataFrame 124 | predicted = lists.any(axis=1) 125 | predicted[~predicted & lists.isna().any(axis=1)] = float("nan") 126 | return predicted 127 | 128 | 129 | class AndAggregator(_Aggregator): 130 | """Aggregator that identifies a time point as anomalous only if it is 131 | included in all the input anomaly lists. 
132 | """ 133 | 134 | def __init__(self) -> None: 135 | super().__init__() 136 | 137 | @property 138 | def _param_names(self) -> Tuple[str, ...]: 139 | return tuple() 140 | 141 | def _predict_core( 142 | self, 143 | lists: Union[ 144 | pd.DataFrame, 145 | Dict[str, Union[pd.Series, pd.DataFrame]], 146 | Dict[ 147 | str, 148 | List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]], 149 | ], 150 | ], 151 | ) -> Union[ 152 | pd.Series, List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]] 153 | ]: 154 | if isinstance(lists, dict): 155 | if isinstance(next(iter(lists.values())), list): 156 | clean_lists = { 157 | key: validate_events(value, point_as_interval=True) 158 | for key, value in lists.items() 159 | } 160 | time_window_stats = { 161 | key: pd.Series( 162 | [0] * len(clean_predict) 163 | + [1] * 2 * len(clean_predict) 164 | + [0] * len(clean_predict), 165 | index=( 166 | [ 167 | window[0] - pd.Timedelta("1ns") 168 | for window in clean_predict 169 | ] 170 | + [window[0] for window in clean_predict] 171 | + [window[1] for window in clean_predict] 172 | + [ 173 | window[1] + pd.Timedelta("1ns") 174 | for window in clean_predict 175 | ] 176 | ), 177 | dtype=int, 178 | ).sort_index() 179 | for key, clean_predict in clean_lists.items() 180 | } # type: Union[Dict, pd.Series] 181 | time_window_stats = { 182 | key: value[~value.index.duplicated()] 183 | for key, value in time_window_stats.items() 184 | } 185 | time_window_stats = ( 186 | pd.concat(time_window_stats, axis=1, join="outer") 187 | .fillna(method="ffill") 188 | .fillna(method="bfill") 189 | .fillna(0) 190 | ) 191 | time_window_stats = time_window_stats.all(axis=1) 192 | status = 0 193 | last_t = None 194 | aggregated_predict = [] 195 | for t, v in time_window_stats.items(): 196 | if (status == 0) and (v == 1): 197 | start = t 198 | status = 1 199 | if (status == 1) and (v == 0): 200 | end = last_t 201 | aggregated_predict.append((start, end)) 202 | status = 0 203 | last_t = t 204 | return validate_events(aggregated_predict) 205 | else: # a dict of pandas Series/DataFrame 206 | return self._predict_core( 207 | pd.concat(lists, join="outer", axis=1) 208 | ) 209 | else: # pandas DataFrame 210 | predicted = lists.all(axis=1) 211 | predicted[predicted & lists.isna().any(axis=1)] = float("nan") 212 | return predicted 213 | -------------------------------------------------------------------------------- /src/adtk/data/__init__.py: -------------------------------------------------------------------------------- 1 | """Module of data processing.""" 2 | 3 | from ._data import ( 4 | expand_events, 5 | split_train_test, 6 | to_events, 7 | to_labels, 8 | validate_events, 9 | validate_series, 10 | ) 11 | 12 | __all__ = [ 13 | "validate_series", 14 | "to_events", 15 | "to_labels", 16 | "expand_events", 17 | "validate_events", 18 | "split_train_test", 19 | ] 20 | -------------------------------------------------------------------------------- /src/adtk/detector/__init__.py: -------------------------------------------------------------------------------- 1 | """Module of detectors. 2 | 3 | A detector detects anomalous time points from time series. 
4 | 5 | """ 6 | from .._detector_base import ( # _NonTrainableMultivariateDetector, 7 | _NonTrainableUnivariateDetector, 8 | _TrainableMultivariateDetector, 9 | _TrainableUnivariateDetector, 10 | ) 11 | from .._utils import _get_all_subclasses_from_superclass 12 | from ._detector_1d import ( 13 | AutoregressionAD, 14 | CustomizedDetector1D, 15 | GeneralizedESDTestAD, 16 | InterQuartileRangeAD, 17 | LevelShiftAD, 18 | PersistAD, 19 | QuantileAD, 20 | SeasonalAD, 21 | ThresholdAD, 22 | VolatilityShiftAD, 23 | ) 24 | from ._detector_hd import ( 25 | CustomizedDetectorHD, 26 | MinClusterDetector, 27 | OutlierDetector, 28 | PcaAD, 29 | RegressionAD, 30 | ) 31 | 32 | 33 | def print_all_models() -> None: 34 | """ 35 | Print description of every model in this module. 36 | """ 37 | model_desc = _get_all_subclasses_from_superclass( 38 | _NonTrainableUnivariateDetector 39 | ) 40 | # model_desc.update( 41 | # _get_all_subclasses_from_superclass(_NonTrainableMultivariateDetector) 42 | # ) 43 | model_desc.update( 44 | _get_all_subclasses_from_superclass(_TrainableUnivariateDetector) 45 | ) 46 | model_desc.update( 47 | _get_all_subclasses_from_superclass(_TrainableMultivariateDetector) 48 | ) 49 | for key, value in model_desc.items(): 50 | print("-" * 80) 51 | print(key) 52 | print(value) 53 | 54 | 55 | __all__ = [ 56 | "ThresholdAD", 57 | "QuantileAD", 58 | "InterQuartileRangeAD", 59 | "GeneralizedESDTestAD", 60 | "PersistAD", 61 | "LevelShiftAD", 62 | "VolatilityShiftAD", 63 | "AutoregressionAD", 64 | "SeasonalAD", 65 | "CustomizedDetector1D", 66 | "MinClusterDetector", 67 | "OutlierDetector", 68 | "RegressionAD", 69 | "PcaAD", 70 | "CustomizedDetectorHD", 71 | "print_all_models", 72 | ] 73 | -------------------------------------------------------------------------------- /src/adtk/detector/_detector_hd.py: -------------------------------------------------------------------------------- 1 | """Module for high-dimensional detectors. 2 | 3 | High-dimensional detectors detect anomalies from high-dimensional time series, 4 | i.e. from pandas DataFrame. 5 | """ 6 | 7 | from collections import Counter 8 | from typing import Any, Callable, Dict, Optional, Tuple 9 | 10 | import pandas as pd 11 | 12 | from .._detector_base import _TrainableMultivariateDetector 13 | from ..aggregator import AndAggregator 14 | from ..detector import InterQuartileRangeAD, ThresholdAD 15 | from ..pipe import Pipeline, Pipenet 16 | from ..transformer import ( 17 | CustomizedTransformer1D, 18 | PcaReconstructionError, 19 | RegressionResidual, 20 | ) 21 | 22 | 23 | class CustomizedDetectorHD(_TrainableMultivariateDetector): 24 | """Multivariate detector derived from a user-given function and parameters. 25 | 26 | Parameters 27 | ---------- 28 | detect_func: function 29 | A function detecting anomalies from multivariate time series. 30 | 31 | The first input argument must be a pandas DataFrame, optional input 32 | argument may be accepted through parameter `detect_func_params` and the 33 | output of `fit_func`, and the output must be a binary pandas Series 34 | with the same index as input. 35 | 36 | detect_func_params: dict, optional 37 | Parameters of `detect_func`. Default: None. 38 | 39 | fit_func: function, optional 40 | A function training parameters of `detect_func` with multivariate time 41 | series. 42 | 43 | The first input argument must be a pandas Series, optional input 44 | argument may be accepted through parameter `fit_func_params`, and the 45 | output must be a dict that can be used by `detect_func` as parameters. 
46 | Default: None. 47 | 48 | fit_func_params: dict, optional 49 | Parameters of `fit_func`. Default: None. 50 | 51 | """ 52 | 53 | def __init__( 54 | self, 55 | detect_func: Callable, 56 | detect_func_params: Optional[Dict[str, Any]] = None, 57 | fit_func: Optional[Callable] = None, 58 | fit_func_params: Optional[Dict[str, Any]] = None, 59 | ) -> None: 60 | self._fitted_detect_func_params = {} # type: Dict 61 | super().__init__() 62 | self.detect_func = detect_func 63 | self.detect_func_params = detect_func_params 64 | self.fit_func = fit_func 65 | self.fit_func_params = fit_func_params 66 | if self.fit_func is None: 67 | self._fitted = 1 68 | 69 | @property 70 | def _param_names(self) -> Tuple[str, ...]: 71 | return ( 72 | "detect_func", 73 | "detect_func_params", 74 | "fit_func", 75 | "fit_func_params", 76 | ) 77 | 78 | def _fit_core(self, df: pd.DataFrame) -> None: 79 | if self.fit_func is not None: 80 | if self.fit_func_params is not None: 81 | fit_func_params = self.fit_func_params 82 | else: 83 | fit_func_params = {} 84 | self._fitted_detect_func_params = self.fit_func( 85 | df, **fit_func_params 86 | ) 87 | 88 | def _predict_core(self, df: pd.DataFrame) -> pd.Series: 89 | if self.detect_func_params is not None: 90 | detect_func_params = self.detect_func_params 91 | else: 92 | detect_func_params = {} 93 | if self.fit_func is not None: 94 | return self.detect_func( 95 | df, **{**self._fitted_detect_func_params, **detect_func_params} 96 | ) 97 | else: 98 | return self.detect_func(df, **detect_func_params) 99 | 100 | 101 | class MinClusterDetector(_TrainableMultivariateDetector): 102 | """Detector that detects anomaly based on clustering of historical data. 103 | 104 | This detector performs clustering using a clustering model, and identifies 105 | a time point as anomalous if it belongs to the minimal cluster. 106 | 107 | Parameters 108 | ---------- 109 | model: object 110 | A clustering model to be used for clustering time series values. Same 111 | as a clustering model in scikit-learn, the model should minimally have 112 | a `fit` method and a `predict` method. The `predict` method should 113 | return an array of cluster labels. 114 | 115 | """ 116 | 117 | def __init__(self, model: Any) -> None: 118 | super().__init__() 119 | self.model = model 120 | 121 | @property 122 | def _param_names(self) -> Tuple[str, ...]: 123 | return ("model",) 124 | 125 | def _fit_core(self, df: pd.DataFrame) -> None: 126 | if df.dropna().empty: 127 | raise RuntimeError("Valid values are not enough for training.") 128 | clustering_result = self.model.fit_predict(df.dropna()) 129 | cluster_count = Counter(clustering_result) # type: Counter 130 | self._anomalous_cluster_id = cluster_count.most_common()[-1][0] 131 | 132 | def _predict_core(self, df: pd.DataFrame) -> pd.Series: 133 | cluster_id = pd.Series(float("nan"), index=df.index) 134 | if not df.dropna().empty: 135 | cluster_id.loc[df.dropna().index] = self.model.predict(df.dropna()) 136 | predicted = pd.Series( 137 | cluster_id == self._anomalous_cluster_id, index=df.index 138 | ) 139 | predicted[cluster_id.isna()] = float("nan") 140 | return predicted 141 | 142 | 143 | class OutlierDetector(_TrainableMultivariateDetector): 144 | """Detector that detects anomaly based on an outlier detection model. 145 | 146 | This detector performs time-independent outlier detection using the given model, 147 | and identifies a time point as anomalous if it is labelled as an outlier.
148 | 149 | Parameters 150 | ---------- 151 | model: object 152 | An outlier detection model to be used. Same as a outlier detection 153 | model in scikit-learn (e.g. EllipticEnvelope, IsolationForest, 154 | LocalOutlierFactor), the model should minimally have a `fit_predict` 155 | method, or `fit` and `predict` methods. The `fit_predict` or `predict` 156 | method should return an array of outlier indicators where outliers are 157 | marked by -1. 158 | 159 | """ 160 | 161 | def __init__(self, model: Any) -> None: 162 | super().__init__() 163 | self.model = model 164 | 165 | @property 166 | def _param_names(self) -> Tuple[str, ...]: 167 | return ("model",) 168 | 169 | def _fit_core(self, df: pd.DataFrame) -> None: 170 | if hasattr(self.model, "fit"): 171 | if df.dropna().empty: 172 | raise RuntimeError("Valid values are not enough for training.") 173 | self.model.fit(df.dropna()) 174 | 175 | def _predict_core(self, df: pd.DataFrame) -> pd.Series: 176 | is_outliers = pd.Series(float("nan"), index=df.index) 177 | if not df.dropna().empty: 178 | if hasattr(self.model, "predict"): 179 | is_outliers.loc[df.dropna().index] = ( 180 | self.model.predict(df.dropna()) == -1 181 | ) 182 | else: 183 | is_outliers.loc[df.dropna().index] = ( 184 | self.model.fit_predict(df.dropna()) == -1 185 | ) 186 | predicted = pd.Series(is_outliers == 1, index=df.index) 187 | predicted[is_outliers.isna()] = float("nan") 188 | return predicted 189 | 190 | 191 | # ============================================================================= 192 | # PLEASE PUT PIPE-DERIVED DETECTOR CLASSES BELOW THIS LINE 193 | # ============================================================================= 194 | 195 | 196 | class RegressionAD(_TrainableMultivariateDetector): 197 | """Detector that detects anomalous inter-series relationship. 198 | 199 | This detector performs regression to build relationship between a target 200 | series and the rest of series, and identifies a time point as anomalous 201 | when the residual of regression is anomalously large. 202 | 203 | This detector is internally implemented as a `Pipenet` object. Advanced 204 | users may learn more details by checking attribute `pipe_`. 205 | 206 | Parameters 207 | ---------- 208 | target: str 209 | Name of the column to be regarded as target variable. 210 | 211 | regressor: object 212 | Regressor to be used. Same as a scikit-learn regressor, it should 213 | minimally have `fit` and `predict` methods. 214 | 215 | c: float, optional 216 | Factor used to determine the bound of normal range based on historical 217 | interquartile range. Default: 3.0. 218 | 219 | side: str, optional 220 | - If "both", to detect anomalous positive and negative residuals; 221 | - If "positive", to only detect anomalous positive residuals; 222 | - If "negative", to only detect anomalous negative residuals. 223 | 224 | Default: "both". 225 | 226 | Attributes 227 | ---------- 228 | pipe_: adtk.pipe.Pipenet 229 | Internal pipenet object. 
230 | 231 | """ 232 | 233 | def __init__( 234 | self, regressor: Any, target: str, c: float = 3.0, side: str = "both" 235 | ) -> None: 236 | self.pipe_ = Pipenet( 237 | { 238 | "regression_residual": { 239 | "model": RegressionResidual( 240 | regressor=regressor, target=target 241 | ), 242 | "input": "original", 243 | }, 244 | "abs_residual": { 245 | "model": CustomizedTransformer1D(transform_func=abs), 246 | "input": "regression_residual", 247 | }, 248 | "iqr_ad": { 249 | "model": InterQuartileRangeAD((None, c)), 250 | "input": "abs_residual", 251 | }, 252 | "sign_check": { 253 | "model": ThresholdAD( 254 | high=( 255 | 0.0 256 | if side == "positive" 257 | else ( 258 | float("inf") 259 | if side == "negative" 260 | else -float("inf") 261 | ) 262 | ), 263 | low=( 264 | 0.0 265 | if side == "negative" 266 | else ( 267 | -float("inf") 268 | if side == "positive" 269 | else float("inf") 270 | ) 271 | ), 272 | ), 273 | "input": "regression_residual", 274 | }, 275 | "and": { 276 | "model": AndAggregator(), 277 | "input": ["iqr_ad", "sign_check"], 278 | }, 279 | } 280 | ) 281 | super().__init__() 282 | self.regressor = regressor 283 | self.target = target 284 | self.side = side 285 | self.c = c 286 | self._sync_params() 287 | 288 | @property 289 | def _param_names(self) -> Tuple[str, ...]: 290 | return ("regressor", "target", "c", "side") 291 | 292 | def _sync_params(self) -> None: 293 | if self.side not in ["both", "positive", "negative"]: 294 | raise ValueError( 295 | "Parameter `side` must be 'both', 'positive' or 'negative'." 296 | ) 297 | self.pipe_.steps["regression_residual"][ 298 | "model" 299 | ].regressor = self.regressor 300 | self.pipe_.steps["regression_residual"]["model"].set_params( 301 | target=self.target 302 | ) 303 | self.pipe_.steps["iqr_ad"]["model"].set_params(c=(None, self.c)) 304 | self.pipe_.steps["sign_check"]["model"].set_params( 305 | high=( 306 | 0.0 307 | if self.side == "positive" 308 | else ( 309 | float("inf") if self.side == "negative" else -float("inf") 310 | ) 311 | ), 312 | low=( 313 | 0.0 314 | if self.side == "negative" 315 | else ( 316 | -float("inf") if self.side == "positive" else float("inf") 317 | ) 318 | ), 319 | ) 320 | 321 | def _fit_core(self, s: pd.DataFrame) -> None: 322 | self._sync_params() 323 | self.pipe_.fit(s) 324 | 325 | def _predict_core(self, s: pd.DataFrame) -> pd.Series: 326 | self._sync_params() 327 | return self.pipe_.detect(s) 328 | 329 | 330 | class PcaAD(_TrainableMultivariateDetector): 331 | """Detector that detects outlier point with principal component analysis. 332 | 333 | This detector performs principal component analysis (PCA) to the 334 | multivariate time series (every time point is treated as a point in high- 335 | dimensional space), measures reconstruction error at every time point, and 336 | identifies a time point as anomalous when the recontruction error is beyond 337 | anomalously large. 338 | 339 | This detector is internally implemented as a `Pipeline` object. Advanced 340 | users may learn more details by checking attribute `pipe_`. 341 | 342 | Parameters 343 | ---------- 344 | k: int, optional 345 | Number of principal components to use. Default: 1. 346 | 347 | c: float, optional 348 | Factor used to determine the bound of normal range based on historical 349 | interquartile range. Default: 5.0. 350 | 351 | Attributes 352 | ---------- 353 | pipe_: adtk.pipe.Pipenet 354 | Internal pipenet object. 
355 | """ 356 | 357 | def __init__(self, k: int = 1, c: float = 5.0) -> None: 358 | self.pipe_ = Pipeline( 359 | [ 360 | ("pca_reconstruct_error", PcaReconstructionError(k=k)), 361 | ("ad", InterQuartileRangeAD(c=c)), 362 | ] 363 | ) 364 | super().__init__() 365 | self.k = k 366 | self.c = c 367 | self._sync_params() 368 | 369 | @property 370 | def _param_names(self) -> Tuple[str, ...]: 371 | return ("k", "c") 372 | 373 | def _sync_params(self) -> None: 374 | self.pipe_.steps[0][1].set_params(k=self.k) 375 | self.pipe_.steps[1][1].set_params(c=self.c) 376 | 377 | def _fit_core(self, s: pd.DataFrame) -> None: 378 | self._sync_params() 379 | self.pipe_.fit(s) 380 | 381 | def _predict_core(self, s: pd.DataFrame) -> pd.Series: 382 | self._sync_params() 383 | return self.pipe_.detect(s) 384 | -------------------------------------------------------------------------------- /src/adtk/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module of metrics that measure the quality of detection results against true 3 | anomalies. 4 | """ 5 | 6 | from ._metrics import f1_score, iou, precision, recall 7 | 8 | __all__ = ["recall", "precision", "f1_score", "iou"] 9 | -------------------------------------------------------------------------------- /src/adtk/pipe/__init__.py: -------------------------------------------------------------------------------- 1 | """Module of model pipeline and pipenet. 2 | 3 | Pipeline or Pipenet connects multiple components (transformers, detectors, 4 | and/or aggregators) into a model that may perform complex anomaly detection 5 | process. 6 | 7 | """ 8 | 9 | from ._pipe import Pipeline, Pipenet 10 | 11 | __all__ = ["Pipeline", "Pipenet"] 12 | -------------------------------------------------------------------------------- /src/adtk/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | """Module of transformers. 2 | 3 | A transformer transforms time series to extract useful information. 4 | 5 | """ 6 | from .._transformer_base import ( 7 | _NonTrainableMultivariateTransformer, 8 | _NonTrainableUnivariateTransformer, 9 | _TrainableMultivariateTransformer, 10 | _TrainableUnivariateTransformer, 11 | ) 12 | from .._utils import _get_all_subclasses_from_superclass 13 | from ._transformer_1d import ( 14 | ClassicSeasonalDecomposition, 15 | CustomizedTransformer1D, 16 | DoubleRollingAggregate, 17 | Retrospect, 18 | RollingAggregate, 19 | StandardScale, 20 | ) 21 | from ._transformer_hd import ( 22 | CustomizedTransformerHD, 23 | PcaProjection, 24 | PcaReconstruction, 25 | PcaReconstructionError, 26 | RegressionResidual, 27 | SumAll, 28 | ) 29 | 30 | 31 | def print_all_models() -> None: 32 | """ 33 | Print description of every model in this module. 
34 | """ 35 | model_desc = _get_all_subclasses_from_superclass( 36 | _NonTrainableUnivariateTransformer 37 | ) 38 | model_desc.update( 39 | _get_all_subclasses_from_superclass( 40 | _NonTrainableMultivariateTransformer 41 | ) 42 | ) 43 | model_desc.update( 44 | _get_all_subclasses_from_superclass(_TrainableUnivariateTransformer) 45 | ) 46 | model_desc.update( 47 | _get_all_subclasses_from_superclass(_TrainableMultivariateTransformer) 48 | ) 49 | for key, value in model_desc.items(): 50 | print("-" * 80) 51 | print(key) 52 | print(value) 53 | 54 | 55 | __all__ = [ 56 | "RollingAggregate", 57 | "DoubleRollingAggregate", 58 | "ClassicSeasonalDecomposition", 59 | "Retrospect", 60 | "StandardScale", 61 | "CustomizedTransformer1D", 62 | "RegressionResidual", 63 | "PcaProjection", 64 | "PcaReconstruction", 65 | "PcaReconstructionError", 66 | "SumAll", 67 | "CustomizedTransformerHD", 68 | "print_all_models", 69 | ] 70 | -------------------------------------------------------------------------------- /src/adtk/transformer/_transformer_hd.py: -------------------------------------------------------------------------------- 1 | """Module for high-dimensional transformers. 2 | 3 | High-dimensional transformers transform hight-dimensional time series, i.e. 4 | pandas DataFrame, into different series, to extract useful information out of 5 | the original time series. 6 | 7 | """ 8 | 9 | from typing import Any, Callable, Dict, Optional, Tuple, Union 10 | 11 | import pandas as pd 12 | from sklearn.decomposition import PCA 13 | 14 | from .._transformer_base import ( 15 | _NonTrainableMultivariateTransformer, 16 | _TrainableMultivariateTransformer, 17 | ) 18 | 19 | 20 | class CustomizedTransformerHD(_TrainableMultivariateTransformer): 21 | """Multivariate transformer derived from a user-given function and parameters. 22 | 23 | Parameters 24 | ---------- 25 | Parameters 26 | ---------- 27 | transform_func: function 28 | A function transforming multivariate time series. 29 | 30 | The first input argument must be a pandas DataFrame, optional input 31 | argument may be accepted through parameter `transform_func_params` and 32 | the output of `fit_func`, and the output must be a pandas Series or 33 | DataFrame with the same index as input. 34 | 35 | transform_func_params: dict, optional 36 | Parameters of `transform_func`. Default: None. 37 | 38 | fit_func: function, optional 39 | A function training parameters of `transform_func` with multivariate 40 | time series. 41 | 42 | The first input argument must be a pandas DataFrame, optional input 43 | argument may be accepted through parameter `fit_func_params`, and the 44 | output must be a dict that can be used by `transform_func` as 45 | parameters. Default: None. 46 | 47 | fit_func_params: dict, optional 48 | Parameters of `fit_func`. Default: None. 
49 | 50 | """ 51 | 52 | def __init__( 53 | self, 54 | transform_func: Callable, 55 | transform_func_params: Optional[Dict[str, Any]] = None, 56 | fit_func: Optional[Callable] = None, 57 | fit_func_params: Optional[Dict[str, Any]] = None, 58 | ) -> None: 59 | self._fitted_transform_func_params = {} # type: Dict 60 | super().__init__() 61 | self.transform_func = transform_func 62 | self.transform_func_params = transform_func_params 63 | self.fit_func = fit_func 64 | self.fit_func_params = fit_func_params 65 | if self.fit_func is None: 66 | self._fitted = 1 67 | 68 | @property 69 | def _param_names(self) -> Tuple[str, ...]: 70 | return ( 71 | "transform_func", 72 | "transform_func_params", 73 | "fit_func", 74 | "fit_func_params", 75 | ) 76 | 77 | def _fit_core(self, df: pd.DataFrame) -> None: 78 | if self.fit_func is not None: 79 | if self.fit_func_params is not None: 80 | fit_func_params = self.fit_func_params 81 | else: 82 | fit_func_params = {} 83 | self._fitted_transform_func_params = self.fit_func( 84 | df, **fit_func_params 85 | ) 86 | 87 | def _predict_core( 88 | self, df: pd.DataFrame 89 | ) -> Union[pd.Series, pd.DataFrame]: 90 | if self.transform_func_params is not None: 91 | transform_func_params = self.transform_func_params 92 | else: 93 | transform_func_params = {} 94 | if self.fit_func is not None: 95 | return self.transform_func( 96 | df, 97 | **{ 98 | **self._fitted_transform_func_params, 99 | **transform_func_params, 100 | } 101 | ) 102 | else: 103 | return self.transform_func(df, **transform_func_params) 104 | 105 | 106 | class SumAll(_NonTrainableMultivariateTransformer): 107 | """Transformer that returns the sum of all series as one series.""" 108 | 109 | def __init__(self) -> None: 110 | super().__init__() 111 | 112 | @property 113 | def _param_names(self) -> Tuple[str, ...]: 114 | return tuple() 115 | 116 | def _predict_core(self, df: pd.DataFrame) -> pd.Series: 117 | return df.sum(axis=1, skipna=False) 118 | 119 | 120 | class RegressionResidual(_TrainableMultivariateTransformer): 121 | """Transformer that performs regression to build a relationship between a 122 | target series and the rest of the series, and returns the regression residual 123 | series. 124 | 125 | Parameters 126 | ---------- 127 | regressor: object 128 | Regressor to be used. Same as a scikit-learn regressor, it should 129 | minimally have `fit` and `predict` methods. 130 | target: str 131 | Name of the column to be regarded as the target variable.
132 | 133 | """ 134 | 135 | def __init__(self, regressor: Any, target: str) -> None: 136 | super().__init__() 137 | self.regressor = regressor 138 | self.target = target 139 | 140 | @property 141 | def _param_names(self) -> Tuple[str, ...]: 142 | return ("regressor", "target") 143 | 144 | def _fit_core(self, df: pd.DataFrame) -> None: 145 | if self.target not in df.columns: 146 | raise RuntimeError( 147 | "Cannot find target series {} in input dataframe.".format( 148 | self.target 149 | ) 150 | ) 151 | self._target = self.target 152 | self._features = [col for col in df.columns if col != self._target] 153 | if df.dropna().empty: 154 | raise RuntimeError("Valid values are not enough for training.") 155 | self.regressor.fit( 156 | df.dropna().loc[:, self._features], 157 | df.dropna().loc[:, self._target], 158 | ) 159 | 160 | def _predict_core(self, df: pd.DataFrame) -> pd.Series: 161 | target = self._target 162 | features = self._features 163 | if target not in df.columns: 164 | raise RuntimeError( 165 | "Cannot find target series {} in input dataframe.".format( 166 | target 167 | ) 168 | ) 169 | if not set(features) <= set(df.columns): 170 | raise RuntimeError( 171 | "The following series are not found in input dataframe: {}.".format( 172 | set(features) - set(df.columns) 173 | ) 174 | ) 175 | residual = pd.Series(index=df.index, dtype="float64") 176 | if not df.dropna().empty: 177 | residual.loc[df.dropna().index] = df.dropna().loc[ 178 | :, target 179 | ] - self.regressor.predict(df.dropna().loc[:, features]) 180 | return residual 181 | 182 | 183 | class PcaProjection(_TrainableMultivariateTransformer): 184 | """Transformer that performs principal component analysis (PCA) to the 185 | multivariate time series (every time point is treated as a point in high- 186 | dimensional space), and represents those points with their projection on 187 | the first k principal components. 188 | 189 | Parameters 190 | ---------- 191 | k: int, optional 192 | Number of principal components to use. Default: 1. 193 | 194 | """ 195 | 196 | def __init__(self, k: int = 1) -> None: 197 | self._model = None # type: Any 198 | super().__init__() 199 | self.k = k 200 | 201 | @property 202 | def _param_names(self) -> Tuple[str, ...]: 203 | return ("k",) 204 | 205 | def _fit_core(self, df: pd.DataFrame) -> None: 206 | self._model = PCA(n_components=self.k) 207 | if df.dropna().empty: 208 | raise RuntimeError("Valid values are not enough for training.") 209 | self._model.fit(df.dropna().values) 210 | 211 | def _predict_core(self, df: pd.DataFrame) -> pd.DataFrame: 212 | if self.k > self._model.n_components: 213 | raise ValueError( 214 | "k is increased after previous fitting. Please fit again." 215 | ) 216 | results = pd.DataFrame( 217 | index=df.index, columns=["pc{}".format(i) for i in range(self.k)] 218 | ) 219 | if not df.dropna().empty: 220 | results.loc[df.dropna().index] = self._model.transform( 221 | df.dropna().values 222 | )[:, : self.k] 223 | return results 224 | 225 | 226 | class PcaReconstruction(_TrainableMultivariateTransformer): 227 | """Transformer that performs principal component analysis (PCA) to the 228 | multivariate time series (every time point is treated as a point in high- 229 | dimensional space), and reconstructs those points with the first k 230 | principal components. 231 | 232 | Parameters 233 | ---------- 234 | k: int, optional 235 | Number of principal components to use. Default: 1. 
236 | 237 | """ 238 | 239 | def __init__(self, k: int = 1) -> None: 240 | self._model = None # type: Any 241 | super().__init__() 242 | self.k = k 243 | 244 | @property 245 | def _param_names(self) -> Tuple[str, ...]: 246 | return ("k",) 247 | 248 | def _fit_core(self, df: pd.DataFrame) -> None: 249 | self._model = PCA(n_components=self.k) 250 | if df.dropna().empty: 251 | raise RuntimeError("Valid values are not enough for training.") 252 | self._model.fit(df.dropna().values) 253 | 254 | def _predict_core(self, df: pd.DataFrame) -> pd.DataFrame: 255 | if self._model is None: 256 | raise RuntimeError("Please fit the model first.") 257 | if self.k > self._model.n_components: 258 | raise ValueError( 259 | "k is increased after previous fitting. Please fit again." 260 | ) 261 | results = pd.DataFrame(columns=df.columns, index=df.index) 262 | if not df.dropna().empty: 263 | results.loc[df.dropna().index] = self._model.inverse_transform( 264 | self._model.transform(df.dropna().values) 265 | ) 266 | return results 267 | 268 | 269 | class PcaReconstructionError(_TrainableMultivariateTransformer): 270 | """Transformer that performs principal component analysis (PCA) to the 271 | multivariate time series (every time point is treated as a point in high- 272 | dimensional space), reconstructs those points with the first k principal 273 | components, and returns the reconstruction error (i.e. squared distance 274 | between the reconstructed point and the original point). 275 | 276 | Parameters 277 | ---------- 278 | k: int, optional 279 | Number of principal components to use. Default: 1. 280 | 281 | """ 282 | 283 | def __init__(self, k: int = 1) -> None: 284 | self._model = None # type: Any 285 | super().__init__() 286 | self.k = k 287 | 288 | @property 289 | def _param_names(self) -> Tuple[str, ...]: 290 | return ("k",) 291 | 292 | def _fit_core(self, df: pd.DataFrame) -> None: 293 | self._model = PCA(n_components=self.k) 294 | if df.dropna().empty: 295 | raise RuntimeError("Valid values are not enough for training.") 296 | self._model.fit(df.dropna().values) 297 | 298 | def _predict_core(self, df: pd.DataFrame) -> pd.Series: 299 | if self._model is None: 300 | raise RuntimeError("Please fit the model first.") 301 | if self.k > self._model.n_components: 302 | raise ValueError( 303 | "k is increased after previous fitting. Please fit again."
304 | ) 305 | results = pd.DataFrame(columns=df.columns, index=df.index) 306 | if not df.dropna().empty: 307 | results.loc[df.dropna().index] = self._model.inverse_transform( 308 | self._model.transform(df.dropna().values) 309 | ) 310 | return ((results - df) ** 2).sum(axis=1, skipna=False) 311 | -------------------------------------------------------------------------------- /src/adtk/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | """Module of visualization.""" 2 | 3 | from ._visualization import plot 4 | 5 | __all__ = ["plot"] 6 | -------------------------------------------------------------------------------- /tests/test_aggregators.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pandas import Timestamp 3 | 4 | import adtk.aggregator as aggt 5 | 6 | 7 | def test_or_dict_of_lists(): 8 | """ 9 | Test OrAggregator with input as a dict of lists of time stamps or time 10 | stamp 2-tuples 11 | """ 12 | lists = { 13 | "A": [ 14 | (Timestamp("2017-1-1"), Timestamp("2017-1-2")), 15 | (Timestamp("2017-1-5"), Timestamp("2017-1-8")), 16 | Timestamp("2017-1-10"), 17 | ], 18 | "B": [ 19 | Timestamp("2017-1-2"), 20 | (Timestamp("2017-1-3"), Timestamp("2017-1-6")), 21 | Timestamp("2017-1-8"), 22 | (Timestamp("2017-1-7"), Timestamp("2017-1-9")), 23 | (Timestamp("2017-1-11"), Timestamp("2017-1-11")), 24 | ], 25 | } 26 | assert aggt.OrAggregator().aggregate(lists) == [ 27 | (Timestamp("2017-01-01 00:00:00"), Timestamp("2017-01-02 00:00:00")), 28 | (Timestamp("2017-01-03 00:00:00"), Timestamp("2017-01-09 00:00:00")), 29 | Timestamp("2017-1-10"), 30 | Timestamp("2017-1-11"), 31 | ] 32 | 33 | lists = { 34 | "A": [ 35 | (Timestamp("2017-1-1"), Timestamp("2017-1-2")), 36 | (Timestamp("2017-1-5"), Timestamp("2017-1-8")), 37 | Timestamp("2017-1-10"), 38 | ], 39 | "B": [], 40 | } 41 | assert aggt.OrAggregator().aggregate(lists) == [ 42 | (Timestamp("2017-1-1"), Timestamp("2017-1-2")), 43 | (Timestamp("2017-1-5"), Timestamp("2017-1-8")), 44 | Timestamp("2017-1-10"), 45 | ] 46 | 47 | 48 | def test_or_df(): 49 | """ 50 | Test OrAggregator with input as a DataFrame 51 | """ 52 | df = pd.DataFrame( 53 | [[1, 1], [1, 0], [0, 1], [0, 0], [float("nan"), 1], [0, float("nan")]], 54 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 55 | ) 56 | pd.testing.assert_series_equal( 57 | aggt.OrAggregator().aggregate(df), 58 | pd.Series( 59 | [1, 1, 1, 0, 1, float("nan")], 60 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 61 | ), 62 | ) 63 | 64 | 65 | def test_or_dict_of_dfs(): 66 | """ 67 | Test OrAggregator with input as a dict of DataFrame 68 | """ 69 | df1 = pd.DataFrame( 70 | [[1, 1], [1, 0], [0, 1], [0, 0], [float("nan"), 1], [0, float("nan")]], 71 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 72 | ) 73 | df2 = pd.DataFrame( 74 | [[1, 1], [1, 0], [0, 1], [0, 0], [float("nan"), 1], [0, float("nan")]], 75 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 76 | ) 77 | pd.testing.assert_series_equal( 78 | aggt.OrAggregator().aggregate({"A": df1, "B": df2}), 79 | pd.Series( 80 | [1, 1, 1, 0, 1, float("nan")], 81 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 82 | ), 83 | ) 84 | 85 | 86 | def test_and_dict_of_lists(): 87 | """ 88 | Test AndAggregator with input as a dict of lists of time stamps or time 89 | stamp 2-tuples 90 | """ 91 | lists = { 92 | "A": [ 93 | (Timestamp("2017-1-1"), Timestamp("2017-1-2")), 94 | 
(Timestamp("2017-1-5"), Timestamp("2017-1-8")), 95 | Timestamp("2017-1-10"), 96 | ], 97 | "B": [ 98 | Timestamp("2017-1-2"), 99 | (Timestamp("2017-1-3"), Timestamp("2017-1-6")), 100 | Timestamp("2017-1-8"), 101 | (Timestamp("2017-1-7"), Timestamp("2017-1-9")), 102 | (Timestamp("2017-1-11"), Timestamp("2017-1-11")), 103 | ], 104 | } 105 | assert aggt.AndAggregator().aggregate(lists) == [ 106 | Timestamp("2017-1-2"), 107 | (Timestamp("2017-01-05 00:00:00"), Timestamp("2017-01-06 00:00:00")), 108 | (Timestamp("2017-1-7 00:00:00"), Timestamp("2017-1-8 00:00:00")), 109 | ] 110 | 111 | lists = { 112 | "A": [ 113 | (Timestamp("2017-1-1"), Timestamp("2017-1-2")), 114 | (Timestamp("2017-1-5"), Timestamp("2017-1-8")), 115 | Timestamp("2017-1-10"), 116 | ], 117 | "B": [], 118 | } 119 | assert aggt.AndAggregator().aggregate(lists) == [] 120 | 121 | 122 | def test_and_df(): 123 | """ 124 | Test AndAggregator with input as a DataFrame 125 | """ 126 | df = pd.DataFrame( 127 | [[1, 1], [1, 0], [0, 1], [0, 0], [float("nan"), 1], [0, float("nan")]], 128 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 129 | ) 130 | pd.testing.assert_series_equal( 131 | aggt.AndAggregator().aggregate(df), 132 | pd.Series( 133 | [1, 0, 0, 0, float("nan"), 0], 134 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 135 | ), 136 | ) 137 | 138 | 139 | def test_and_dict_of_dfs(): 140 | """ 141 | Test AndAggregator with input as a dict of DataFrame 142 | """ 143 | df1 = pd.DataFrame( 144 | [[1, 1], [1, 0], [0, 1], [0, 0], [float("nan"), 1], [0, float("nan")]], 145 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 146 | ) 147 | df2 = pd.DataFrame( 148 | [[1, 1], [1, 0], [0, 1], [0, 0], [float("nan"), 1], [0, float("nan")]], 149 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 150 | ) 151 | pd.testing.assert_series_equal( 152 | aggt.AndAggregator().aggregate({"A": df1, "B": df2}), 153 | pd.Series( 154 | [1, 0, 0, 0, float("nan"), 0], 155 | index=pd.date_range(start="2017-1-1", periods=6, freq="D"), 156 | ), 157 | ) 158 | 159 | 160 | def test_customized_aggregator(): 161 | """ 162 | Test customized aggregate 163 | """ 164 | 165 | def myAggFunc(df, agg="and"): 166 | if agg == "and": 167 | return df.all(axis=1) 168 | elif agg == "or": 169 | return df.any(axis=1) 170 | else: 171 | raise ValueError("`agg` must be either 'and' or 'or'.") 172 | 173 | model = aggt.CustomizedAggregator(myAggFunc) 174 | 175 | df = pd.DataFrame( 176 | [[1, 1], [1, 0], [0, 1], [0, 0]], 177 | index=pd.date_range(start="2017-1-1", periods=4, freq="D"), 178 | ) 179 | 180 | pd.testing.assert_series_equal( 181 | model.aggregate(df), 182 | pd.Series([True, False, False, False], index=df.index), 183 | ) 184 | 185 | model.aggregate_func_params = {"agg": "or"} 186 | pd.testing.assert_series_equal( 187 | model.aggregate(df), 188 | pd.Series([True, True, True, False], index=df.index), 189 | ) 190 | -------------------------------------------------------------------------------- /tests/test_attribute.py: -------------------------------------------------------------------------------- 1 | """Test read-only attributes""" 2 | import numpy as np 3 | import pandas as pd 4 | import pytest 5 | 6 | import adtk.detector as detector 7 | 8 | testCases = [ 9 | { 10 | "model": detector.QuantileAD(), 11 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 12 | "a": {"abs_low_": -float("inf"), "abs_high_": float("inf")}, 13 | }, 14 | { 15 | "model": detector.QuantileAD(low=0.1), 16 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 17 | "a": {"abs_low_": 1, 
"abs_high_": float("inf")}, 18 | }, 19 | { 20 | "model": detector.QuantileAD(high=0.9), 21 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 22 | "a": {"abs_low_": -float("inf"), "abs_high_": 9}, 23 | }, 24 | { 25 | "model": detector.QuantileAD(low=0.1, high=0.9), 26 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 27 | "a": {"abs_low_": 1, "abs_high_": 9}, 28 | }, 29 | { 30 | "model": detector.InterQuartileRangeAD(), 31 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 32 | "a": {"abs_low_": 2.5 - 15, "abs_high_": 7.5 + 15}, 33 | }, 34 | { 35 | "model": detector.InterQuartileRangeAD(c=2), 36 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 37 | "a": {"abs_low_": 2.5 - 10, "abs_high_": 7.5 + 10}, 38 | }, 39 | { 40 | "model": detector.InterQuartileRangeAD(c=(2, 4)), 41 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 42 | "a": {"abs_low_": 2.5 - 10, "abs_high_": 7.5 + 20}, 43 | }, 44 | { 45 | "model": detector.InterQuartileRangeAD(c=(2, None)), 46 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 47 | "a": {"abs_low_": 2.5 - 10, "abs_high_": float("inf")}, 48 | }, 49 | { 50 | "model": detector.InterQuartileRangeAD(c=(None, 4)), 51 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 52 | "a": {"abs_low_": -float("inf"), "abs_high_": 7.5 + 20}, 53 | }, 54 | { 55 | "model": detector.InterQuartileRangeAD(c=None), 56 | "s": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 57 | "a": {"abs_low_": -float("inf"), "abs_high_": float("inf")}, 58 | }, 59 | { 60 | "model": detector.SeasonalAD(freq=4), 61 | "s": [0, 1, 2, 1] * 10, 62 | "a": {"freq_": 4, "seasonal_": [0, 1, 2, 1]}, 63 | }, 64 | { 65 | "model": detector.SeasonalAD(freq=8), 66 | "s": [0, 1, 2, 1] * 10, 67 | "a": {"freq_": 8, "seasonal_": [0, 1, 2, 1, 0, 1, 2, 1]}, 68 | }, 69 | { 70 | "model": detector.SeasonalAD(), 71 | "s": [0, 1, 2, 1] * 10, 72 | "a": {"freq_": 4, "seasonal_": [0, 1, 2, 1]}, 73 | }, 74 | { 75 | "model": detector.SeasonalAD(trend=True), 76 | "s": np.array([0, 1, 2, 1] * 10) + np.arange(40) / 10, 77 | "a": {"freq_": 4, "seasonal_": [-1, 0, 1, 0]}, 78 | }, 79 | { 80 | "model": detector.SeasonalAD(trend=True, freq=8), 81 | "s": np.array([0, 1, 2, 1] * 10) + np.arange(40), 82 | "a": {"freq_": 8, "seasonal_": [-1, 0, 1, 0, -1, 0, 1, 0]}, 83 | }, 84 | ] 85 | 86 | 87 | @pytest.mark.parametrize("testCase", testCases) 88 | def test_attribute(testCase): 89 | """Test fit_detect the detector.""" 90 | s = pd.Series( 91 | testCase["s"], 92 | pd.date_range(start="2017-1-1", periods=len(testCase["s"]), freq="D"), 93 | ) 94 | model = testCase["model"] 95 | for key, value in testCase["a"].items(): 96 | with pytest.raises(AttributeError): 97 | getattr(model, key) 98 | model.fit(s) 99 | for key, value in testCase["a"].items(): 100 | if isinstance(value, list): 101 | pd.testing.assert_series_equal( 102 | getattr(model, key), 103 | pd.Series(value, index=s.index[: len(value)]), 104 | check_dtype=False, 105 | check_names=False, 106 | ) 107 | else: 108 | assert getattr(model, key) == value 109 | -------------------------------------------------------------------------------- /tests/test_data_validation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests on data 3 | """ 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | 8 | from adtk.data import validate_series 9 | 10 | rand = np.random.RandomState(123) 11 | 12 | regular_time_index = pd.date_range(start=0, periods=10, freq="1d") 13 | so = pd.Series(np.arange(10), index=regular_time_index, name="value") 14 | bo = pd.Series( 15 | [1, 0, 0, 0, 1, 1, 1, 0, 0, 0], index=regular_time_index, 
name="type1" 16 | ) 17 | bom = pd.concat([bo, (1 - bo).rename("type2")], axis=1) 18 | co = pd.Series( 19 | ["B", "A", "A", "A", np.nan, np.nan, np.nan, "B", "B", "B"], 20 | index=regular_time_index, 21 | ) 22 | coi = pd.get_dummies(co) 23 | con = pd.Series( 24 | ["B", "A", "A", "A", np.nan, np.nan, np.nan, "B", "B", "B"], 25 | index=regular_time_index, 26 | name="type3", 27 | ) 28 | coni = pd.get_dummies(con, prefix="type3", prefix_sep="_") 29 | 30 | test_targets = [ 31 | (so, so), 32 | (bo, bo), 33 | (bom, bom), 34 | (co, coi), 35 | (con, coni), 36 | (pd.concat([so, bom, con], axis=1), pd.concat([so, bom, coni], axis=1)), 37 | ] 38 | 39 | 40 | @pytest.mark.parametrize("x", test_targets) 41 | def test_series_regular(x): 42 | # regular Series 43 | s = x[0].copy() 44 | sv = validate_series(s, check_categorical=True) 45 | if isinstance(sv, pd.Series): 46 | pd.testing.assert_series_equal(sv, x[1], check_dtype=False) 47 | elif isinstance(sv, pd.DataFrame): 48 | pd.testing.assert_frame_equal(sv, x[1], check_dtype=False) 49 | else: 50 | raise TypeError("Must be pandas Series or DataFrame") 51 | # check if copy instead of view 52 | sc = s.copy() 53 | sv.iloc[0] == 1000 54 | if isinstance(s, pd.Series): 55 | pd.testing.assert_series_equal(s, sc, check_dtype=False) 56 | elif isinstance(s, pd.DataFrame): 57 | pd.testing.assert_frame_equal(s, sc, check_dtype=False) 58 | else: 59 | raise TypeError("Must be pandas Series or DataFrame") 60 | 61 | 62 | @pytest.mark.parametrize("x", test_targets) 63 | def test_series_unsorted(x): 64 | # unsorted Series 65 | s = x[0].copy() 66 | s = s.iloc[[9, 6, 7, 1, 0, 3, 4, 5, 8, 2]] 67 | sv = validate_series(s, check_categorical=True) 68 | if isinstance(sv, pd.Series): 69 | pd.testing.assert_series_equal(sv, x[1], check_dtype=False) 70 | elif isinstance(sv, pd.DataFrame): 71 | pd.testing.assert_frame_equal(sv, x[1], check_dtype=False) 72 | else: 73 | raise TypeError("Must be pandas Series or DataFrame") 74 | # check if copy instead of view 75 | sc = s.copy() 76 | sv.iloc[0] == 1000 77 | if isinstance(s, pd.Series): 78 | pd.testing.assert_series_equal(s, sc, check_dtype=False) 79 | elif isinstance(s, pd.DataFrame): 80 | pd.testing.assert_frame_equal(s, sc, check_dtype=False) 81 | else: 82 | raise TypeError("Must be pandas Series or DataFrame") 83 | 84 | 85 | @pytest.mark.parametrize("x", test_targets) 86 | def test_series_duplicated_timestamp(x): 87 | # Series with duplicated time stamps 88 | s = x[0].copy() 89 | s = s.iloc[[0, 1, 1, 1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9]] 90 | sv = validate_series(s, check_categorical=True) 91 | if isinstance(sv, pd.Series): 92 | pd.testing.assert_series_equal(sv, x[1], check_dtype=False) 93 | elif isinstance(sv, pd.DataFrame): 94 | pd.testing.assert_frame_equal(sv, x[1], check_dtype=False) 95 | else: 96 | raise TypeError("Must be pandas Series or DataFrame") 97 | # check if copy instead of view 98 | sc = s.copy() 99 | sv.iloc[0] == 1000 100 | if isinstance(s, pd.Series): 101 | pd.testing.assert_series_equal(s, sc, check_dtype=False) 102 | elif isinstance(s, pd.DataFrame): 103 | pd.testing.assert_frame_equal(s, sc, check_dtype=False) 104 | else: 105 | raise TypeError("Must be pandas Series or DataFrame") 106 | 107 | 108 | @pytest.mark.parametrize("x", test_targets) 109 | def test_series_missed_timestamp(x): 110 | # Series with missed time stamps 111 | s = x[0].copy() 112 | s = s.iloc[[0, 1, 3, 4, 5, 6, 7, 9]] 113 | ss = x[1].copy() 114 | ss = ss.iloc[[0, 1, 3, 4, 5, 6, 7, 9]] 115 | sv = validate_series(s, check_categorical=True) 116 | if 
isinstance(sv, pd.Series): 117 | pd.testing.assert_series_equal(sv, ss, check_dtype=False) 118 | elif isinstance(sv, pd.DataFrame): 119 | pd.testing.assert_frame_equal(sv, ss, check_dtype=False) 120 | else: 121 | raise TypeError("Must be pandas Series or DataFrame") 122 | # check if copy instead of view 123 | sc = s.copy() 124 | sv.iloc[0] == 1000 125 | if isinstance(s, pd.Series): 126 | pd.testing.assert_series_equal(s, sc, check_dtype=False) 127 | elif isinstance(s, pd.DataFrame): 128 | pd.testing.assert_frame_equal(s, sc, check_dtype=False) 129 | else: 130 | raise TypeError("Must be pandas Series or DataFrame") 131 | -------------------------------------------------------------------------------- /tests/test_detectorhd.py: -------------------------------------------------------------------------------- 1 | """Test HD detectors on some simple cases.""" 2 | from math import isnan 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | from sklearn.cluster import KMeans 8 | from sklearn.ensemble import IsolationForest 9 | from sklearn.linear_model import LinearRegression 10 | from sklearn.neighbors import LocalOutlierFactor 11 | 12 | import adtk.detector as detector 13 | from adtk._base import _TrainableModel 14 | 15 | nan = float("nan") 16 | 17 | testCases = [ 18 | { 19 | "model": detector.CustomizedDetectorHD, 20 | "params": {"detect_func": lambda x: x.sum(axis=1) > 0}, 21 | "df": [ 22 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 23 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 24 | ], 25 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 26 | }, 27 | { 28 | "model": detector.CustomizedDetectorHD, 29 | "params": { 30 | "detect_func": lambda x, a: x.sum(axis=1) > a, 31 | "detect_func_params": {"a": 0}, 32 | }, 33 | "df": [ 34 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 35 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 36 | ], 37 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 38 | }, 39 | { 40 | "model": detector.CustomizedDetectorHD, 41 | "params": { 42 | "detect_func": lambda x, a: x.sum(axis=1) > a, 43 | "fit_func": lambda x: {"a": x.sum(axis=1).median()}, 44 | }, 45 | "df": [ 46 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 47 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 48 | ], 49 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 50 | }, 51 | { 52 | "model": detector.CustomizedDetectorHD, 53 | "params": { 54 | "detect_func": lambda x, a: x.sum(axis=1) > a, 55 | "fit_func": lambda x, q: {"a": x.sum(axis=1).quantile(q)}, 56 | "fit_func_params": {"q": 0.5}, 57 | }, 58 | "df": [ 59 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 60 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 61 | ], 62 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 63 | }, 64 | { 65 | "model": detector.CustomizedDetectorHD, 66 | "params": { 67 | "detect_func": lambda x, a, b: (x.sum(axis=1) > a) 68 | | (x.sum(axis=1) < b), 69 | "detect_func_params": {"b": -0.5}, 70 | "fit_func": lambda x: {"a": x.sum(axis=1).median()}, 71 | }, 72 | "df": [ 73 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 74 | [0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0], 75 | ], 76 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0], 77 | }, 78 | { 79 | "model": detector.CustomizedDetectorHD, 80 | "params": { 81 | "detect_func": lambda x, a, b: (x.sum(axis=1) > a) 82 | | (x.sum(axis=1) < b), 83 | "detect_func_params": {"b": -0.5}, 84 | "fit_func": lambda x, q: {"a": x.sum(axis=1).quantile(q)}, 85 | "fit_func_params": {"q": 0.5}, 86 | }, 87 | "df": [ 88 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 89 | [0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0], 90 | ], 91 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0], 92 | }, 93 | { 94 | "model": detector.MinClusterDetector, 95 | "params": {"model": 
KMeans(n_clusters=2)}, 96 | "df": [[0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0]], 97 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0], 98 | }, 99 | { 100 | "model": detector.MinClusterDetector, 101 | "params": {"model": KMeans(n_clusters=2)}, 102 | "df": [ 103 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], 104 | [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0], 105 | ], 106 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0], 107 | }, 108 | { 109 | "model": detector.OutlierDetector, 110 | "params": { 111 | "model": LocalOutlierFactor(n_neighbors=1, contamination=0.1) 112 | }, 113 | "df": [[0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0]], 114 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0], 115 | }, 116 | { 117 | "model": detector.OutlierDetector, 118 | "params": { 119 | "model": LocalOutlierFactor(n_neighbors=1, contamination=0.1) 120 | }, 121 | "df": [ 122 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], 123 | [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0], 124 | ], 125 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0], 126 | }, 127 | { 128 | "model": detector.OutlierDetector, 129 | "params": { 130 | "model": IsolationForest(n_estimators=100, contamination=0.1) 131 | }, 132 | "df": [[0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0]], 133 | "a": [0, 0, 0, 0, 0, 1, 0, 0, 0, nan, 0, 0], 134 | }, 135 | { 136 | "model": detector.RegressionAD, 137 | "params": {"target": 2, "regressor": LinearRegression()}, 138 | "df": [ 139 | [0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9], 140 | [0, 2, 4, 6, 8, 10, 12, 14, 14, 16, 18], 141 | [0, 3, 6, 10, 12, 14, 18, 21, nan, 24, 27], 142 | ], 143 | "a": [0, 0, 0, 1, 0, 1, 0, 0, nan, 0, 0], 144 | }, 145 | { 146 | "model": detector.RegressionAD, 147 | "params": { 148 | "target": 2, 149 | "regressor": LinearRegression(), 150 | "side": "negative", 151 | }, 152 | "df": [ 153 | [0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9], 154 | [0, 2, 4, 6, 8, 10, 12, 14, 14, 16, 18], 155 | [0, 3, 6, 10, 12, 14, 18, 21, nan, 24, 27], 156 | ], 157 | "a": [0, 0, 0, 0, 0, 1, 0, 0, nan, 0, 0], 158 | }, 159 | { 160 | "model": detector.RegressionAD, 161 | "params": { 162 | "target": 2, 163 | "regressor": LinearRegression(), 164 | "side": "negative", 165 | "c": 100, 166 | }, 167 | "df": [ 168 | [0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9], 169 | [0, 2, 4, 6, 8, 10, 12, 14, 14, 16, 18], 170 | [0, 3, 6, 10, 12, 14, 18, 21, nan, 24, 27], 171 | ], 172 | "a": [0, 0, 0, 0, 0, 0, 0, 0, nan, 0, 0], 173 | }, 174 | { 175 | "model": detector.PcaAD, 176 | "params": {"k": 1, "c": 3}, 177 | "df": [ 178 | [0, 1, 2, 3, 3.9, 4.1, 5, 6, 7, 7, 8, 9], 179 | [0, 1, 2, 3, 4.1, 3.9, 5, 6, 7, nan, 8, 9], 180 | ], 181 | "a": [0, 0, 0, 0, 1, 1, 0, 0, 0, nan, 0, 0], 182 | }, 183 | ] 184 | 185 | 186 | @pytest.mark.parametrize("testCase", testCases) 187 | def test_fit_detect(testCase): 188 | """Test fit_detect the detector.""" 189 | df = pd.DataFrame( 190 | np.array(testCase["df"]).T, 191 | pd.date_range( 192 | start="2017-1-1", periods=len(testCase["df"][0]), freq="D" 193 | ), 194 | ) 195 | model = testCase["model"](**testCase["params"]) 196 | a_true = pd.Series(testCase["a"], index=df.index) 197 | if isinstance(model, _TrainableModel): 198 | a = model.fit_detect(df) 199 | else: 200 | a = model.detect(df) 201 | pd.testing.assert_series_equal(a, a_true, check_dtype=False) 202 | if a_true.sum() == 0: 203 | assert isnan(model.score(df, a_true, scoring="recall")) 204 | else: 205 | assert model.score(df, a_true, scoring="precision") == 1 206 | 207 | 208 | @pytest.mark.parametrize("testCase", testCases) 209 | def test_fit_and_detect(testCase): 210 | """Test fit the detector and then detect.""" 211 | df = pd.DataFrame( 212 | 
np.array(testCase["df"]).T, 213 | pd.date_range( 214 | start="2017-1-1", periods=len(testCase["df"][0]), freq="D" 215 | ), 216 | ) 217 | model = testCase["model"](**testCase["params"]) 218 | a_true = pd.Series(testCase["a"], index=df.index) 219 | if isinstance(model, _TrainableModel): 220 | model.fit(df) 221 | a = model.detect(df) 222 | pd.testing.assert_series_equal(a, a_true, check_dtype=False) 223 | if a_true.sum() == 0: 224 | assert isnan(model.score(df, a_true, scoring="f1")) 225 | else: 226 | assert model.score(df, a_true, scoring="iou") == 1 227 | 228 | 229 | @pytest.mark.parametrize("testCase", testCases) 230 | def test_series(testCase): 231 | """Test the detector on series.""" 232 | if len(testCase["df"]) == 1: 233 | s = pd.DataFrame( 234 | testCase["df"][0], 235 | pd.date_range( 236 | start="2017-1-1", periods=len(testCase["df"][0]), freq="D" 237 | ), 238 | ) 239 | model = testCase["model"](**testCase["params"]) 240 | a_true = pd.Series(testCase["a"], index=s.index) 241 | if isinstance(model, _TrainableModel): 242 | a = model.fit_detect(s) 243 | else: 244 | a = model.detect(s) 245 | pd.testing.assert_series_equal(a, a_true, check_dtype=False) 246 | -------------------------------------------------------------------------------- /tests/test_expand_events.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from adtk.data import expand_events 4 | 5 | event_list = [ 6 | pd.Timestamp("2017-1-1 20:04:00"), 7 | (pd.Timestamp("2017-1-1 20:00:00"), pd.Timestamp("2017-1-1 20:05:59")), 8 | (pd.Timestamp("2017-1-1 20:03:00"), pd.Timestamp("2017-1-1 20:08:59")), 9 | pd.Timestamp("2017-1-1 20:30:00"), 10 | pd.Timestamp("2017-1-1 21:00:00"), 11 | (pd.Timestamp("2017-1-1 21:05:00"), pd.Timestamp("2017-1-1 21:06:59")), 12 | pd.Timestamp("2017-1-1 21:03:00"), 13 | ] 14 | 15 | nan = float("nan") 16 | event_labels = pd.Series( 17 | [0, 0, 1, 1, nan, 0, 1, 0, nan, 0, 0, 1], 18 | index=pd.date_range(start="2017-1-1", periods=12, freq="D"), 19 | ) 20 | 21 | 22 | def test_expand_event_series_freq(): 23 | expanded_events = expand_events( 24 | event_labels, 25 | left_expand="1hour", 26 | right_expand="1hour", 27 | freq_as_period=True, 28 | ) 29 | true_expanded_events = pd.Series( 30 | [0, 1, 1, 1, 1, 1, 1, 1, nan, 0, 1, 1], 31 | index=pd.date_range(start="2017-1-1", periods=12, freq="D"), 32 | ) 33 | pd.testing.assert_series_equal( 34 | true_expanded_events, expanded_events, check_dtype=False 35 | ) 36 | 37 | 38 | def test_expand_event_series_no_freq(): 39 | expanded_events = expand_events( 40 | event_labels, 41 | left_expand="1hour", 42 | right_expand="1hour", 43 | freq_as_period=False, 44 | ) 45 | pd.testing.assert_series_equal( 46 | event_labels, expanded_events, check_dtype=False 47 | ) 48 | 49 | 50 | def test_expand_event_df_freq(): 51 | expanded_events = expand_events( 52 | pd.concat( 53 | [event_labels.rename("A"), event_labels.rename("B")], axis=1 54 | ), 55 | left_expand="1hour", 56 | right_expand="1hour", 57 | freq_as_period=True, 58 | ) 59 | true_expanded_events = pd.Series( 60 | [0, 1, 1, 1, 1, 1, 1, 1, nan, 0, 1, 1], 61 | index=pd.date_range(start="2017-1-1", periods=12, freq="D"), 62 | ) 63 | true_expanded_events = pd.concat( 64 | [true_expanded_events.rename("A"), true_expanded_events.rename("B")], 65 | axis=1, 66 | ) 67 | pd.testing.assert_frame_equal( 68 | true_expanded_events, expanded_events, check_dtype=False 69 | ) 70 | 71 | 72 | def test_expand_event_df_no_freq(): 73 | expanded_events = expand_events( 74 | pd.concat( 75 
| [event_labels.rename("A"), event_labels.rename("B")], axis=1 76 | ), 77 | left_expand="1hour", 78 | right_expand="1hour", 79 | freq_as_period=False, 80 | ) 81 | 82 | pd.testing.assert_frame_equal( 83 | pd.concat( 84 | [event_labels.rename("A"), event_labels.rename("B")], axis=1 85 | ), 86 | expanded_events, 87 | check_dtype=False, 88 | ) 89 | 90 | 91 | def test_expand_event_list(): 92 | expanded_events = expand_events( 93 | event_list, left_expand="1min", right_expand="3min" 94 | ) 95 | assert expanded_events == [ 96 | (pd.Timestamp("2017-1-1 19:59:00"), pd.Timestamp("2017-1-1 20:11:59")), 97 | (pd.Timestamp("2017-1-1 20:29:00"), pd.Timestamp("2017-1-1 20:33:00")), 98 | (pd.Timestamp("2017-1-1 20:59:00"), pd.Timestamp("2017-1-1 21:09:59")), 99 | ] 100 | 101 | 102 | def test_expand_event_dict(): 103 | expanded_events = expand_events( 104 | {"A": event_list, "B": event_list}, 105 | left_expand="1min", 106 | right_expand="3min", 107 | ) 108 | assert expanded_events == { 109 | "A": [ 110 | ( 111 | pd.Timestamp("2017-1-1 19:59:00"), 112 | pd.Timestamp("2017-1-1 20:11:59"), 113 | ), 114 | ( 115 | pd.Timestamp("2017-1-1 20:29:00"), 116 | pd.Timestamp("2017-1-1 20:33:00"), 117 | ), 118 | ( 119 | pd.Timestamp("2017-1-1 20:59:00"), 120 | pd.Timestamp("2017-1-1 21:09:59"), 121 | ), 122 | ], 123 | "B": [ 124 | ( 125 | pd.Timestamp("2017-1-1 19:59:00"), 126 | pd.Timestamp("2017-1-1 20:11:59"), 127 | ), 128 | ( 129 | pd.Timestamp("2017-1-1 20:29:00"), 130 | pd.Timestamp("2017-1-1 20:33:00"), 131 | ), 132 | ( 133 | pd.Timestamp("2017-1-1 20:59:00"), 134 | pd.Timestamp("2017-1-1 21:09:59"), 135 | ), 136 | ], 137 | } 138 | -------------------------------------------------------------------------------- /tests/test_few_shot_fit.py: -------------------------------------------------------------------------------- 1 | """Check model fitting with short series 2 | """ 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | 8 | from adtk.detector import ( 9 | AutoregressionAD, 10 | LevelShiftAD, 11 | PersistAD, 12 | VolatilityShiftAD, 13 | ) 14 | 15 | s = pd.Series( 16 | np.sin(np.arange(10)), 17 | index=pd.date_range(start="2017-1-1", periods=10, freq="D"), 18 | ) 19 | 20 | 21 | def test_persist_ad(): 22 | model = PersistAD(window=10) 23 | with pytest.raises(RuntimeError): 24 | model.fit(s) 25 | 26 | model = PersistAD(window=9) 27 | model.fit(s) 28 | 29 | 30 | def test_level_shift_ad(): 31 | model = LevelShiftAD(window=6) 32 | with pytest.raises(RuntimeError): 33 | model.fit(s) 34 | 35 | model = PersistAD(window=5) 36 | model.fit(s) 37 | 38 | 39 | def test_volatility_shift_ad(): 40 | model = VolatilityShiftAD(window=6) 41 | with pytest.raises(RuntimeError): 42 | model.fit(s) 43 | 44 | model = PersistAD(window=5) 45 | model.fit(s) 46 | 47 | 48 | def test_autoregression_ad(): 49 | model = AutoregressionAD(n_steps=3, step_size=4) 50 | with pytest.raises(RuntimeError): 51 | model.fit(s) 52 | 53 | model = AutoregressionAD(n_steps=3, step_size=3) 54 | model.fit(s) 55 | -------------------------------------------------------------------------------- /tests/test_few_shot_predict.py: -------------------------------------------------------------------------------- 1 | """Check model predicting with short series 2 | """ 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from adtk.detector import ( 8 | AutoregressionAD, 9 | LevelShiftAD, 10 | PersistAD, 11 | VolatilityShiftAD, 12 | ) 13 | 14 | s = pd.Series( 15 | np.sin(np.arange(100)), 16 | index=pd.date_range(start="2017-1-1", periods=100, 
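The few-shot fitting tests above pin down a simple contract: a rolling-window detector refuses to fit on a series that cannot fill its window. A hedged sketch of that behaviour with PersistAD, using the same ten-point sine series as the test fixture:

import numpy as np
import pandas as pd

from adtk.detector import PersistAD

s = pd.Series(
    np.sin(np.arange(10)),
    index=pd.date_range(start="2017-1-1", periods=10, freq="D"),
)

try:
    PersistAD(window=10).fit(s)  # window as long as the series: rejected
except RuntimeError as exc:
    print("fit failed:", exc)

PersistAD(window=9).fit(s)  # one point of headroom is enough to fit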
freq="D"), 17 | ) 18 | 19 | 20 | def test_persist_ad(): 21 | model = PersistAD(window=1) 22 | s_train = s.copy().iloc[:-10] 23 | model.fit(s_train) 24 | 25 | s_test = s.copy().iloc[-2:] 26 | s_test.iloc[-1] = 10 27 | pd.testing.assert_series_equal( 28 | model.predict(s_test), pd.Series([np.nan, 1.0], index=s_test.index) 29 | ) 30 | 31 | s_test = s.copy().iloc[-1:] 32 | s_test.iloc[-1] = 10 33 | pd.testing.assert_series_equal( 34 | model.predict(s_test), pd.Series([np.nan], index=s_test.index) 35 | ) 36 | 37 | model = PersistAD(window=5) 38 | s_train = s.copy().iloc[:-10] 39 | model.fit(s_train) 40 | 41 | s_test = s.copy().iloc[-5:] 42 | s_test.iloc[-1] = 10 43 | pd.testing.assert_series_equal( 44 | model.predict(s_test), pd.Series([np.nan] * 5, index=s_test.index) 45 | ) 46 | 47 | s_test = s.copy().iloc[-6:] 48 | s_test.iloc[-1] = 10 49 | pd.testing.assert_series_equal( 50 | model.predict(s_test), 51 | pd.Series([np.nan] * 5 + [1.0], index=s_test.index), 52 | ) 53 | 54 | 55 | def test_level_shift_ad(): 56 | model = LevelShiftAD(window=3) 57 | s_train = s.copy().iloc[:-10] 58 | model.fit(s_train) 59 | 60 | s_test = s.copy().iloc[-5:] 61 | s_test.iloc[-3:] = 10 62 | pd.testing.assert_series_equal( 63 | model.predict(s_test), pd.Series([np.nan] * 5, index=s_test.index) 64 | ) 65 | 66 | s_test = s.copy().iloc[-6:] 67 | s_test.iloc[-3:] = 10 68 | pd.testing.assert_series_equal( 69 | model.predict(s_test), 70 | pd.Series([np.nan] * 3 + [1.0] + [np.nan] * 2, index=s_test.index), 71 | ) 72 | 73 | 74 | def test_volatility_shift_ad(): 75 | model = VolatilityShiftAD(window=3) 76 | s_train = s.copy().iloc[:-10] 77 | model.fit(s_train) 78 | 79 | s_test = s.copy().iloc[-5:] 80 | s_test.iloc[-3:] *= 10 81 | pd.testing.assert_series_equal( 82 | model.predict(s_test), pd.Series([np.nan] * 5, index=s_test.index) 83 | ) 84 | 85 | s_test = s.copy().iloc[-6:] 86 | s_test.iloc[-3:] *= 10 87 | pd.testing.assert_series_equal( 88 | model.predict(s_test), 89 | pd.Series([np.nan] * 3 + [1.0] + [np.nan] * 2, index=s_test.index), 90 | ) 91 | 92 | 93 | def test_autoregression_ad(): 94 | model = AutoregressionAD(n_steps=3, step_size=7) 95 | s_train = s.copy().iloc[:-10] 96 | model.fit(s_train) 97 | 98 | s_test = s.copy().iloc[-21:] 99 | s_test.iloc[-1:] = 10 100 | pd.testing.assert_series_equal( 101 | model.predict(s_test), pd.Series([np.nan] * 21, index=s_test.index) 102 | ) 103 | 104 | s_test = s.copy().iloc[-22:] 105 | s_test.iloc[-1:] = 10 106 | pd.testing.assert_series_equal( 107 | model.predict(s_test), 108 | pd.Series([np.nan] * 21 + [1.0], index=s_test.index), 109 | ) 110 | -------------------------------------------------------------------------------- /tests/test_inconsistent_train_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test raising error when training and testing dataframes are inconsistent in 3 | multivariate trainable models. 
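The few-shot prediction tests above check the complementary contract at predict time: points without enough history to fill the window come back as NaN instead of raising. A hedged sketch with PersistAD(window=5), mirroring the fixtures in test_persist_ad:

import numpy as np
import pandas as pd

from adtk.detector import PersistAD

s = pd.Series(
    np.sin(np.arange(100)),
    index=pd.date_range(start="2017-1-1", periods=100, freq="D"),
)

model = PersistAD(window=5)
model.fit(s.iloc[:-10])

s_test = s.iloc[-6:].copy()
s_test.iloc[-1] = 10  # inject a spike at the last point
# The first five points cannot fill the window and stay NaN; the spike at
# the end is flagged as 1.0, as asserted in test_persist_ad above.
print(model.predict(s_test))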
4 | """ 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pytest 9 | from sklearn.cluster import KMeans 10 | from sklearn.linear_model import LinearRegression 11 | from sklearn.neighbors import LocalOutlierFactor 12 | 13 | import adtk.detector as detector 14 | import adtk.transformer as transformer 15 | 16 | models = [ 17 | detector.MinClusterDetector(KMeans(n_clusters=2)), 18 | detector.OutlierDetector( 19 | LocalOutlierFactor(n_neighbors=20, contamination=0.1) 20 | ), 21 | detector.RegressionAD(target="A", regressor=LinearRegression()), 22 | detector.PcaAD(), 23 | transformer.RegressionResidual(target="A", regressor=LinearRegression()), 24 | transformer.PcaReconstructionError(), 25 | transformer.PcaProjection(), 26 | transformer.PcaReconstruction(), 27 | ] 28 | 29 | df_train = pd.DataFrame( 30 | np.arange(40).reshape(20, 2), 31 | columns=["A", "B"], 32 | index=pd.date_range(start="2017-1-1", periods=20, freq="D"), 33 | ) 34 | 35 | df_test_ok = pd.DataFrame( 36 | np.arange(0, -60, -1).reshape(20, 3), 37 | columns=["C", "B", "A"], 38 | index=pd.date_range(start="2017-1-1", periods=20, freq="D"), 39 | ) 40 | 41 | df_test_not_ok = pd.DataFrame( 42 | np.arange(0, -60, -1).reshape(20, 3), 43 | columns=["C", "D", "A"], 44 | index=pd.date_range(start="2017-1-1", periods=20, freq="D"), 45 | ) 46 | 47 | 48 | @pytest.mark.parametrize("model", models) 49 | def test_inconsistent_train_test(model): 50 | model.fit(df_train) 51 | 52 | model.predict(df_test_ok) 53 | 54 | with pytest.raises( 55 | ValueError, 56 | match="The model was trained by a pandas DataFrame with columns", 57 | ): 58 | model.predict(df_test_not_ok) 59 | -------------------------------------------------------------------------------- /tests/test_metric.py: -------------------------------------------------------------------------------- 1 | from math import isnan 2 | 3 | import pandas as pd 4 | import pytest 5 | from pandas import Timestamp 6 | 7 | from adtk.metrics import f1_score, iou, precision, recall 8 | 9 | n = float("nan") 10 | 11 | s_true = pd.Series( 12 | [0, 0, 1, 1, 0, 1, 0, n, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, n, 0, 0, 1, 0, 0], 13 | pd.date_range(start=0, periods=24, freq="1d"), 14 | ) 15 | s_pred = pd.Series( 16 | [0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, n, 1, 1, n, 0, 1, 0, 1, 1], 17 | pd.date_range(start=0, periods=24, freq="1d"), 18 | ) 19 | s0 = pd.Series( 20 | [0, 0, 0, 0, 0, 0, 0, n, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, n, 0, 0, 0, 0, 0], 21 | pd.date_range(start=0, periods=24, freq="1d"), 22 | ) 23 | 24 | df_true = pd.concat([s_true, s_pred], axis=1).rename(columns={0: "A", 1: "B"}) 25 | df_pred = pd.concat([s_pred, s_true], axis=1).rename(columns={0: "A", 1: "B"}) 26 | df0 = pd.concat([s0, s0], axis=1).rename(columns={0: "A", 1: "B"}) 27 | 28 | 29 | l_true = [ 30 | (Timestamp("1970-01-03 00:00:00"), Timestamp("1970-01-04 00:00:00")), 31 | Timestamp("1970-01-06 00:00:00"), 32 | (Timestamp("1970-01-08 00:00:00"), Timestamp("1970-01-10 00:00:00")), 33 | Timestamp("1970-01-12 00:00:00"), 34 | (Timestamp("1970-01-14 00:00:00"), Timestamp("1970-01-18 00:00:00")), 35 | Timestamp("1970-01-22 00:00:00"), 36 | ] 37 | l_pred = [ 38 | (Timestamp("1970-01-02 00:00:00"), Timestamp("1970-01-07 00:00:00")), 39 | (Timestamp("1970-01-09 00:00:00"), Timestamp("1970-01-10 00:00:00")), 40 | Timestamp("1970-01-12 00:00:00"), 41 | Timestamp("1970-01-15 00:00:00"), 42 | (Timestamp("1970-01-17 00:00:00"), Timestamp("1970-01-19 00:00:00")), 43 | Timestamp("1970-01-21 00:00:00"), 44 | (Timestamp("1970-01-23 00:00:00"), 
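test_inconsistent_train_test.py above asserts that every multivariate trainable model remembers its training columns: extra columns at predict time are tolerated, but a missing training column raises a ValueError. A hedged sketch with PcaAD (the toy values are illustrative only):

import numpy as np
import pandas as pd

from adtk.detector import PcaAD

index = pd.date_range(start="2017-1-1", periods=20, freq="D")
df_train = pd.DataFrame(
    np.arange(40).reshape(20, 2), columns=["A", "B"], index=index
)
df_bad = pd.DataFrame(
    np.arange(60).reshape(20, 3), columns=["C", "D", "A"], index=index
)

model = PcaAD()
model.fit(df_train)
try:
    model.predict(df_bad)  # column "B" from training is missing
except ValueError as exc:
    print(exc)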
Timestamp("1970-01-24 00:00:00")), 45 | ] 46 | l0 = [] 47 | 48 | d_true = {"A": l_true, "B": l_pred} 49 | d_pred = {"A": l_pred, "B": l_true} 50 | d0 = {"A": l0, "B": l0} 51 | 52 | 53 | def test_metric_series(): 54 | assert recall(s_true, s_pred) == 9 / 12 55 | assert isnan(recall(s0, s_pred)) 56 | assert precision(s_true, s_pred) == 9 / 15 57 | assert f1_score(s_true, s_pred) == pytest.approx(2 / 3) 58 | assert isnan(f1_score(s0, s_pred)) 59 | assert isnan(f1_score(1 - s_pred, s_pred)) 60 | assert iou(s_true, s_pred) == 9 / 17 61 | assert iou(s_pred, s_true) == 9 / 17 62 | assert isnan(iou(s0, s0)) 63 | 64 | 65 | def test_metric_list(): 66 | assert recall(l_true, l_pred) == 4 / 6 67 | assert isnan(recall(l0, l_pred)) 68 | assert precision(l_true, l_pred) == 4 / 7 69 | assert recall(l_true, l_pred, thresh=1) == 3 / 6 70 | assert precision(l_true, l_pred, thresh=1) == 3 / 7 71 | assert iou(l_true, l_pred) == 3 / 13 72 | assert isnan(iou(l0, l0)) 73 | 74 | 75 | def test_metric_dataframe(): 76 | assert recall(df_true, df_pred) == {"A": 9 / 12, "B": 9 / 15} 77 | assert all([isnan(x) for x in recall(df0, df_pred).values()]) and ( 78 | recall(df0, df_pred).keys() == {"A": n, "B": n}.keys() 79 | ) 80 | assert precision(df_true, df_pred) == {"A": 9 / 15, "B": 9 / 12} 81 | assert f1_score(df_true, df_pred) == { 82 | "A": pytest.approx(2 / 3), 83 | "B": pytest.approx(2 / 3), 84 | } 85 | assert all([isnan(x) for x in f1_score(df0, df_pred).values()]) and ( 86 | f1_score(df0, df_pred).keys() == {"A": n, "B": n}.keys() 87 | ) 88 | assert iou(df_true, df_pred) == {"A": 9 / 17, "B": 9 / 17} 89 | assert all([isnan(x) for x in iou(df0, df0).values()]) and ( 90 | iou(df0, df0).keys() == {"A": n, "B": n}.keys() 91 | ) 92 | 93 | 94 | def test_metric_dict(): 95 | assert recall(d_true, d_pred) == {"A": 4 / 6, "B": 4 / 7} 96 | assert all([isnan(x) for x in recall(d0, d_pred).values()]) and ( 97 | recall(d0, d_pred).keys() == {"A": n, "B": n}.keys() 98 | ) 99 | assert precision(d_true, d_pred) == {"A": 4 / 7, "B": 4 / 6} 100 | assert f1_score(d_true, d_pred) == { 101 | "A": pytest.approx(2 * 4 / 7 * 4 / 6 / (4 / 7 + 4 / 6)), 102 | "B": pytest.approx(2 * 4 / 7 * 4 / 6 / (4 / 7 + 4 / 6)), 103 | } 104 | assert recall(d_true, d_pred, thresh=1) == {"A": 3 / 6, "B": 3 / 7} 105 | assert precision(d_true, d_pred, thresh=1) == {"A": 3 / 7, "B": 3 / 6} 106 | assert iou(d_true, d_pred) == {"A": 3 / 13, "B": 3 / 13} 107 | -------------------------------------------------------------------------------- /tests/test_print_subclasses.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import adtk.aggregator as aggt 4 | import adtk.detector as detector 5 | import adtk.transformer as transformer 6 | 7 | 8 | def test_print_subclasses(): 9 | """ 10 | get `print_all_models` method for every module 11 | """ 12 | _ = aggt.print_all_models() 13 | _ = detector.print_all_models() 14 | _ = transformer.print_all_models() 15 | -------------------------------------------------------------------------------- /tests/test_series_name.py: -------------------------------------------------------------------------------- 1 | "Check if the series name or column name is correctly kept." 
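The metric tests above compare recall, precision, f1_score and iou across the series, list, DataFrame and dict label formats. A hedged sketch of the Series form on a tiny hand-made pair of label series (values chosen only for illustration):

import pandas as pd

from adtk.metrics import f1_score, iou, precision, recall

index = pd.date_range(start="2017-1-1", periods=6, freq="D")
y_true = pd.Series([0, 1, 1, 0, 0, 1], index=index)
y_pred = pd.Series([0, 1, 0, 0, 1, 1], index=index)

print(recall(y_true, y_pred))     # 2 of the 3 true anomalies are caught
print(precision(y_true, y_pred))  # 2 of the 3 detected points are real
print(f1_score(y_true, y_pred))   # harmonic mean of the two above
print(iou(y_true, y_pred))        # overlapping time over union of time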
2 | 3 | import sys 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pytest 8 | from sklearn.cluster import KMeans 9 | from sklearn.linear_model import LinearRegression 10 | from sklearn.neighbors import LocalOutlierFactor 11 | 12 | import adtk.detector as detector 13 | import adtk.transformer as transformer 14 | from adtk._base import _TrainableModel 15 | from adtk._detector_base import ( # _NonTrainableMultivariateDetector, 16 | _NonTrainableUnivariateDetector, 17 | _TrainableMultivariateDetector, 18 | _TrainableUnivariateDetector, 19 | ) 20 | 21 | _Detector = ( 22 | _NonTrainableUnivariateDetector, 23 | # _NonTrainableMultivariateDetector, 24 | _TrainableUnivariateDetector, 25 | _TrainableMultivariateDetector, 26 | ) 27 | 28 | # We have 4 types of models 29 | # - one-to-one: input a univariate series, output a univariate series 30 | # - one-to-many: input a univariate series, output a multivariate series 31 | # - many-to-one: input a multivariate series, output a univariate series 32 | # - many-to-many: input a multivariate series, output a multivariate series 33 | 34 | one2one_models = [ 35 | detector.ThresholdAD(), 36 | detector.QuantileAD(), 37 | detector.InterQuartileRangeAD(), 38 | detector.GeneralizedESDTestAD(), 39 | detector.PersistAD(window=10), 40 | detector.LevelShiftAD(window=10), 41 | detector.VolatilityShiftAD(window=10), 42 | detector.AutoregressionAD(), 43 | detector.SeasonalAD(freq=2), 44 | transformer.RollingAggregate(window=10, agg="median"), 45 | transformer.RollingAggregate( 46 | window=10, agg="quantile", agg_params={"q": 0.5} 47 | ), 48 | transformer.DoubleRollingAggregate(window=10, agg="median"), 49 | transformer.DoubleRollingAggregate( 50 | window=10, agg="quantile", agg_params={"q": [0.1, 0.5, 0.9]} 51 | ), 52 | transformer.DoubleRollingAggregate( 53 | window=10, agg="hist", agg_params={"bins": [30, 50, 70]} 54 | ), 55 | transformer.StandardScale(), 56 | transformer.ClassicSeasonalDecomposition(freq=2), 57 | ] 58 | 59 | one2many_models = [ 60 | transformer.RollingAggregate( 61 | window=10, agg="quantile", agg_params={"q": [0.1, 0.5, 0.9]} 62 | ), 63 | transformer.RollingAggregate( 64 | window=10, agg="hist", agg_params={"bins": [20, 50, 80]} 65 | ), 66 | transformer.Retrospect(n_steps=3), 67 | ] 68 | 69 | many2one_models = [ 70 | detector.MinClusterDetector(KMeans(n_clusters=2)), 71 | detector.OutlierDetector( 72 | LocalOutlierFactor(n_neighbors=20, contamination=0.1) 73 | ), 74 | detector.RegressionAD(target="A", regressor=LinearRegression()), 75 | detector.PcaAD(), 76 | transformer.SumAll(), 77 | transformer.RegressionResidual(target="A", regressor=LinearRegression()), 78 | transformer.PcaReconstructionError(), 79 | ] 80 | 81 | 82 | @pytest.mark.parametrize("model", one2one_models) 83 | def test_one2one_s2s_w_name(model): 84 | """ 85 | if a one-to-one model is applied to a Series, it should keep the Series 86 | name unchanged 87 | """ 88 | s_name = pd.Series( 89 | np.arange(100), 90 | index=pd.date_range(start="2017-1-1", periods=100, freq="D"), 91 | name="A", 92 | ) 93 | if isinstance(model, _TrainableModel): 94 | result = model.fit_predict(s_name) 95 | else: 96 | result = model.predict(s_name) 97 | assert result.name == "A" 98 | 99 | 100 | @pytest.mark.parametrize("model", one2one_models) 101 | def test_one2one_s2s_wo_name(model): 102 | """ 103 | if a one-to-one model is applied to a Series, it should keep the Series 104 | name unchanged 105 | """ 106 | s_no_name = pd.Series( 107 | np.arange(100), 108 | index=pd.date_range(start="2017-1-1", 
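test_series_name.py groups the models by input/output arity and then checks a naming contract for each group. For the one-to-one group the rule is simply that the Series name survives; a hedged sketch of that contract with ThresholdAD and default parameters:

import numpy as np
import pandas as pd

from adtk.detector import ThresholdAD

s = pd.Series(
    np.arange(100),
    index=pd.date_range(start="2017-1-1", periods=100, freq="D"),
    name="A",
)

result = ThresholdAD().detect(s)  # default thresholds; only the name matters here
print(result.name)  # "A", unchanged by the detector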
periods=100, freq="D"), 109 | ) 110 | if isinstance(model, _TrainableModel): 111 | result = model.fit_predict(s_no_name) 112 | else: 113 | result = model.predict(s_no_name) 114 | assert result.name is None 115 | 116 | 117 | @pytest.mark.parametrize("model", one2one_models) 118 | def test_one2one_df2df(model): 119 | """ 120 | if a one-to-one model is applied to a DataFrame, it should keep the column 121 | names unchanged 122 | """ 123 | df = pd.DataFrame( 124 | np.arange(300).reshape(100, 3), 125 | index=pd.date_range(start="2017-1-1", periods=100, freq="D"), 126 | columns=["A", "B", "C"], 127 | ) 128 | if isinstance(model, _TrainableModel): 129 | result = model.fit_predict(df) 130 | else: 131 | result = model.predict(df) 132 | assert list(result.columns) == ["A", "B", "C"] 133 | 134 | 135 | @pytest.mark.parametrize("model", one2one_models) 136 | def test_one2one_df2list(model): 137 | """ 138 | if a one-to-one model (detector) is applied to a DataFrame and returns a 139 | dict, the output dict keys should match the input column names 140 | """ 141 | if isinstance(model, _Detector): 142 | df = pd.DataFrame( 143 | np.arange(300).reshape(100, 3), 144 | index=pd.date_range(start="2017-1-1", periods=100, freq="D"), 145 | columns=["A", "B", "C"], 146 | ) 147 | if isinstance(model, _TrainableModel): 148 | result = model.fit_detect(df, return_list=True) 149 | else: 150 | result = model.detect(df, return_list=True) 151 | if sys.version_info[1] >= 6: 152 | assert list(result.keys()) == ["A", "B", "C"] 153 | else: 154 | assert set(result.keys()) == {"A", "B", "C"} 155 | 156 | 157 | @pytest.mark.parametrize("model", one2many_models) 158 | def test_one2many_s2df_w_name(model): 159 | """ 160 | if a one-to-many model is applied to a Series, the output should not have 161 | prefix in column names, no matter whether the input Series has a name. 162 | """ 163 | s_name = pd.Series( 164 | np.arange(100), 165 | index=pd.date_range(start="2017-1-1", periods=100, freq="D"), 166 | name="A", 167 | ) 168 | if isinstance(model, _TrainableModel): 169 | result = model.fit_predict(s_name) 170 | else: 171 | result = model.predict(s_name) 172 | assert all([col[:2] != "A_" for col in result.columns]) 173 | 174 | 175 | @pytest.mark.parametrize("model", one2many_models) 176 | def test_one2many_s2df_wo_name(model): 177 | """ 178 | if a one-to-many model is applied to a Series, the output should not have 179 | prefix in column names, no matter whether the input Series has a name. 180 | """ 181 | s_no_name = pd.Series( 182 | np.arange(100), 183 | index=pd.date_range(start="2017-1-1", periods=100, freq="D"), 184 | ) 185 | if isinstance(model, _TrainableModel): 186 | result = model.fit_predict(s_no_name) 187 | else: 188 | result = model.predict(s_no_name) 189 | assert all([col[:2] != "A_" for col in result.columns]) 190 | 191 | 192 | @pytest.mark.parametrize("model", one2many_models) 193 | def test_one2many_df2df(model): 194 | """ 195 | if a one-to-many model is applied to a DataFrame, the output should have 196 | prefix in column names to indicate the input columns they correspond. 
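For the one-to-many group the tests require a prefix only when the input is a DataFrame: each output column is prefixed with the input column it was derived from. A hedged sketch with RollingAggregate; the sketch assumes, as the one-to-many grouping above suggests, that it can be applied without a prior fit:

import numpy as np
import pandas as pd

from adtk.transformer import RollingAggregate

df = pd.DataFrame(
    np.arange(60).reshape(20, 3),
    index=pd.date_range(start="2017-1-1", periods=20, freq="D"),
    columns=["A", "B", "C"],
)

t = RollingAggregate(
    window=5, agg="quantile", agg_params={"q": [0.25, 0.75]}
).transform(df)
# Output columns are grouped and prefixed by input column, e.g. "A_...",
# then "B_...", then "C_...", which is what test_one2many_df2df asserts.
print(list(t.columns))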
197 | """ 198 | df = pd.DataFrame( 199 | np.arange(300).reshape(100, 3), 200 | index=pd.date_range(start="2017-1-1", periods=100, freq="D"), 201 | columns=["A", "B", "C"], 202 | ) 203 | if isinstance(model, _TrainableModel): 204 | result = model.fit_predict(df) 205 | else: 206 | result = model.predict(df) 207 | n_cols = round(len(result.columns) / 3) 208 | assert all([col[:2] == "A_" for col in result.columns[:n_cols]]) 209 | assert all([col[2:4] != "A_" for col in result.columns[:n_cols]]) 210 | assert all( 211 | [col[:2] == "B_" for col in result.columns[n_cols : 2 * n_cols]] 212 | ) 213 | assert all( 214 | [col[2:4] != "B_" for col in result.columns[n_cols : 2 * n_cols]] 215 | ) 216 | assert all([col[:2] == "C_" for col in result.columns[2 * n_cols :]]) 217 | assert all([col[2:4] != "C_" for col in result.columns[2 * n_cols :]]) 218 | 219 | 220 | @pytest.mark.parametrize("model", many2one_models) 221 | def test_many2one(model): 222 | """ 223 | The output Series from a many-to-one model should NOT have name 224 | """ 225 | df = pd.DataFrame( 226 | np.arange(300).reshape(100, 3), 227 | index=pd.date_range(start="2017-1-1", periods=100, freq="D"), 228 | columns=["A", "B", "C"], 229 | ) 230 | if isinstance(model, _TrainableModel): 231 | result = model.fit_predict(df) 232 | else: 233 | result = model.predict(df) 234 | assert result.name is None 235 | 236 | 237 | def test_pca_reconstruction(): 238 | df = pd.DataFrame( 239 | np.arange(300).reshape(100, 3), 240 | index=pd.date_range(start="2017-1-1", periods=100, freq="D"), 241 | columns=["A", "B", "C"], 242 | ) 243 | result = transformer.PcaReconstruction(k=2).fit_predict(df) 244 | assert list(result.columns) == ["A", "B", "C"] 245 | -------------------------------------------------------------------------------- /tests/test_train_test_split.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unit tests on train-test split 3 | """ 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from adtk.data import split_train_test 8 | 9 | 10 | def test_split_series(): 11 | """ 12 | test all modes on a naive list of from 0 to 99 13 | """ 14 | s = pd.Series(range(100)) 15 | 16 | splits = split_train_test(s, mode=1, n_splits=4, train_ratio=0.8) 17 | ts_train, ts_test = zip(*splits) 18 | assert all( 19 | x.equals(y) 20 | for x, y in zip( 21 | ts_train, 22 | [s.iloc[:20], s.iloc[25:45], s.iloc[50:70], s.iloc[75:95]], 23 | ) 24 | ) 25 | assert all( 26 | x.equals(y) 27 | for x, y in zip( 28 | ts_test, [s.iloc[20:25], s.iloc[45:50], s.iloc[70:75], s.iloc[95:]] 29 | ) 30 | ) 31 | 32 | splits = split_train_test(s, mode=2, n_splits=4, train_ratio=0.8) 33 | ts_train, ts_test = zip(*splits) 34 | assert all( 35 | x.equals(y) 36 | for x, y in zip( 37 | ts_train, [s.iloc[:20], s.iloc[:40], s.iloc[:60], s.iloc[:80]] 38 | ) 39 | ) 40 | assert all( 41 | x.equals(y) 42 | for x, y in zip( 43 | ts_test, [s.iloc[20:25], s.iloc[40:50], s.iloc[60:75], s.iloc[80:]] 44 | ) 45 | ) 46 | 47 | splits = split_train_test(s, mode=3, n_splits=4, train_ratio=0.8) 48 | ts_train, ts_test = zip(*splits) 49 | assert all( 50 | x.equals(y) 51 | for x, y in zip( 52 | ts_train, [s.iloc[:20], s.iloc[:40], s.iloc[:60], s.iloc[:80]] 53 | ) 54 | ) 55 | assert all( 56 | x.equals(y) 57 | for x, y in zip( 58 | ts_test, [s.iloc[20:40], s.iloc[40:60], s.iloc[60:80], s.iloc[80:]] 59 | ) 60 | ) 61 | 62 | splits = split_train_test(s, mode=4, n_splits=4, train_ratio=0.8) 63 | ts_train, ts_test = zip(*splits) 64 | assert all( 65 | x.equals(y) 66 | for x, y in zip( 
67 | ts_train, [s.iloc[:20], s.iloc[:40], s.iloc[:60], s.iloc[:80]] 68 | ) 69 | ) 70 | assert all( 71 | x.equals(y) 72 | for x, y in zip( 73 | ts_test, [s.iloc[20:], s.iloc[40:], s.iloc[60:], s.iloc[80:]] 74 | ) 75 | ) 76 | 77 | 78 | def test_split_dataframe(): 79 | """ 80 | test all modes on a naive df of from 0 to 99 81 | """ 82 | s = pd.Series(range(100)) 83 | df = pd.DataFrame({"A": s, "B": s}) 84 | 85 | splits = split_train_test(df, mode=1, n_splits=4, train_ratio=0.8) 86 | ts_train, ts_test = zip(*splits) 87 | assert all( 88 | np.array_equal(x.values, y.values) 89 | for x, y in zip( 90 | ts_train, 91 | [df.iloc[:20], df.iloc[25:45], df.iloc[50:70], df.iloc[75:95]], 92 | ) 93 | ) 94 | assert all( 95 | np.array_equal(x.values, y.values) 96 | for x, y in zip( 97 | ts_test, 98 | [df.iloc[20:25], df.iloc[45:50], df.iloc[70:75], df.iloc[95:]], 99 | ) 100 | ) 101 | 102 | splits = split_train_test(df, mode=2, n_splits=4, train_ratio=0.8) 103 | ts_train, ts_test = zip(*splits) 104 | assert all( 105 | np.array_equal(x.values, y.values) 106 | for x, y in zip( 107 | ts_train, [df.iloc[:20], df.iloc[:40], df.iloc[:60], df.iloc[:80]] 108 | ) 109 | ) 110 | assert all( 111 | np.array_equal(x.values, y.values) 112 | for x, y in zip( 113 | ts_test, 114 | [df.iloc[20:25], df.iloc[40:50], df.iloc[60:75], df.iloc[80:]], 115 | ) 116 | ) 117 | 118 | splits = split_train_test(df, mode=3, n_splits=4, train_ratio=0.8) 119 | ts_train, ts_test = zip(*splits) 120 | assert all( 121 | np.array_equal(x.values, y.values) 122 | for x, y in zip( 123 | ts_train, [df.iloc[:20], df.iloc[:40], df.iloc[:60], df.iloc[:80]] 124 | ) 125 | ) 126 | assert all( 127 | np.array_equal(x.values, y.values) 128 | for x, y in zip( 129 | ts_test, 130 | [df.iloc[20:40], df.iloc[40:60], df.iloc[60:80], df.iloc[80:]], 131 | ) 132 | ) 133 | 134 | splits = split_train_test(df, mode=4, n_splits=4, train_ratio=0.8) 135 | ts_train, ts_test = zip(*splits) 136 | assert all( 137 | np.array_equal(x.values, y.values) 138 | for x, y in zip( 139 | ts_train, [df.iloc[:20], df.iloc[:40], df.iloc[:60], df.iloc[:80]] 140 | ) 141 | ) 142 | assert all( 143 | np.array_equal(x.values, y.values) 144 | for x, y in zip( 145 | ts_test, [df.iloc[20:], df.iloc[40:], df.iloc[60:], df.iloc[80:]] 146 | ) 147 | ) 148 | -------------------------------------------------------------------------------- /tests/test_transformerhd.py: -------------------------------------------------------------------------------- 1 | """Test HD transformers.""" 2 | import numpy as np 3 | import pandas as pd 4 | import pytest 5 | from sklearn.linear_model import LinearRegression 6 | 7 | import adtk.transformer as transformer 8 | from adtk._base import _TrainableModel 9 | 10 | nan = float("nan") 11 | 12 | testCases = [ 13 | { 14 | "model": transformer.CustomizedTransformerHD, 15 | "params": {"transform_func": lambda x: x.sum(axis=1) > 0}, 16 | "df": [ 17 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 18 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 19 | ], 20 | "t": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 21 | }, 22 | { 23 | "model": transformer.CustomizedTransformerHD, 24 | "params": { 25 | "transform_func": lambda x, a: x.sum(axis=1) > a, 26 | "transform_func_params": {"a": 0}, 27 | }, 28 | "df": [ 29 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 30 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 31 | ], 32 | "t": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 33 | }, 34 | { 35 | "model": transformer.CustomizedTransformerHD, 36 | "params": { 37 | "transform_func": lambda x, a: x.sum(axis=1) > a, 38 | "fit_func": lambda x: {"a": 
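test_train_test_split.py above walks split_train_test through its four modes on a 100-point series. A hedged sketch of mode 2, where the training window grows from the start of the series and the slice that follows it is held out for testing:

import pandas as pd

from adtk.data import split_train_test

s = pd.Series(range(100))

splits = split_train_test(s, mode=2, n_splits=4, train_ratio=0.8)
for train, test in splits:
    # With n_splits=4 and train_ratio=0.8 the tests above expect
    # (20, 5), (40, 10), (60, 15) and (80, 20) points respectively.
    print(len(train), len(test))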
x.sum(axis=1).median()}, 39 | }, 40 | "df": [ 41 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 42 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 43 | ], 44 | "t": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 45 | }, 46 | { 47 | "model": transformer.CustomizedTransformerHD, 48 | "params": { 49 | "transform_func": lambda x, a: x.sum(axis=1) > a, 50 | "fit_func": lambda x, q: {"a": x.sum(axis=1).quantile(q)}, 51 | "fit_func_params": {"q": 0.5}, 52 | }, 53 | "df": [ 54 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 55 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 56 | ], 57 | "t": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 58 | }, 59 | { 60 | "model": transformer.CustomizedTransformerHD, 61 | "params": { 62 | "transform_func": lambda x, a, b: (x.sum(axis=1) > a) 63 | | (x.sum(axis=1) < b), 64 | "transform_func_params": {"b": -0.5}, 65 | "fit_func": lambda x: {"a": x.sum(axis=1).median()}, 66 | }, 67 | "df": [ 68 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 69 | [0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0], 70 | ], 71 | "t": [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0], 72 | }, 73 | { 74 | "model": transformer.CustomizedTransformerHD, 75 | "params": { 76 | "transform_func": lambda x, a, b: (x.sum(axis=1) > a) 77 | | (x.sum(axis=1) < b), 78 | "transform_func_params": {"b": -0.5}, 79 | "fit_func": lambda x, q: {"a": x.sum(axis=1).quantile(q)}, 80 | "fit_func_params": {"q": 0.5}, 81 | }, 82 | "df": [ 83 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 84 | [0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0], 85 | ], 86 | "t": [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0], 87 | }, 88 | { 89 | "model": transformer.CustomizedTransformerHD, 90 | "params": { 91 | "transform_func": lambda x: pd.DataFrame( 92 | {"min": x.min(axis=1) > 0, "max": x.max(axis=1) > 0} 93 | ) 94 | }, 95 | "df": [ 96 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 97 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 98 | ], 99 | "t": { 100 | "min": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 101 | "max": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 102 | }, 103 | }, 104 | { 105 | "model": transformer.CustomizedTransformerHD, 106 | "params": { 107 | "transform_func": lambda x, a: pd.DataFrame( 108 | {"min": x.min(axis=1) > a, "max": x.max(axis=1) > a} 109 | ), 110 | "transform_func_params": {"a": 0}, 111 | }, 112 | "df": [ 113 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 114 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 115 | ], 116 | "t": { 117 | "min": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 118 | "max": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 119 | }, 120 | }, 121 | { 122 | "model": transformer.CustomizedTransformerHD, 123 | "params": { 124 | "transform_func": lambda x, a: pd.DataFrame( 125 | {"min": x.min(axis=1) > a, "max": x.max(axis=1) > a} 126 | ), 127 | "fit_func": lambda x: {"a": x.sum(axis=1).median()}, 128 | }, 129 | "df": [ 130 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 131 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 132 | ], 133 | "t": { 134 | "min": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 135 | "max": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 136 | }, 137 | }, 138 | { 139 | "model": transformer.CustomizedTransformerHD, 140 | "params": { 141 | "transform_func": lambda x, a: pd.DataFrame( 142 | {"min": x.min(axis=1) > a, "max": x.max(axis=1) > a} 143 | ), 144 | "fit_func": lambda x, q: {"a": x.sum(axis=1).quantile(q)}, 145 | "fit_func_params": {"q": 0.5}, 146 | }, 147 | "df": [ 148 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 149 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 150 | ], 151 | "t": { 152 | "min": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 153 | "max": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 154 | }, 155 | }, 156 | { 157 | "model": transformer.CustomizedTransformerHD, 158 | "params": { 159 | "transform_func": lambda x, a, b: pd.DataFrame( 160 | { 161 
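The CustomizedTransformerHD cases above all follow one pattern: an optional fit_func learns parameters from the training frame and transform_func applies them row-wise. A hedged sketch with a learned median threshold (the toy frame is illustrative only):

import pandas as pd

from adtk.transformer import CustomizedTransformerHD

df = pd.DataFrame(
    {"x": [0, 0, 0, 1, 0], "y": [0, 0, 0, 0, 0]},
    index=pd.date_range(start="2017-1-1", periods=5, freq="D"),
)

model = CustomizedTransformerHD(
    transform_func=lambda x, a: x.sum(axis=1) > a,
    fit_func=lambda x: {"a": x.sum(axis=1).median()},
)
# fit_transform learns a = median of the row sums (0 here), then flags the
# one row whose sum exceeds it, mirroring the test cases above.
print(model.fit_transform(df))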
| "min": (x.min(axis=1) > a) | (x.min(axis=1) < b), 162 | "max": (x.max(axis=1) > a) | (x.max(axis=1) < b), 163 | } 164 | ), 165 | "transform_func_params": {"b": -0.5}, 166 | "fit_func": lambda x: {"a": x.sum(axis=1).median()}, 167 | }, 168 | "df": [ 169 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 170 | [0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0], 171 | ], 172 | "t": { 173 | "min": [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], 174 | "max": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 175 | }, 176 | }, 177 | { 178 | "model": transformer.CustomizedTransformerHD, 179 | "params": { 180 | "transform_func": lambda x, a, b: pd.DataFrame( 181 | { 182 | "min": (x.min(axis=1) > a) | (x.min(axis=1) < b), 183 | "max": (x.max(axis=1) > a) | (x.max(axis=1) < b), 184 | } 185 | ), 186 | "transform_func_params": {"b": -0.5}, 187 | "fit_func": lambda x, q: {"a": x.sum(axis=1).quantile(q)}, 188 | "fit_func_params": {"q": 0.5}, 189 | }, 190 | "df": [ 191 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 192 | [0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0], 193 | ], 194 | "t": { 195 | "min": [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], 196 | "max": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], 197 | }, 198 | }, 199 | { 200 | "model": transformer.RegressionResidual, 201 | "params": {"regressor": LinearRegression(), "target": 1}, 202 | "df": [ 203 | [0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9], 204 | [9, 8, 7, 6, 5, 4, 3, 2, nan, 1, 0], 205 | [9] * 11, 206 | ], 207 | "t": [0] * 8 + [nan] + [0] * 2, 208 | }, 209 | { 210 | "model": transformer.PcaProjection, 211 | "params": {"k": 1}, 212 | "df": [[0, 1, 2, 3, 4, 4, nan, 5, 6], [0, 1, 2, 3, nan, 4, 5, 5, 6]], 213 | "t": { 214 | "pc0": [ 215 | 3 * 2 ** 0.5, 216 | 2 * 2 ** 0.5, 217 | 1 * 2 ** 0.5, 218 | 0 * 2 ** 0.5, 219 | nan, 220 | -1 * 2 ** 0.5, 221 | nan, 222 | -2 * 2 ** 0.5, 223 | -3 * 2 ** 0.5, 224 | ] 225 | }, 226 | }, 227 | { 228 | "model": transformer.PcaReconstruction, 229 | "params": {"k": 1}, 230 | "df": [ 231 | [0, 1, 2, 3, 3.9, 4.1, 5, 6, 7, 7, 8, 9], 232 | [0, 1, 2, 3, 4.1, 3.9, 5, 6, 7, nan, 8, 9], 233 | ], 234 | "t": { 235 | 0: [0, 1, 2, 3, 4, 4, 5, 6, 7, nan, 8, 9], 236 | 1: [0, 1, 2, 3, 4, 4, 5, 6, 7, nan, 8, 9], 237 | }, 238 | }, 239 | { 240 | "model": transformer.PcaReconstructionError, 241 | "params": {"k": 1}, 242 | "df": [ 243 | [0, 1, 2, 3, 3.9, 4.1, 5, 6, 7, 7, 8, 9], 244 | [0, 1, 2, 3, 4.1, 3.9, 5, 6, 7, nan, 8, 9], 245 | ], 246 | "t": [0, 0, 0, 0, 0.02, 0.02, 0, 0, 0, nan, 0, 0], 247 | }, 248 | ] 249 | 250 | 251 | @pytest.mark.parametrize("testCase", testCases) 252 | def test_fit_transform(testCase): 253 | """Test fit_transform the transformer.""" 254 | df = pd.DataFrame( 255 | np.array(testCase["df"]).T, 256 | pd.date_range( 257 | start="2017-1-1", periods=len(testCase["df"][0]), freq="D" 258 | ), 259 | ) 260 | model = testCase["model"](**testCase["params"]) 261 | if isinstance(model, _TrainableModel): 262 | t = model.fit_transform(df) 263 | else: 264 | t = model.transform(df) 265 | if not isinstance(testCase["t"], dict): 266 | t_true = pd.Series(testCase["t"], index=df.index) 267 | pd.testing.assert_series_equal(t, t_true, check_dtype=False) 268 | else: 269 | t_true = pd.DataFrame(testCase["t"], index=df.index) 270 | pd.testing.assert_frame_equal(t, t_true, check_dtype=False) 271 | 272 | 273 | @pytest.mark.parametrize("testCase", testCases) 274 | def test_fit_and_transform(testCase): 275 | """Test fit the transformer and then transform.""" 276 | df = pd.DataFrame( 277 | np.array(testCase["df"]).T, 278 | pd.date_range( 279 | start="2017-1-1", periods=len(testCase["df"][0]), freq="D" 280 | ), 281 | ) 282 | model = 
testCase["model"](**testCase["params"]) 283 | if isinstance(model, _TrainableModel): 284 | model.fit(df) 285 | t = model.transform(df) 286 | if not isinstance(testCase["t"], dict): 287 | t_true = pd.Series(testCase["t"], index=df.index) 288 | pd.testing.assert_series_equal(t, t_true, check_dtype=False) 289 | else: 290 | t_true = pd.DataFrame(testCase["t"], index=df.index) 291 | pd.testing.assert_frame_equal(t, t_true, check_dtype=False) 292 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | py35-pandas24-stats{9,11} 4 | py{36,37}-pandas{24,25,1}-stats{9,11} 5 | py38-pandas{25,1}-stats11 6 | [testenv] 7 | extras = test 8 | deps = 9 | pandas24: pandas>=0.24,<0.25 10 | pandas25: pandas>=0.25,<0.26 11 | pandas1: pandas>=1.0,<1.1 12 | stats9: statsmodels>=0.9,<0.10 13 | stats11: statsmodels>=0.11,<0.12 14 | commands = 15 | pytest 16 | mypy ./src/adtk/ --config-file ./mypy.ini --------------------------------------------------------------------------------