├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── python-publish.yml │ └── testing.yml ├── .gitignore ├── .readthedocs.yml ├── CITATION.cff ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmark ├── README.md ├── benchamark_results.csv ├── benchmark.ipynb ├── benchmark_results.svg ├── dataset.py └── requirements_benchmark.txt ├── docs ├── Makefile ├── make.bat ├── requirements-docs.txt └── source │ ├── _templates │ └── class.rst │ ├── api │ ├── streamad.evaluate.md │ ├── streamad.md │ ├── streamad.model.md │ ├── streamad.process.md │ └── streamad.util.md │ ├── benchmark.md │ ├── conf.py │ ├── example │ ├── calibrator_usage.ipynb │ ├── dataset_usage.ipynb │ ├── ensemble_usage.ipynb │ ├── example.md │ ├── multivariate.ipynb │ └── univariate.ipynb │ ├── images │ ├── logo_html.svg │ ├── logo_htmlwithname.svg │ └── logo_index.svg │ ├── index.md │ ├── overview.md │ ├── references.md │ └── refs.bib ├── example ├── README.md ├── dataset_usage.ipynb ├── multivariate.ipynb ├── thresholder_usage.ipynb └── univariate.ipynb ├── poetry.lock ├── pyproject.toml ├── streamad ├── __init__.py ├── base │ ├── __init__.py │ ├── detector.py │ └── metrics.py ├── evaluate │ ├── __init__.py │ ├── numenta_aware_metrics.py │ ├── point_aware_metrics.py │ ├── series_aware_metrics.py │ └── ts_metrics.py ├── meta.yaml ├── model │ ├── KNN_Detector.py │ ├── Mad_Dectector.py │ ├── OCSVM_Detector.py │ ├── SArima_Detector.py │ ├── SR_Detector.py │ ├── __init__.py │ ├── hstree_Detector.py │ ├── loda_Detector.py │ ├── random_Detector.py │ ├── rrcf_Detector.py │ ├── rshash_Detector.py │ ├── spot_Detector.py │ ├── xStream_Detector.py │ ├── zscore_Detector.py │ └── zspot_Detector.py ├── process │ ├── __init__.py │ ├── tdigest_calibrator.py │ ├── vote_ensemble.py │ ├── weight_ensemble.py │ └── zscore_calibrator.py ├── util │ ├── __init__.py │ ├── data │ │ ├── multiDS.csv │ │ └── uniDS.csv │ ├── dataset.py │ ├── math_toolkit.py │ ├── plot.py │ └── stream_generator.py └── version.py └── test ├── __init__.py ├── test_OCSVM.py ├── test_calibrator.py ├── test_ensemble.py ├── test_evaluate.py ├── test_hstree.py ├── test_knncad.py ├── test_loda.py ├── test_mad.py ├── test_plot.py ├── test_random.py ├── test_rrcf.py ├── test_rshash.py ├── test_sarima.py ├── test_sdft.py ├── test_spot.py ├── test_sr.py ├── test_stats.py ├── test_xstream.py └── test_zscore.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug, enhancement 6 | assignees: Fengrui-Liu 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. Windows 11] 28 | 29 | **Package version (please complete the following information):** 30 | - Version [e.g. pypi 0.1.1] 31 | 32 | **Additional context** 33 | Add any other context about the problem here. 
34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: Fengrui-Liu 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Use this template to add a new detector/feature. 2 | ### Model Name: 3 | 4 | 5 | ### Paper/Project related links or docs: 6 | 7 | 8 | 9 | ### New Model Submissions: 10 | 11 | * [ ] Have you created a .py in ~/streamad/model/? 12 | * [ ] Have you created a _example.py in ~/examples/? 13 | * [ ] Have you created a test_.py in ~/test/? 14 | * [ ] Have you created Google style doc for each Class in ? -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - uses: actions/checkout@v3 24 | - name: Build and publish to PyPI 25 | uses: JRubics/poetry-publish@v1.16 26 | with: 27 | user: __token__ 28 | pypi_token: ${{ secrets.PYPI_API_TOKEN }} 29 | verbose: true 30 | -------------------------------------------------------------------------------- /.github/workflows/testing.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - dev 8 | pull_request: 9 | branches: 10 | - main 11 | - dev 12 | workflow_call: 13 | inputs: 14 | username: 15 | required: false 16 | type: string 17 | secrets: 18 | access-token: 19 | required: false 20 | 21 | jobs: 22 | build: 23 | strategy: 24 | fail-fast: false 25 | matrix: 26 | os: [ubuntu-latest] 27 | python-version: ["3.8", "3.9", "3.10", "3.11"] 28 | poetry-version: ["1.4.2"] 29 | runs-on: ${{ matrix.os }} 30 | 31 | steps: 32 | - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event." 33 | - run: echo "🐧 This job is now running on a ${{ runner.os }} server hosted by GitHub!" 
34 | - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}." 35 | #---------------------------------------------- 36 | # check-out repo and set-up python 37 | #---------------------------------------------- 38 | - name: Check out repository code 39 | uses: actions/checkout@master 40 | - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner." 41 | - run: echo "🖥️ The workflow is now ready to test your code on the runner." 42 | - name: List files in the repository 43 | run: | 44 | ls ${{ github.workspace }} 45 | - name: Python ${{ matrix.python-version }} 46 | uses: actions/setup-python@master 47 | with: 48 | python-version: ${{ matrix.python-version }} 49 | #---------------------------------------------- 50 | # ----- install & configure poetry ----- 51 | #---------------------------------------------- 52 | - name: Install Poetry 53 | uses: abatilo/actions-poetry@v2 54 | with: 55 | virtualenvs-create: true 56 | virtualenvs-in-project: true 57 | installer-parallel: true 58 | poetry-version: ${{ matrix.poetry-version }} 59 | - name: Install dependencies 60 | run: poetry install --no-interaction --no-root 61 | - name: Generate coverage report 62 | run: poetry run pytest --cov=./ --cov-report=xml 63 | - name: Upload coverage to Codecov 64 | uses: codecov/codecov-action@v2 65 | with: 66 | token: ${{ secrets.CODECOV_TOKEN }} 67 | directory: ./ 68 | files: ./coverage.xml 69 | env_vars: OS,PYTHON 70 | fail_ci_if_error: true 71 | flags: pytests 72 | name: codecov-umbrella 73 | verbose: true 74 | - run: echo "🍏 This job's status is ${{ job.status }}." 75 | 76 | publish: 77 | runs-on: ubuntu-latest 78 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') 79 | steps: 80 | - uses: actions/checkout@v3 81 | - uses: actions/setup-python@v3 82 | - name: Install dependencies 83 | run: | 84 | python -m pip install --upgrade pip 85 | pip install build 86 | - name: Build package 87 | run: python -m build 88 | - name: Install twine to check the package 89 | run: pip install twine 90 | - name: Check the package 91 | run: twine check dist/* 92 | - name: Publish distribution 📦 to PyPI 93 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') 94 | uses: pypa/gh-action-pypi-publish@v1.5.0 95 | with: 96 | user: __token__ 97 | password: ${{ secrets.PYPI_API_TOKEN }} 98 | verbose: true 99 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .cache/ 3 | .pytest_cache 4 | __pycache__ 5 | .vscode 6 | 7 | # Byte-compiled / optimized / DLL files 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | .pytest_cache/ 14 | 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | cover/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | 79 | 80 | # PyBuilder 81 | .pybuilder/ 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | test.ipynb 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | # For a library or package, you might want to ignore these files since the code is 94 | # intended to run in multiple environments; otherwise, check them in: 95 | # .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 102 | #Pipfile.lock 103 | 104 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 105 | __pypackages__/ 106 | 107 | # Celery stuff 108 | celerybeat-schedule 109 | celerybeat.pid 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | .env 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | /docs/build/ 133 | /docs/source/benchmark/streamad-benchmark-dataset/ 134 | 135 | # benchmark 136 | 137 | /benchmark/streamad-benchmark-dataset/ 138 | 139 | # mypy 140 | .mypy_cache/ 141 | .dmypy.json 142 | dmypy.json 143 | 144 | # Pyre type checker 145 | .pyre/ 146 | 147 | # pytype static type analyzer 148 | .pytype/ 149 | 150 | # Cython debug symbols 151 | cython_debug/ 152 | 153 | # Others 154 | benchmark/ 155 | .DS_Store -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: "ubuntu-22.04" 5 | tools: 6 | python: "3.8" 7 | jobs: 8 | post_create_environment: 9 | - pip install -U setuptools==58.2.0 10 | 11 | 12 | 13 | python: 14 | install: 15 | - method: pip 16 | path: . 17 | extra_requirements: 18 | - rtd 19 | - requirements: docs/requirements-docs.txt 20 | 21 | sphinx: 22 | builder: html 23 | fail_on_warning: true 24 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: Liu 5 | given-names: Fengrui 6 | orcid: 7 | title: "StreamAD" 8 | version: 0.3.0 9 | doi: 10 | date-released: 2022-05-15 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2019 Seldon Technologies Ltd. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include requirements.txt 3 | prune test -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # StreamAD 2 | 3 | ![StreamAD Logo](docs/source/images/logo_htmlwithname.svg) 4 | 5 | 6 | 7 | Anomaly detection for data streams/time series. Detectors process univariate or multivariate data points one by one to simulate a real-time scenario. 
8 | 9 | 10 | 11 | [Documentation](https://streamad.readthedocs.io/en/latest/) 12 | 13 | 14 | 15 | 16 | 17 | 18 | ![PyPI](https://img.shields.io/pypi/v/streamad) 19 | ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/StreamAD?style=flat) 20 | ![PyPI - Implementation](https://img.shields.io/pypi/implementation/streamad) 21 | 22 | ![Read the Docs](https://img.shields.io/readthedocs/streamad?style=flat) 23 | ![GitHub](https://img.shields.io/github/license/Fengrui-Liu/StreamAD) 24 | [![Downloads](https://static.pepy.tech/personalized-badge/streamad?period=total&units=international_system&left_color=grey&right_color=orange&left_text=Downloads)](https://pepy.tech/project/streamad) 25 | 26 | 27 | ![example workflow](https://github.com/Fengrui-Liu/StreamAD/actions/workflows/testing.yml//badge.svg) 28 | [![codecov](https://codecov.io/gh/Fengrui-Liu/StreamAD/branch/main/graph/badge.svg?token=AQG26L2RA7)](https://codecov.io/gh/Fengrui-Liu/StreamAD) 29 | [![Maintainability](https://api.codeclimate.com/v1/badges/525d7e3663ee4c5c0daa/maintainability)](https://codeclimate.com/github/Fengrui-Liu/StreamAD/maintainability) 30 | [![FOSSA Status](https://app.fossa.com/api/projects/git%2Bgithub.com%2FFengrui-Liu%2FStreamAD.svg?type=small)](https://app.fossa.com/projects/git%2Bgithub.com%2FFengrui-Liu%2FStreamAD?ref=badge_small) 31 | 32 | 33 | 34 | --- 35 | 36 | 37 | 38 | ## Installation 39 | 40 | The stable version can be installed from PyPI: 41 | 42 | ```bash 43 | pip install streamad 44 | ``` 45 | 46 | The development version can be installed from GitHub: 47 | 48 | ```bash 49 | pip install git+https://github.com/Fengrui-Liu/StreamAD 50 | ``` 51 | 52 | --- 53 | 54 | ## Quick Start 55 | 56 | Start a detection task within 5 lines of code. You can find more examples with visualization results [here](https://streamad.readthedocs.io/en/latest/example/example.html). 57 | 58 | ```python 59 | from streamad.util import StreamGenerator, UnivariateDS 60 | from streamad.model import SpotDetector 61 | 62 | ds = UnivariateDS() 63 | stream = StreamGenerator(ds.data) 64 | model = SpotDetector() 65 | 66 | for x in stream.iter_item(): 67 | score = model.fit_score(x) 68 | ``` 69 | 70 | ## Models 71 | 72 | ### For univariate time series 73 | 74 | If you want to detect multivariate time series with these models, you need to apply them to each feature separately, as in the sketch below. 
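Below is a minimal sketch of that per-feature pattern. It assumes `MultivariateDS().data` is a 2-D NumPy array with one column per feature and that each item yielded by `StreamGenerator.iter_item()` is a vector of the same width, so every univariate detector scores its own single-feature slice; adjust the slicing to your data layout.

```python
from streamad.util import StreamGenerator, MultivariateDS
from streamad.model import SpotDetector

ds = MultivariateDS()
stream = StreamGenerator(ds.data)

# One univariate detector per feature/column (assumes 2-D data).
n_features = ds.data.shape[1]
detectors = [SpotDetector() for _ in range(n_features)]

for x in stream.iter_item():
    # x is assumed to hold one value per feature at the current time step;
    # each detector scores its own single-feature slice.
    scores = [detectors[i].fit_score(x[i : i + 1]) for i in range(n_features)]
```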
75 | 76 | | Model Example | API Usage | Paper | 77 | | ----------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 78 | | [KNNCAD](https://streamad.readthedocs.io/en/latest/example/univariate.html#knncad-detector) | [streamad.model.KNNDetector](https://streamad.readthedocs.io/en/latest/api/streamad.model.html#knndetector) | [Conformalized density- and distance-based anomaly detection in time-series data](https://arxiv.org/abs/1608.04585) | 79 | | [SPOT](https://streamad.readthedocs.io/en/latest/example/univariate.html#spot-detector) | [streamad.model.SpotDetector](https://streamad.readthedocs.io/en/latest/api/streamad.model.html#spotdetector) | [Anomaly detection in streams with extreme value theory](https://dl.acm.org/doi/10.1145/3097983.3098144) | 80 | | [Spectral Residual](https://streamad.readthedocs.io/en/latest/example/univariate.html#spectral-residual-detector) | [streamad.model.SRDetector](https://streamad.readthedocs.io/en/latest/api/streamad.model.html#srdetector) | [Time-series anomaly detection service at microsoft](https://arxiv.org/abs/1906.03821) | 81 | | [Z score](https://streamad.readthedocs.io/en/latest/example/univariate.html#z-score-detector) | [streamad.model.ZScoreDetector](https://streamad.readthedocs.io/en/latest/api/streamad.model.html#zscoredetector) | [Standard score](https://en.wikipedia.org/wiki/Standard_score) | 82 | | [One-class SVM](https://streamad.readthedocs.io/en/latest/example/univariate.html#one-class-svm-detector) | [streamad.model.OCSVMDetector](https://streamad.readthedocs.io/en/latest/api/streamad.model.html#ocsvmdetector) | [One-class SVM](https://en.wikipedia.org/w/index.php?title=One-class_classification&oldid=1098733917) | 83 | | [MAD](https://streamad.readthedocs.io/en/latest/example/univariate.html#median-absolute-deviation-detector) | [streamad.model.MadDetector](https://streamad.readthedocs.io/en/latest/api/streamad.model.html#maddetector) | [Median absolute deviation](https://www.influxdata.com/blog/anomaly-detection-with-median-absolute-deviation/#:~:text=How%20Median%20Absolute%20Deviation%20algorithm,time%20series%20at%20that%20timestamp/) | 84 | | [SARIMAX](https://streamad.readthedocs.io/en/latest/example/univariate.html#seasonal-arima-detector) | [streamad.model.SArimaDetector](https://streamad.readthedocs.io/en/latest/api/streamad.model.html#sarimadetector) | [Seasonal Arima Detector](https://www.statsmodels.org/dev/generated/statsmodels.tsa.statespace.sarimax.SARIMAX.html?highlight=sarimax#statsmodels.tsa.statespace.sarimax.SARIMAX) | 85 | 86 | ### For multivariate time series 87 | 88 | These models are compatible with univariate time series. 
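For the multivariate detectors listed below, the loop is the same as in the Quick Start; the sketch here assumes `xStreamDetector` can be constructed with default parameters and scores each multivariate item directly.

```python
from streamad.util import StreamGenerator, MultivariateDS
from streamad.model import xStreamDetector

ds = MultivariateDS()
stream = StreamGenerator(ds.data)
model = xStreamDetector()

for x in stream.iter_item():
    # Each item carries all features of one time step; the detector
    # returns a single anomaly score for the whole vector.
    score = model.fit_score(x)
```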
89 | 90 | 91 | 92 | | Models Example | API Usage | Paper | 93 | | ------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 94 | | [xStream](https://streamad.readthedocs.io/en/latest/example/multivariate.html#xstream-detector) | [streamad.model.xStreamDetector](https://streamad.readthedocs.io/en/latest/api/streamad.model.html#xstreamdetector) | [Xstream: outlier detection in feature-evolving data streams](http://www.kdd.org/kdd2018/accepted-papers/view/xstream-outlier-detection-in-feature-evolving-data-streams) | 95 | | [RShash](https://streamad.readthedocs.io/en/latest/example/multivariate.html#rshash-detector) | [streamad.model.RShashDetector](https://streamad.readthedocs.io/en/latest/api/streamad.model.html#rshashdetector) | [Subspace Outlier Detection in Linear Time with Randomized Hashing](https://ieeexplore.ieee.org/document/7837870) | 96 | | [HSTree](https://streamad.readthedocs.io/en/latest/example/multivariate.html#half-space-tree-detector) | [streamad.model.HSTreeDetector](https://streamad.readthedocs.io/en/latest/api/streamad.model.html#hstreedetector) | [Fast Anomaly Detection for Streaming Data](https://www.ijcai.org/Proceedings/11/Papers/254.pdf) | 97 | | [LODA](https://streamad.readthedocs.io/en/latest/example/multivariate.html#loda-detector) | [streamad.model.LodaDetector](https://streamad.readthedocs.io/en/latest/api/streamad.model.html#lodadetector) | [Lightweight on-line detector of anomalies](https://link.springer.com/article/10.1007/s10994-015-5521-0) | 98 | | [RRCF](https://streamad.readthedocs.io/en/latest/example/univariate.html#rrcf-detector) | [streamad.model.RrcfDetector](https://streamad.readthedocs.io/en/latest/api/streamad.model.html#rrcfdetector) | [Robust random cut forest based anomaly detection on streams](http://proceedings.mlr.press/v48/guha16.pdf) | 99 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | 1. [GAIA Dataset](https://github.com/CloudWise-OpenSource/GAIA-DataSet) 2 | 2. 
[] -------------------------------------------------------------------------------- /benchmark/benchamark_results.csv: -------------------------------------------------------------------------------- 1 | Detector,Dataset,Key,Size(#),Time(s),Point_Precision,Point_Recall,Point_Fbeta,Series_Precision,Series_Recall,Series_Fbeta,Numenta_Precision,Numenta_Recall,Numenta_Fbeta 2 | SpotDetector,GAIA,linear_data_4_from2018-12-19to2019-01-31_8313,12672,0.45219812600000253,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 3 | SpotDetector,GAIA,linear_data_42_from2018-12-19to2019-01-31_8153,12672,0.4196338829999995,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 4 | SpotDetector,GAIA,linear_data_7_from2018-12-19to2019-01-31_8300,12672,0.4480964440000008,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 5 | SpotDetector,GAIA,linear_data_11_from2018-12-19to2019-01-31_8164,12672,0.44065757699999963,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 6 | SpotDetector,GAIA,linear_data_32_from2018-12-19to2019-01-31_8151,12672,0.501862311,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 7 | SpotDetector,GAIA,linear_data_25_from2018-12-19to2019-01-31_8412,12672,0.4510404809999997,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 8 | SpotDetector,GAIA,linear_data_4_from2018-12-19to2019-01-31_8313,12672,0.694301566,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 9 | SpotDetector,GAIA,linear_data_42_from2018-12-19to2019-01-31_8153,12672,0.9583882379999977,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 10 | SpotDetector,GAIA,linear_data_7_from2018-12-19to2019-01-31_8300,12672,0.7049085780000013,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 11 | SpotDetector,GAIA,linear_data_11_from2018-12-19to2019-01-31_8164,12672,0.49868877999999484,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 12 | SpotDetector,GAIA,linear_data_32_from2018-12-19to2019-01-31_8151,12672,0.4977096179999947,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 13 | SpotDetector,GAIA,linear_data_25_from2018-12-19to2019-01-31_8412,12672,0.4146735479999961,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 14 | SpotDetector,GAIA,linear_data_8_from2018-12-19to2019-01-31_8329,12672,0.9985857739999986,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 15 | SpotDetector,GAIA,linear_data_57_from2018-12-19to2019-01-31_8268,12672,0.3831847010000047,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 16 | SpotDetector,GAIA,linear_data_41_from2018-12-19to2019-01-31_8386,12672,0.37832419400000106,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 17 | SpotDetector,GAIA,linear_data_29_from2018-12-19to2019-01-31_8410,12672,0.4857962309999948,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 18 | SpotDetector,GAIA,linear_data_54_from2018-12-19to2019-01-31_8319,12672,0.38469699799999546,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 19 | SpotDetector,GAIA,linear_data_12_from2018-12-19to2019-01-31_8372,12672,0.4077938490000008,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 20 | SpotDetector,GAIA,linear_data_26_from2018-12-19to2019-01-31_8348,12672,0.44336228300001324,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 21 | SpotDetector,GAIA,linear_data_31_from2018-12-19to2019-01-31_8192,12672,0.6270404440000021,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 22 | SpotDetector,GAIA,linear_data_58_from2018-12-19to2019-01-31_8334,12672,0.4341263150000003,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 23 | SpotDetector,GAIA,linear_data_30_from2018-12-19to2019-01-31_8376,12672,0.45635435400001256,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 24 | SpotDetector,GAIA,linear_data_20_from2018-12-19to2019-01-31_8210,12672,0.4395088150000106,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 25 | SpotDetector,GAIA,linear_data_43_from2019-11-16to2019-12-16_2584,8640,0.4916386119999885,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 26 | 
SpotDetector,GAIA,linear_data_48_from2018-12-19to2019-01-31_8425,12672,0.449180405000007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 27 | SpotDetector,GAIA,linear_data_51_from2018-12-19to2019-01-31_8286,12672,0.5169517649999875,0.6666666666666666,0.017699115044247787,0.034482758620689655,0.75,0.007352941176470588,0.014563106796116504,0.6666666666666666,0.014705882352941176,0.028776978417266192 28 | SpotDetector,GAIA,linear_data_2_from2018-12-19to2019-01-31_8304,12672,0.40478352799999584,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 29 | SpotDetector,GAIA,linear_data_17_from2018-12-19to2019-01-31_8159,12672,0.3556049849999994,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 30 | SpotDetector,GAIA,linear_data_13_from2019-11-16to2019-12-16_1646,8640,0.3012597190000008,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 31 | SpotDetector,GAIA,linear_data_18_from2018-12-19to2019-01-31_8167,12672,0.373629550000004,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 32 | SpotDetector,GAIA,linear_data_34_from2018-12-19to2019-01-31_8385,12672,0.37975166599999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 33 | SpotDetector,GAIA,linear_data_47_from2019-01-01to2019-01-31_7946,8806,2.7575944400000054,0.2,0.009345794392523364,0.017857142857142856,0.2,0.018518518518518517,0.03389830508474576,0.2,0.018518518518518517,0.03389830508474576 34 | SpotDetector,GAIA,linear_data_37_from2018-12-19to2019-01-31_8378,12672,1.0630981340000005,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 35 | SpotDetector,GAIA,linear_data_1_from2019-01-01to2019-01-31_7861,8806,1.670446443000003,0.5,0.017857142857142856,0.03448275862068965,0.5,0.027777777777777776,0.05263157894736842,0.5,0.05555555555555555,0.09999999999999999 36 | SpotDetector,GAIA,linear_data_23_from2018-12-19to2019-01-31_8199,12672,0.5604692799999924,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 37 | SpotDetector,GAIA,linear_data_52_from2019-01-01to2019-01-31_8145,8806,0.34901906199999644,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 38 | SpotDetector,GAIA,linear_data_44_from2018-12-19to2019-01-31_8396,12672,0.5879434660000129,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 39 | SpotDetector,GAIA,linear_data_38_from2018-12-19to2019-01-31_8224,12672,0.43967261599999574,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 40 | SpotDetector,GAIA,linear_data_14_from2018-12-19to2019-01-31_8213,12672,0.4007684850000004,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 41 | SpotDetector,GAIA,linear_data_33_from2018-12-19to2019-01-31_8232,12672,0.4994144230000046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 42 | SpotDetector,GAIA,linear_data_9_from2018-12-19to2019-01-31_8315,12672,0.4864235219999955,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 43 | SpotDetector,GAIA,linear_data_56_from2019-01-01to2019-01-31_8146,8807,0.24272840499997983,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 44 | SpotDetector,GAIA,linear_data_6_from2018-12-19to2019-01-31_8263,12672,0.49525077100000203,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 45 | SpotDetector,GAIA,linear_data_24_from2018-12-19to2019-01-31_8360,12672,0.3405894979999857,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 46 | SpotDetector,GAIA,linear_data_10_from2018-12-19to2019-01-31_8158,12672,0.33526661499999477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0 47 | SpotDetector,GAIA,linear_data_40_from2018-12-19to2019-01-31_8219,12672,0.3877679789999888,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 48 | SpotDetector,GAIA,linear_data_28_from2018-12-19to2019-01-31_8163,12672,0.4542016530000126,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 49 | SpotDetector,GAIA,linear_data_55_from2018-12-19to2019-01-31_8332,12672,0.36237533799999255,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 50 | 
SpotDetector,GAIA,linear_data_5_from2018-12-19to2019-01-31_8277,12672,0.4070936169999868,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 51 | SpotDetector,GAIA,linear_data_27_from2018-12-19to2019-01-31_8177,12672,0.38263655999998036,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 52 | SpotDetector,GAIA,linear_data_21_from2018-12-19to2019-01-31_8172,12672,0.3919302930000015,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 53 | SpotDetector,GAIA,linear_data_3_from2018-12-19to2019-01-31_8273,12672,0.4034993919999863,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 54 | SpotDetector,GAIA,linear_data_35_from2018-12-19to2019-01-31_8185,12672,0.36679661100001226,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 55 | SpotDetector,GAIA,linear_data_19_from2018-12-19to2019-01-31_8417,12672,0.5563156700000036,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 56 | SpotDetector,GAIA,linear_data_16_from2018-12-19to2019-01-31_8373,12672,0.35345210799999904,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 57 | SpotDetector,GAIA,linear_data_46_from2018-12-19to2019-01-31_8354,12672,0.34292623700000036,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 58 | SpotDetector,GAIA,linear_data_50_from2018-12-19to2019-01-31_8318,12672,0.422890265999996,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 59 | SpotDetector,GAIA,linear_data_49_from2018-12-19to2019-01-31_8380,12672,0.9232572329999869,0.6666666666666666,0.015873015873015872,0.031007751937984492,0.75,0.007352941176470588,0.014563106796116504,0.6666666666666666,0.014705882352941176,0.028776978417266192 60 | SpotDetector,GAIA,linear_data_22_from2018-12-19to2019-01-31_8166,12672,0.3804424410000138,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 61 | SpotDetector,GAIA,linear_data_36_from2018-12-19to2019-01-31_8387,12672,0.3825128079999729,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 62 | SpotDetector,GAIA,linear_data_0_from2018-12-19to2019-01-31_8339,12672,0.3520609029999946,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 63 | SpotDetector,GAIA,linear_data_15_from2018-12-19to2019-01-31_8367,12672,0.6504752200000041,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 64 | SpotDetector,GAIA,linear_data_53_from2018-12-19to2019-01-31_8331,12672,0.3944678900000156,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 65 | SpotDetector,GAIA,linear_data_39_from2018-12-19to2019-01-31_8225,12672,0.5024501979999911,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 66 | SpotDetector,GAIA,linear_data_45_from2018-12-19to2019-01-31_8221,12672,0.35027252800000497,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 67 | -------------------------------------------------------------------------------- /benchmark/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import pandas as pd 4 | import numpy as np 5 | from typing import Literal 6 | import json 7 | import ast 8 | import gdown 9 | import zipfile 10 | 11 | DS = { 12 | "AIOPS_KPI": ["preliminary_train", "finals_train", "finals_ground_truth"], 13 | "MICRO": [], 14 | "AWSCloudwatch": [], 15 | "GAIA": [ 16 | "changepoint_data", 17 | "concept_drift_data", 18 | "linear_data", 19 | "low_signal-to-noise_ratio_data", 20 | "partially_stationary_data", 21 | "periodic_data", 22 | "staircase_data", 23 | ], 24 | "MSL": ["test"], 25 | "SMD": [], 26 | "CHM": [], 27 | } 28 | 29 | 30 | def check(ds_name, path="./streamad-benchmark-dataset"): 31 | assert ds_name in DS, f"Unavailable dataset {ds_name}, only support {list(DS.keys())}" 32 | 33 | if not os.path.exists(path): 34 | os.makedirs(path) 35 | 36 | 37 | def download_ds(ds_name, path="./streamad-benchmark-dataset"): 38 | check(ds_name, path) 39 | 40 | if os.path.exists(path + "/" + ds_name): 41 | print("Dataset {} already 
exists".format(ds_name)) 42 | return 43 | 44 | if str.lower(ds_name) == "aiops_kpi": 45 | subprocess.check_call( 46 | [ 47 | "git", 48 | "clone", 49 | "--depth=1", 50 | "https://github.com/NetManAIOps/KPI-Anomaly-Detection.git", 51 | path + "/AIOPS_KPI", 52 | ] 53 | ) 54 | subprocess.check_call( 55 | [ 56 | "unzip", 57 | path + "/AIOPS_KPI/Finals_dataset/phase2_ground_truth.hdf.zip", 58 | "-d", 59 | path + "/AIOPS_KPI/Finals_dataset/", 60 | ] 61 | ) 62 | subprocess.check_call( 63 | [ 64 | "unzip", 65 | path + "/AIOPS_KPI/Finals_dataset/phase2_train.csv.zip", 66 | "-d", 67 | path + "/AIOPS_KPI/Finals_dataset/", 68 | ] 69 | ) 70 | elif str.lower(ds_name) == "micro": 71 | os.makedirs(path + "/MICRO/", exist_ok=True) 72 | gdown.download( 73 | id="1nkEsD1g7THm_T58KwUQZ7o-b174fdx-n", 74 | output=path + "/MICRO/data.zip", 75 | ) 76 | 77 | with zipfile.ZipFile(path + "/MICRO/data.zip") as zip_ref: 78 | zip_ref.extractall(path + "/MICRO/") 79 | 80 | for root, dirs, files in os.walk(path + "/MICRO/AIOps挑战赛数据"): 81 | for filename in files: 82 | if filename.endswith(".zip"): 83 | fileSpec = path + "/MICRO/AIOps挑战赛数据/" + filename 84 | with zipfile.ZipFile(fileSpec) as zip_ref: 85 | zip_ref.extractall(path + "/MICRO/") 86 | 87 | elif str.lower(ds_name) == "awscloudwatch": 88 | subprocess.check_call( 89 | [ 90 | "git", 91 | "clone", 92 | "--depth=1", 93 | "--filter=tree:0", 94 | "--sparse", 95 | "https://github.com/numenta/NAB.git", 96 | path + "/AWSCloudwatch", 97 | ] 98 | ) 99 | subprocess.check_call( 100 | [ 101 | "cd " 102 | + path 103 | + "/AWSCloudwatch/ && git sparse-checkout set data/realAWSCloudwatch && wget https://raw.githubusercontent.com/numenta/NAB/master/labels/combined_labels.json" 104 | ], 105 | shell=True, 106 | ) 107 | elif str.lower(ds_name) == "gaia": 108 | subprocess.check_call( 109 | [ 110 | "wget", 111 | "https://raw.githubusercontent.com/CloudWise-OpenSource/GAIA-DataSet/main/Companion_Data/metric_detection.zip", 112 | "-P", 113 | path + "/GAIA", 114 | ] 115 | ) 116 | subprocess.check_call( 117 | [ 118 | "unzip", 119 | path + "/GAIA/metric_detection.zip", 120 | "-d", 121 | path + "/GAIA/", 122 | ] 123 | ) 124 | 125 | elif str.lower(ds_name) == "msl": 126 | subprocess.check_call( 127 | [ 128 | "wget", 129 | "https://s3-us-west-2.amazonaws.com/telemanom/data.zip", 130 | "-P", 131 | path + "/MSL", 132 | ] 133 | ) 134 | 135 | subprocess.check_call( 136 | [ 137 | "unzip", 138 | path + "/MSL/data.zip", 139 | "-d", 140 | path + "/MSL/", 141 | ] 142 | ) 143 | subprocess.check_call( 144 | [ 145 | "rm", 146 | path + "/MSL/data.zip", 147 | ] 148 | ) 149 | 150 | subprocess.check_call( 151 | [ 152 | "wget", 153 | "https://raw.githubusercontent.com/khundman/telemanom/master/labeled_anomalies.csv", 154 | "-P", 155 | path + "/MSL", 156 | ] 157 | ) 158 | elif str.lower(ds_name) == "chm": 159 | subprocess.check_call( 160 | [ 161 | "git", 162 | "clone", 163 | "--depth=1", 164 | "https://github.com/Fengrui-Liu/Cloud-host-metrics-dataset", 165 | path + "/CHM", 166 | ] 167 | ) 168 | subprocess.check_call( 169 | ["unzip", path + "/CHM/data.zip", "-d", path + "/CHM/"] 170 | ) 171 | subprocess.check_call(["rm", "-rf", path + "/CHM/.git"]) 172 | elif str.lower(ds_name) == "smd": 173 | subprocess.check_call( 174 | [ 175 | "git", 176 | "clone", 177 | "--depth=1", 178 | "https://github.com/NetManAIOps/OmniAnomaly", 179 | path + "/SMD", 180 | ] 181 | ) 182 | 183 | 184 | def prepare_ds( 185 | ds_name: Literal["AIOPS_KPI"], path="./streamad-benchmark-dataset" 186 | ): 187 | # check(ds_name, path) 188 | 189 | 
download_ds(ds_name, path) 190 | 191 | 192 | def read_ds(ds_name, ds_file, path="./streamad-benchmark-dataset"): 193 | check(ds_name, path) 194 | 195 | if str.lower(ds_name) == "aiops_kpi": 196 | if ds_file == "preliminary_train": 197 | df = pd.read_csv( 198 | path + "/" + ds_name + "/Preliminary_dataset/train.csv" 199 | ) 200 | 201 | elif ds_file == "finals_train": 202 | df = pd.read_csv( 203 | path + "/" + ds_name + "/Finals_dataset/phase2_train.csv" 204 | ) 205 | 206 | elif ds_file == "finals_ground_truth": 207 | df = pd.read_hdf( 208 | path + "/" + ds_name + "/Finals_dataset/phase2_ground_truth.hdf" 209 | ) 210 | else: 211 | raise FileNotFoundError( 212 | "Unavailable dataset file, only support {}".format(DS[ds_name]) 213 | ) 214 | 215 | df_groups = df.groupby("KPI ID") 216 | keys = df_groups.groups.keys() 217 | dfs = {} 218 | for key in keys: 219 | df_key = df_groups.get_group(key) 220 | df_key = df_key[["timestamp", "value", "label"]] 221 | df_label = df_key["label"] 222 | dfs[key] = (df_key, df_label) 223 | 224 | return dfs 225 | 226 | elif str.lower(ds_name) == "micro": 227 | labels = pd.read_csv(path + "/MICRO/故障整理(预赛).csv", index_col=["index"]) 228 | labels = labels.dropna(subset=["kpi", "start_time"]) 229 | dfs = {} 230 | for idx, fault in labels.iterrows(): 231 | start_time = fault["start_time"] 232 | duration = fault["duration"] 233 | folder = pd.to_datetime(start_time).strftime("%Y_%m_%d") 234 | start_time = pd.to_datetime(start_time + "+0800", utc=True) 235 | end_time = start_time + pd.Timedelta(duration) 236 | 237 | df_lst = [] 238 | for root, dirs, files in os.walk( 239 | path + "/MICRO/" + folder + "/平台指标/" 240 | ): 241 | for filename in files: 242 | if filename.endswith(".csv"): 243 | df = pd.read_csv( 244 | path + "/MICRO/" + folder + "/平台指标/" + filename 245 | ) 246 | df_lst.append(df) 247 | 248 | df = pd.concat(df_lst, axis=0) 249 | 250 | for kpi in fault["kpi"].split(";"): 251 | df_kpi = df[ 252 | (df["name"] == kpi) & (df["cmdb_id"] == fault["name"]) 253 | ][["timestamp", "value"]] 254 | df_kpi["label"] = 0 255 | df_kpi.loc[ 256 | (df_kpi["timestamp"] > start_time.timestamp() * 1000) 257 | & (df_kpi["timestamp"] < end_time.timestamp() * 1000), 258 | "label", 259 | ] = 1 260 | dfs[kpi + "_" + fault["name"]] = (df_kpi, df_kpi["label"]) 261 | 262 | return dfs 263 | 264 | elif str.lower(ds_name) == "awscloudwatch": 265 | labels = json.load(open(path + "/AWSCloudwatch/combined_labels.json")) 266 | dfs = {} 267 | for f in os.listdir(path + "/AWSCloudwatch/data/realAWSCloudwatch"): 268 | if f.endswith(".csv"): 269 | df = pd.read_csv( 270 | path + "/AWSCloudwatch/data/realAWSCloudwatch/" + f 271 | ) 272 | df = df[["timestamp", "value"]] 273 | key = "realAWSCloudwatch/" + f 274 | label = labels[key] 275 | df["label"] = 0 276 | df.loc[df["timestamp"].isin(label), "label"] = 1 277 | df_label = df["label"] 278 | 279 | dfs[f.split(".")[0]] = (df, df_label) 280 | return dfs 281 | 282 | elif str.lower(ds_name) == "gaia": 283 | if ds_file in DS[ds_name]: 284 | dfs = {} 285 | folder = path + "/GAIA/metric_detection/" + ds_file 286 | for root, dirs, files in os.walk(folder): 287 | for item in files: 288 | df = pd.read_csv(root + "/" + item) 289 | df_label = df["label"] 290 | dfs[item.split(".csv")[0]] = (df, df_label) 291 | return dfs 292 | elif ds_file == 'all': 293 | dfs = {} 294 | for ds_file in DS[ds_name]: 295 | folder = path + "/GAIA/metric_detection/" + ds_file 296 | for root, dirs, files in os.walk(folder): 297 | for item in files: 298 | df = pd.read_csv(root + "/" + item) 299 
| df_label = df["label"] 300 | dfs[item.split(".csv")[0]] = (df, df_label) 301 | return dfs 302 | else: 303 | raise FileNotFoundError 304 | 305 | elif str.lower(ds_name) == "msl": 306 | labels = pd.read_csv(path + "/MSL/labeled_anomalies.csv") 307 | if ds_file in DS[ds_name]: 308 | dfs = {} 309 | folder = path + "/MSL/data/" + ds_file 310 | for root, dirs, files in os.walk(folder): 311 | for item in files: 312 | name = item.replace(".npy", "") 313 | df = pd.DataFrame(np.load(root + "/" + item)) 314 | df.columns = df.columns.astype(str) 315 | anomalies = labels[labels["chan_id"] == name][ 316 | "anomaly_sequences" 317 | ] 318 | df["label"] = 0 319 | if len(anomalies) > 0: 320 | anomalies = ast.literal_eval(anomalies.values[0]) 321 | for seg in anomalies: 322 | seg_begin = seg[0] 323 | seg_end = seg[1] 324 | df.iloc[seg_begin:seg_end] = 1 325 | 326 | dfs[name] = (df, df["label"]) 327 | 328 | return dfs 329 | 330 | else: 331 | raise FileNotFoundError 332 | 333 | elif str.lower(ds_name) == "chm": 334 | dfs = {} 335 | for root, dirs, files in os.walk(path + "/CHM/data"): 336 | for item in files: 337 | df = pd.read_csv(root + "/" + item, index_col=["timestamp"]) 338 | df = df.sort_index() 339 | dfs[item.split(".csv")[0]] = (df, df["label"]) 340 | 341 | return dfs 342 | elif str.lower(ds_name) == "smd": 343 | dfs = {} 344 | for root, dirs, files in os.walk( 345 | path + "/SMD/ServerMachineDataset/test" 346 | ): 347 | for item in files: 348 | df = pd.read_csv(root + "/" + item, header=None) 349 | label = pd.read_csv( 350 | path + "/SMD/ServerMachineDataset/test_label/" + item, 351 | header=None, 352 | names=["label"], 353 | ) 354 | df.columns = df.columns.astype(str) 355 | df["label"] = label 356 | dfs[item.split(".txt")[0]] = (df, df["label"]) 357 | return dfs 358 | 359 | 360 | if __name__ == "__main__": 361 | ds_name = "SMD" 362 | df_file = "" 363 | prepare_ds( 364 | ds_name=ds_name, 365 | path="./benchmark/streamad-benchmark-dataset", 366 | ) 367 | dfs = read_ds( 368 | ds_name=ds_name, 369 | ds_file=df_file, 370 | path="./benchmark/streamad-benchmark-dataset", 371 | ) 372 | 373 | dfs 374 | -------------------------------------------------------------------------------- /benchmark/requirements_benchmark.txt: -------------------------------------------------------------------------------- 1 | kaleido 2 | tabulate -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | api: 23 | sphinx-apidoc -f -E -M -o ./source ../streamad 24 | 25 | doc: 26 | make clean 27 | sphinx-autogen -o generated -t source/_templates/class.rst source/index.md 28 | make html 29 | 30 | 31 | gh-pages: 32 | rm -rf /tmp/gh-pages 33 | cp -r $(BUILDDIR)/html/ /tmp/gh-pages 34 | git checkout gh-pages 35 | cd .. && rm -rf * && cp -r /tmp/gh-pages/* ./ && rm -rf /tmp/gh-pages && git add . && git commit -m "Updated gh-pages" && git push && git checkout main -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx_copybutton 2 | jupytext 3 | sphinxcontrib.bibtex 4 | sphinx_autodoc_typehints 5 | sphinxcontrib.apidoc 6 | sphinx-book-theme 7 | sphinx_design 8 | sphinx-togglebutton 9 | sphinx == 4.2 10 | recommonmark 11 | myst_nb 12 | plotly 13 | rrcf 14 | tdigest 15 | setuptools==58.2.0 -------------------------------------------------------------------------------- /docs/source/_templates/class.rst: -------------------------------------------------------------------------------- 1 | {{ fullname }} 2 | {{ underline }} 3 | .. currentmodule:: {{ module }} 4 | .. autoclass:: {{ objname }} 5 | {% block methods %} 6 | {% if methods %} 7 | .. rubric:: Methods 8 | .. autosummary:: 9 | {% for item in methods %} 10 | ~{{ name }}.{{ item }} 11 | {%- endfor %} 12 | {% endif %} 13 | {% endblock %} 14 | {% block attributes %} 15 | {% if attributes %} 16 | .. rubric:: Attributes 17 | .. autosummary:: 18 | {% for item in attributes %} 19 | ~{{ name }}.{{ item }} 20 | {%- endfor %} 21 | {% endif %} 22 | {% endblock %} -------------------------------------------------------------------------------- /docs/source/api/streamad.evaluate.md: -------------------------------------------------------------------------------- 1 | # StreamAD Evaluation 2 | 3 | 4 | 5 | ## Point aware metrics 6 | ```{eval-rst} 7 | .. autoclass:: streamad.evaluate.PointAwareMetircs 8 | :show-inheritance: 9 | :members: parse 10 | ``` 11 | 12 | ## Time-series aware metrics 13 | ```{eval-rst} 14 | .. autoclass:: streamad.evaluate.SeriesAwareMetircs 15 | :show-inheritance: 16 | :members: parse 17 | ``` 18 | 19 | ## Numenta aware metrics 20 | ```{eval-rst} 21 | .. 
autoclass:: streamad.evaluate.NumentaAwareMetircs 22 | :show-inheritance: 23 | :members: parse 24 | ``` 25 | -------------------------------------------------------------------------------- /docs/source/api/streamad.md: -------------------------------------------------------------------------------- 1 | 2 | # API Reference 3 | 4 | 5 | 6 | ```{toctree} 7 | :maxdepth: 2 8 | 9 | streamad.model 10 | streamad.util 11 | streamad.evaluate 12 | streamad.process 13 | ``` 14 | -------------------------------------------------------------------------------- /docs/source/api/streamad.model.md: -------------------------------------------------------------------------------- 1 | 2 | # StreamAD Detector 3 | 4 | 5 | ## Univariate Anomaly Detector 6 | 7 | If you want to detect multivariate time series with these models, you need to apply them to each feature separately. 8 | ### KNNDetector 9 | 10 | ```{eval-rst} 11 | .. autoclass:: streamad.model.KNNDetector 12 | :show-inheritance: 13 | :members: parse 14 | ``` 15 | 16 | ---- 17 | 18 | ### SpotDetector 19 | 20 | ```{eval-rst} 21 | .. autoclass:: streamad.model.SpotDetector 22 | :show-inheritance: 23 | :members: parse 24 | ``` 25 | 26 | 27 | ---- 28 | 29 | 30 | ### RrcfDetector 31 | 32 | ```{eval-rst} 33 | .. autoclass:: streamad.model.RrcfDetector 34 | :show-inheritance: 35 | :members: parse 36 | ``` 37 | 38 | ---- 39 | 40 | 41 | ### SRDetector 42 | 43 | ```{eval-rst} 44 | .. autoclass:: streamad.model.SRDetector 45 | :show-inheritance: 46 | :members: parse 47 | ``` 48 | 49 | ---- 50 | 51 | 52 | ### ZScoreDetector 53 | 54 | ```{eval-rst} 55 | .. autoclass:: streamad.model.ZScoreDetector 56 | :show-inheritance: 57 | :members: parse 58 | ``` 59 | 60 | ---- 61 | 62 | 63 | ## Multivariate Anomaly Detector 64 | 65 | These models are also compatible with univariate time series. 66 | 67 | ### xStreamDetector 68 | 69 | ```{eval-rst} 70 | .. autoclass:: streamad.model.xStreamDetector 71 | :show-inheritance: 72 | :members: parse 73 | ``` 74 | 75 | ---- 76 | 77 | ### RShashDetector 78 | 79 | ```{eval-rst} 80 | .. autoclass:: streamad.model.RShashDetector 81 | :show-inheritance: 82 | :members: parse 83 | ``` 84 | 85 | ---- 86 | 87 | ### HSTreeDetector 88 | 89 | ```{eval-rst} 90 | .. autoclass:: streamad.model.HSTreeDetector 91 | :show-inheritance: 92 | :members: parse 93 | ``` 94 | 95 | ---- 96 | 97 | ### LodaDetector 98 | 99 | ```{eval-rst} 100 | .. autoclass:: streamad.model.LodaDetector 101 | :show-inheritance: 102 | :members: parse 103 | ``` 104 | 105 | ---- 106 | 107 | ### RandomDetector 108 | 109 | ```{eval-rst} 110 | .. autoclass:: streamad.model.RandomDetector 111 | :show-inheritance: 112 | :members: parse 113 | ``` 114 | -------------------------------------------------------------------------------- /docs/source/api/streamad.process.md: -------------------------------------------------------------------------------- 1 | # StreamAD Process 2 | 3 | 4 | ## Post process 5 | 6 | 7 | 8 | ### ZScoreCalibrator 9 | ```{eval-rst} 10 | .. autoclass:: streamad.process.ZScoreCalibrator 11 | :show-inheritance: 12 | :members: parse 13 | ``` 14 | 15 | --- 16 | ### TDigestCalibrator 17 | ```{eval-rst} 18 | ..
autoclass:: streamad.process.TDigestCalibrator 19 | :show-inheritance: 20 | :members: parse 21 | ``` -------------------------------------------------------------------------------- /docs/source/api/streamad.util.md: -------------------------------------------------------------------------------- 1 | # StreamAD Utilities 2 | 3 | 4 | ## Dataset 5 | 6 | 7 | 8 | ### UnivariateDS 9 | ```{eval-rst} 10 | .. autoclass:: streamad.util.UnivariateDS 11 | :show-inheritance: 12 | :members: parse 13 | ``` 14 | 15 | ---- 16 | 17 | ### MultivariateDS 18 | ```{eval-rst} 19 | .. autoclass:: streamad.util.MultivariateDS 20 | :show-inheritance: 21 | :members: parse 22 | ``` 23 | 24 | ---- 25 | 26 | ### CustomDS 27 | ```{eval-rst} 28 | .. autoclass:: streamad.util.CustomDS 29 | :show-inheritance: 30 | :members: parse 31 | ``` 32 | 33 | ---- 34 | 35 | ## Generator 36 | 37 | ### StreamGenerator 38 | ```{eval-rst} 39 | .. autoclass:: streamad.util.StreamGenerator 40 | :show-inheritance: 41 | :members: parse 42 | ``` 43 | 44 | ---- 45 | 46 | ## Math toolkit 47 | 48 | ### Statistic 49 | ```{eval-rst} 50 | .. autoclass:: streamad.util.StreamStatistic 51 | :show-inheritance: 52 | :members: parse 53 | ``` 54 | 55 | 56 | 57 | ---- 58 | 59 | ## Visualization 60 | 61 | ### Plot 62 | ```{eval-rst} 63 | .. autoclass:: streamad.util.plot 64 | :show-inheritance: 65 | :members: parse 66 | ``` 67 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | from os.path import dirname, abspath 16 | 17 | 18 | sys.path.insert(0, os.path.abspath("../..")) 19 | StreamAD_dir = dirname(dirname(dirname(abspath(__file__)))) 20 | version_path = os.path.join(StreamAD_dir, "streamad", "version.py") 21 | exec(open(version_path).read()) 22 | 23 | # -- Project information ----------------------------------------------------- 24 | 25 | project = "StreamAD" 26 | copyright = "2023, Fengrui-Liu" 27 | author = "Fengrui-Liu" 28 | 29 | # The full version, including alpha/beta/rc tags 30 | version = __version__ 31 | release = __version__ 32 | 33 | 34 | # -- General configuration --------------------------------------------------- 35 | 36 | # Add any Sphinx extension module names here, as strings. They can be 37 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 38 | # ones. 
39 | extensions = [ 40 | "sphinx.ext.mathjax", 41 | "sphinx_copybutton", 42 | "sphinx.ext.autosummary", 43 | "sphinx.ext.viewcode", 44 | "sphinx.ext.napoleon", 45 | "sphinx.ext.todo", 46 | "sphinx.ext.coverage", 47 | "sphinx.ext.doctest", 48 | "sphinx_autodoc_typehints", 49 | "sphinxcontrib.bibtex", 50 | "sphinx.ext.autodoc", 51 | "sphinx.ext.autosectionlabel", 52 | "sphinx.ext.githubpages", 53 | "sphinx.ext.intersphinx", 54 | "sphinx.ext.ifconfig", 55 | # "sphinxcontrib.apidoc", 56 | # "myst_parser", 57 | "myst_nb", 58 | "sphinx_design", 59 | "sphinx.ext.autosectionlabel", 60 | ] 61 | 62 | 63 | source_suffix = [".rst", ".md", ".ipynb"] 64 | 65 | myst_enable_extensions = [ 66 | "amsmath", 67 | "colon_fence", 68 | "deflist", 69 | "dollarmath", 70 | "html_image", 71 | ] 72 | myst_url_schemes = ("http", "https", "mailto") 73 | myst_footnote_transition = False 74 | autosectionlabel_prefix_document = True 75 | nb_execution_mode = "off" 76 | suppress_warnings = ["mystnb.unknown_mime_type"] 77 | nb_execution_show_tb = "READTHEDOCS" in os.environ 78 | html_js_files = [ 79 | "https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js" 80 | ] 81 | # -- nbsphinx settings ------------------------------------------------------- 82 | # nbsphinx_execute = "auto" 83 | 84 | # Create symlinks for example notebooks 85 | import glob 86 | 87 | nb_files = [ 88 | os.path.basename(f) 89 | for f in glob.glob(os.path.join("example", "*.ipynb")) 90 | if not os.path.basename(f).startswith("temp_") 91 | ] 92 | for nb_file in nb_files: 93 | target = os.path.join("../../example", nb_file) 94 | if os.path.exists(target): 95 | os.remove(target) 96 | os.symlink(os.path.join("../docs/source/example", nb_file), target) 97 | 98 | 99 | # -- Bibliography ------------------------------------------------------------ 100 | bibtex_bibfiles = ["refs.bib"] 101 | bibtex_default_style = "unsrt" 102 | bibtex_reference_style = "author_year" 103 | 104 | # apidoc settings 105 | apidoc_module_dir = "../../streamad" 106 | apidoc_output_dir = "api" 107 | apidoc_excluded_paths = ["**/*test*"] 108 | apidoc_module_first = True 109 | apidoc_separate_modules = True 110 | apidoc_extra_args = ["-d 6"] 111 | 112 | # mock imports 113 | # autodoc_mock_imports = ["pandas", "numpy", "scipy"] 114 | 115 | # Napoleon settings 116 | napoleon_google_docstring = True 117 | napoleon_numpy_docstring = False 118 | napoleon_include_init_with_doc = True 119 | napoleon_include_private_with_doc = False 120 | napoleon_include_special_with_doc = True 121 | napoleon_use_admonition_for_examples = False 122 | napoleon_use_admonition_for_notes = False 123 | napoleon_use_admonition_for_references = False 124 | napoleon_use_ivar = False 125 | napoleon_use_param = True 126 | napoleon_use_rtype = False 127 | 128 | # nbsphinx_execute_arguments = [ 129 | # "--InlineBackend.figure_formats={'svg', 'pdf'}", 130 | # "--InlineBackend.rc={'figure.dpi': 96}", 131 | # ] 132 | # nbsphinx_input_prompt = "In [%s]:" 133 | # nbsphinx_output_prompt = "Out[%s]:" 134 | master_doc = "index" 135 | pygments_style = "sphinx" 136 | 137 | # Add any paths that contain templates here, relative to this directory. 138 | templates_path = ["_templates"] 139 | 140 | # The language for content autogenerated by Sphinx. Refer to documentation 141 | # for a list of supported languages. 142 | # 143 | # This is also used if you do content translation via gettext catalogs. 144 | # Usually you set "language" from the command line for these cases. 
145 | language = "en" 146 | 147 | # List of patterns, relative to source directory, that match files and 148 | # directories to ignore when looking for source files. 149 | # This pattern also affects html_static_path and html_extra_path. 150 | exclude_patterns = ["../build"] 151 | 152 | 153 | # -- Options for HTML output ------------------------------------------------- 154 | 155 | # The theme to use for HTML and HTML Help pages. See the documentation for 156 | # a list of builtin themes. 157 | # 158 | html_theme = "sphinx_book_theme" 159 | 160 | 161 | html_theme_options = { 162 | "use_repository_button": True, 163 | "repository_url": "https://github.com/Fengrui-Liu/StreamAD", 164 | } 165 | # Add any paths that contain custom static files (such as style sheets) here, 166 | # relative to this directory. They are copied after the builtin static files, 167 | # so a file named "default.css" will overwrite the builtin "default.css". 168 | # html_static_path = ["_static"] 169 | 170 | 171 | highlight_language = "none" 172 | 173 | # Prefix document path to section labels, otherwise autogenerated labels would look like 'heading' 174 | # rather than 'path/to/file:heading' 175 | autosectionlabel_prefix_document = True 176 | 177 | autodoc_default_options = { 178 | "members": True, 179 | "inherited-members": True, 180 | } 181 | autodoc_typehints = "none" 182 | 183 | numpydoc_show_class_members = False 184 | autosummary_generate = True 185 | autosummary_imported_members = True 186 | 187 | 188 | html_logo = "images/logo_htmlwithname.svg" 189 | html_favicon = "images/logo_html.svg" 190 | 191 | 192 | # -- myst-parser configuration ----------------------------------------------- 193 | # See https://myst-parser.readthedocs.io/en/stable/syntax/optional.html for 194 | # details of available extensions. 
195 | myst_enable_extensions = [ 196 | "dollarmath", 197 | "amsmath", 198 | "colon_fence", 199 | "smartquotes", 200 | "tasklist", 201 | "html_image", 202 | ] 203 | 204 | # Create heading anchors for h1 to h3 (useful for local toc's) 205 | myst_heading_anchors = 3 206 | 207 | 208 | def remove_module_docstring(app, what, name, obj, options, lines): 209 | if what == "module" and name == "streamad": 210 | del lines[:] 211 | 212 | 213 | def setup(app): 214 | app.connect("autodoc-process-docstring", remove_module_docstring) 215 | -------------------------------------------------------------------------------- /docs/source/example/example.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | 4 | 5 | ```{toctree} 6 | 7 | dataset_usage 8 | ``` 9 | 10 | ---- 11 | 12 | 13 | ```{toctree} 14 | 15 | univariate 16 | ``` 17 | 18 | ---- 19 | 20 | ```{toctree} 21 | multivariate 22 | ``` 23 | 24 | ---- 25 | 26 | 27 | ```{toctree} 28 | 29 | calibrator_usage 30 | ``` 31 | 32 | ---- 33 | 34 | ```{toctree} 35 | 36 | ensemble_usage 37 | ``` -------------------------------------------------------------------------------- /docs/source/images/logo_html.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 切片 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | STREAM 14 | 15 | 16 | -------------------------------------------------------------------------------- /docs/source/images/logo_htmlwithname.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 切片 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 16 | 17 | STREAM 18 | 19 | 20 | -------------------------------------------------------------------------------- /docs/source/images/logo_index.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | StreamingAD 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | Stream 14 | AD 15 | 16 | 17 | -------------------------------------------------------------------------------- /docs/source/index.md: -------------------------------------------------------------------------------- 1 | 2 | # StreamAD 3 | 4 | ```{toctree} 5 | :caption: Overview 6 | :maxdepth: 2 7 | 8 | overview 9 | example/example 10 | ``` 11 | 12 | 13 | 14 | ```{toctree} 15 | :caption: Benchmark 16 | :hidden: 17 | :titlesonly: 18 | :maxdepth: 1 19 | 20 | benchmark 21 | ``` 22 | 23 | 24 | 25 | 26 | 27 | ```{toctree} 28 | :caption: Reference 29 | :hidden: 30 | :titlesonly: 31 | :maxdepth: 1 32 | 33 | api/streamad 34 | references 35 | ``` -------------------------------------------------------------------------------- /docs/source/overview.md: -------------------------------------------------------------------------------- 1 | ```{include} ../../README.md 2 | :relative-images: 3 | ``` 4 | -------------------------------------------------------------------------------- /docs/source/references.md: -------------------------------------------------------------------------------- 1 | # Paper Reference 2 | 3 | 4 | 5 | 6 | ```{bibliography} refs.bib 7 | :labelprefix: md 8 | ``` -------------------------------------------------------------------------------- /docs/source/refs.bib: -------------------------------------------------------------------------------- 1 | @article{DBLP:journals/corr/BurnaevI16, 2 | author = {Evgeny Burnaev and 3 | Vladislav Ishimtsev}, 4 | title = {Conformalized density- and distance-based anomaly detection in time-series 5 | data}, 6 | journal = {CoRR}, 7 | volume = {abs/1608.04585}, 8 | 
year = {2016}, 9 | url = {http://arxiv.org/abs/1608.04585}, 10 | archivePrefix = {arXiv}, 11 | eprint = {1608.04585}, 12 | timestamp = {Mon, 13 Aug 2018 16:47:12 +0200}, 13 | biburl = {https://dblp.org/rec/journals/corr/BurnaevI16.bib}, 14 | bibsource = {dblp computer science bibliography, https://dblp.org} 15 | } 16 | 17 | 18 | @inproceedings{DBLP:conf/kdd/ManzoorLA18, 19 | author = {Emaad A. Manzoor and 20 | Hemank Lamba and 21 | Leman Akoglu}, 22 | editor = {Yike Guo and 23 | Faisal Farooq}, 24 | title = {xStream: Outlier Detection in Feature-Evolving Data Streams}, 25 | booktitle = {Proceedings of the 24th {ACM} {SIGKDD} International Conference on 26 | Knowledge Discovery {\&} Data Mining, {KDD} 2018, London, UK, 27 | August 19-23, 2018}, 28 | pages = {1963--1972}, 29 | publisher = {{ACM}}, 30 | year = {2018}, 31 | url = {https://doi.org/10.1145/3219819.3220107}, 32 | doi = {10.1145/3219819.3220107}, 33 | timestamp = {Fri, 19 Jun 2020 12:43:05 +0200}, 34 | biburl = {https://dblp.org/rec/conf/kdd/ManzoorLA18.bib}, 35 | bibsource = {dblp computer science bibliography, https://dblp.org} 36 | } 37 | 38 | 39 | @inproceedings{DBLP:conf/kdd/SifferFTL17, 40 | author = {Alban Siffer and 41 | Pierre{-}Alain Fouque and 42 | Alexandre Termier and 43 | Christine Largou{\"{e}}t}, 44 | title = {Anomaly Detection in Streams with Extreme Value Theory}, 45 | booktitle = {Proceedings of the 23rd {ACM} {SIGKDD} International Conference on 46 | Knowledge Discovery and Data Mining, Halifax, NS, Canada, August 13 47 | - 17, 2017}, 48 | pages = {1067--1075}, 49 | publisher = {{ACM}}, 50 | year = {2017}, 51 | url = {https://doi.org/10.1145/3097983.3098144}, 52 | doi = {10.1145/3097983.3098144}, 53 | timestamp = {Fri, 25 Dec 2020 01:14:16 +0100}, 54 | biburl = {https://dblp.org/rec/conf/kdd/SifferFTL17.bib}, 55 | bibsource = {dblp computer science bibliography, https://dblp.org} 56 | } 57 | 58 | @inproceedings{DBLP:conf/icml/GuhaMRS16, 59 | author = {Sudipto Guha and 60 | Nina Mishra and 61 | Gourav Roy and 62 | Okke Schrijvers}, 63 | editor = {Maria{-}Florina Balcan and 64 | Kilian Q. 
Weinberger}, 65 | title = {Robust Random Cut Forest Based Anomaly Detection on Streams}, 66 | booktitle = {Proceedings of the 33nd International Conference on Machine Learning, 67 | {ICML} 2016, New York City, NY, USA, June 19-24, 2016}, 68 | series = {{JMLR} Workshop and Conference Proceedings}, 69 | volume = {48}, 70 | pages = {2712--2721}, 71 | publisher = {JMLR.org}, 72 | year = {2016}, 73 | url = {http://proceedings.mlr.press/v48/guha16.html}, 74 | timestamp = {Wed, 29 May 2019 08:41:46 +0200}, 75 | biburl = {https://dblp.org/rec/conf/icml/GuhaMRS16.bib}, 76 | bibsource = {dblp computer science bibliography, https://dblp.org} 77 | } 78 | 79 | 80 | @inproceedings{DBLP:conf/kdd/RenXWYHKXYTZ19, 81 | author = {Hansheng Ren and 82 | Bixiong Xu and 83 | Yujing Wang and 84 | Chao Yi and 85 | Congrui Huang and 86 | Xiaoyu Kou and 87 | Tony Xing and 88 | Mao Yang and 89 | Jie Tong and 90 | Qi Zhang}, 91 | editor = {Ankur Teredesai and 92 | Vipin Kumar and 93 | Ying Li and 94 | R{\'{o}}mer Rosales and 95 | Evimaria Terzi and 96 | George Karypis}, 97 | title = {Time-Series Anomaly Detection Service at Microsoft}, 98 | booktitle = {Proceedings of the 25th {ACM} {SIGKDD} International Conference on 99 | Knowledge Discovery {\&} Data Mining, {KDD} 2019, Anchorage, AK, 100 | USA, August 4-8, 2019}, 101 | pages = {3009--3017}, 102 | publisher = {{ACM}}, 103 | year = {2019}, 104 | url = {https://doi.org/10.1145/3292500.3330680}, 105 | doi = {10.1145/3292500.3330680}, 106 | timestamp = {Thu, 28 Apr 2022 17:42:59 +0200}, 107 | biburl = {https://dblp.org/rec/conf/kdd/RenXWYHKXYTZ19.bib}, 108 | bibsource = {dblp computer science bibliography, https://dblp.org} 109 | } 110 | 111 | 112 | @inproceedings{DBLP:conf/icdm/SatheA16, 113 | author = {Saket Sathe and 114 | Charu C. Aggarwal}, 115 | editor = {Francesco Bonchi and 116 | Josep Domingo{-}Ferrer and 117 | Ricardo Baeza{-}Yates and 118 | Zhi{-}Hua Zhou and 119 | Xindong Wu}, 120 | title = {Subspace Outlier Detection in Linear Time with Randomized Hashing}, 121 | booktitle = {{IEEE} 16th International Conference on Data Mining, {ICDM} 2016, 122 | December 12-15, 2016, Barcelona, Spain}, 123 | pages = {459--468}, 124 | publisher = {{IEEE} Computer Society}, 125 | year = {2016}, 126 | url = {https://doi.org/10.1109/ICDM.2016.0057}, 127 | doi = {10.1109/ICDM.2016.0057}, 128 | timestamp = {Wed, 17 Mar 2021 09:50:14 +0100}, 129 | biburl = {https://dblp.org/rec/conf/icdm/SatheA16.bib}, 130 | bibsource = {dblp computer science bibliography, https://dblp.org} 131 | } 132 | 133 | @inproceedings{DBLP:conf/ijcai/TanTL11, 134 | author = {Swee Chuan Tan and 135 | Kai Ming Ting and 136 | Fei Tony Liu}, 137 | editor = {Toby Walsh}, 138 | title = {Fast Anomaly Detection for Streaming Data}, 139 | booktitle = {{IJCAI} 2011, Proceedings of the 22nd International Joint Conference 140 | on Artificial Intelligence, Barcelona, Catalonia, Spain, July 16-22, 141 | 2011}, 142 | pages = {1511--1516}, 143 | publisher = {{IJCAI/AAAI}}, 144 | year = {2011}, 145 | url = {https://doi.org/10.5591/978-1-57735-516-8/IJCAI11-254}, 146 | doi = {10.5591/978-1-57735-516-8/IJCAI11-254}, 147 | timestamp = {Tue, 20 Aug 2019 16:16:04 +0200}, 148 | biburl = {https://dblp.org/rec/conf/ijcai/TanTL11.bib}, 149 | bibsource = {dblp computer science bibliography, https://dblp.org} 150 | } 151 | 152 | @article{DBLP:journals/simpa/Dunning21, 153 | author = {Ted Dunning}, 154 | title = {The t-digest: Efficient estimates of distributions}, 155 | journal = {Softw. 
Impacts}, 156 | volume = {7}, 157 | pages = {100049}, 158 | year = {2021}, 159 | url = {https://doi.org/10.1016/j.simpa.2020.100049}, 160 | doi = {10.1016/j.simpa.2020.100049}, 161 | timestamp = {Wed, 05 May 2021 14:43:00 +0200}, 162 | biburl = {https://dblp.org/rec/journals/simpa/Dunning21.bib}, 163 | bibsource = {dblp computer science bibliography, https://dblp.org} 164 | } 165 | 166 | 167 | @article{DBLP:journals/ml/Pevny16, 168 | author = {Tom{\'{a}}s Pevn{\'{y}}}, 169 | title = {Loda: Lightweight on-line detector of anomalies}, 170 | journal = {Mach. Learn.}, 171 | volume = {102}, 172 | number = {2}, 173 | pages = {275--304}, 174 | year = {2016}, 175 | url = {https://doi.org/10.1007/s10994-015-5521-0}, 176 | doi = {10.1007/s10994-015-5521-0}, 177 | timestamp = {Sun, 25 Jul 2021 11:37:58 +0200}, 178 | biburl = {https://dblp.org/rec/journals/ml/Pevny16.bib}, 179 | bibsource = {dblp computer science bibliography, https://dblp.org} 180 | } 181 | 182 | @article{DBLP:journals/ijon/AhmadLPA17, 183 | author = {Subutai Ahmad and 184 | Alexander Lavin and 185 | Scott Purdy and 186 | Zuha Agha}, 187 | title = {Unsupervised real-time anomaly detection for streaming data}, 188 | journal = {Neurocomputing}, 189 | volume = {262}, 190 | pages = {134--147}, 191 | year = {2017}, 192 | url = {https://doi.org/10.1016/j.neucom.2017.04.070}, 193 | doi = {10.1016/j.neucom.2017.04.070}, 194 | timestamp = {Fri, 31 Jan 2020 14:18:54 +0100}, 195 | biburl = {https://dblp.org/rec/journals/ijon/AhmadLPA17.bib}, 196 | bibsource = {dblp computer science bibliography, https://dblp.org} 197 | } 198 | 199 | @misc{ enwiki:1086685336, 200 | author = "{Wikipedia contributors}", 201 | title = "Standard score --- {Wikipedia}{,} The Free Encyclopedia", 202 | year = "2022", 203 | url = "https://en.wikipedia.org/w/index.php?title=Standard_score&oldid=1086685336", 204 | note = "[Online; accessed 19-June-2022]" 205 | } 206 | 207 | 208 | @misc{ enwiki:1089762876, 209 | author = "{Wikipedia contributors}", 210 | title = "Precision and recall --- {Wikipedia}{,} The Free Encyclopedia", 211 | year = "2022", 212 | url = "https://en.wikipedia.org/w/index.php?title=Precision_and_recall&oldid=1089762876", 213 | note = "[Online; accessed 19-June-2022]" 214 | } 215 | 216 | @inproceedings{DBLP:conf/nips/TatbulLZAG18, 217 | author = {Nesime Tatbul and 218 | Tae Jun Lee and 219 | Stan Zdonik and 220 | Mejbah Alam and 221 | Justin Gottschlich}, 222 | editor = {Samy Bengio and 223 | Hanna M. 
Wallach and 224 | Hugo Larochelle and 225 | Kristen Grauman and 226 | Nicol{\`{o}} Cesa{-}Bianchi and 227 | Roman Garnett}, 228 | title = {Precision and Recall for Time Series}, 229 | booktitle = {Advances in Neural Information Processing Systems 31: Annual Conference 230 | on Neural Information Processing Systems 2018, NeurIPS 2018, December 231 | 3-8, 2018, Montr{\'{e}}al, Canada}, 232 | pages = {1924--1934}, 233 | year = {2018}, 234 | url = {https://proceedings.neurips.cc/paper/2018/hash/8f468c873a32bb0619eaeb2050ba45d1-Abstract.html}, 235 | timestamp = {Mon, 16 May 2022 15:41:51 +0200}, 236 | biburl = {https://dblp.org/rec/conf/nips/TatbulLZAG18.bib}, 237 | bibsource = {dblp computer science bibliography, https://dblp.org} 238 | } 239 | 240 | @Misc{InfluxDB:MAD, 241 | howpublished = {Website}, 242 | note = {\url{https://www.influxdata.com/blog/anomaly-detection-with-median-absolute-deviation/#:~:text=How%20Median%20Absolute%20Deviation%20algorithm,time%20series%20at%20that%20timestamp/} Accessed 7 July 2020}, 243 | title = {Anomaly Detection with Median Absolute Deviation}, 244 | author = {Anais Dotis-Georgiou} 245 | } 246 | 247 | 248 | @misc{ enwiki:1098733917, 249 | author = "{Wikipedia contributors}", 250 | title = "One-class classification --- {Wikipedia}{,} The Free Encyclopedia", 251 | year = "2022", 252 | url = "https://en.wikipedia.org/w/index.php?title=One-class_classification&oldid=1098733917", 253 | note = "[Online; accessed 24-July-2022]" 254 | } 255 | 256 | 257 | 258 | @article{assimakopoulos2000theta, 259 | title={The theta model: a decomposition approach to forecasting}, 260 | author={Assimakopoulos, Vassilis and Nikolopoulos, Konstantinos}, 261 | journal={International journal of forecasting}, 262 | volume={16}, 263 | number={4}, 264 | pages={521--530}, 265 | year={2000}, 266 | publisher={Elsevier} 267 | } 268 | 269 | @book{durbin2012time, 270 | title={Time series analysis by state space methods}, 271 | author={Durbin, James and Koopman, Siem Jan}, 272 | volume={38}, 273 | year={2012}, 274 | publisher={OUP Oxford} 275 | } -------------------------------------------------------------------------------- /example/README.md: -------------------------------------------------------------------------------- 1 | This folder contains the symlink of all examples from docs/source/examples/ 2 | 3 | If it is fail to open in some certain OS, please check the origin file or the online docs. 
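For a quick, self-contained sanity check without opening a notebook, the loop below is a minimal sketch of the typical streaming usage. It follows the quickstart pattern from the online docs; the helper names (`UnivariateDS`, `StreamGenerator`, `ds.data`, `iter_item()`) are taken from `streamad.util` as documented there and are assumptions here, so adjust them if your installed version differs.

```python
# Minimal streaming sketch (assumes the documented StreamAD quickstart API).
from streamad.util import StreamGenerator, UnivariateDS
from streamad.model import SpotDetector

ds = UnivariateDS()                # built-in univariate sample dataset
stream = StreamGenerator(ds.data)  # yields one observation at a time
model = SpotDetector()

scores = []
for x in stream.iter_item():
    # fit_score() returns None while the initial window is still filling
    scores.append(model.fit_score(x))
```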
-------------------------------------------------------------------------------- /example/dataset_usage.ipynb: -------------------------------------------------------------------------------- 1 | ../docs/source/example/dataset_usage.ipynb -------------------------------------------------------------------------------- /example/multivariate.ipynb: -------------------------------------------------------------------------------- 1 | ../docs/source/example/multivariate.ipynb -------------------------------------------------------------------------------- /example/thresholder_usage.ipynb: -------------------------------------------------------------------------------- 1 | ../docs/source/example/thresholder_usage.ipynb -------------------------------------------------------------------------------- /example/univariate.ipynb: -------------------------------------------------------------------------------- 1 | ../docs/source/example/univariate.ipynb -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "streamad" 3 | version = "0.3.1" 4 | description = "An anomaly detection package for data streams." 5 | authors = ["Fengrui-Liu "] 6 | license = "Apache Software License" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.8" 11 | numpy = "^1.22" 12 | pandas = "^1.3.0" 13 | scikit-learn = "^1.0.0" 14 | mmh3 = "^3.0.0" 15 | rrcf = "^0.4.4" 16 | plotly = "^5.14.1" 17 | tdigest = "^0.5.2.2" 18 | statsmodels = "0.13.5" 19 | fast-histogram = "^0.11" 20 | pytest = "^7.3.1" 21 | pytest-cov = "^4.0.0" 22 | scipy = "^1.3.1" 23 | 24 | 25 | [build-system] 26 | requires = ["poetry-core"] 27 | build-backend = "poetry.core.masonry.api" 28 | -------------------------------------------------------------------------------- /streamad/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # 4 | # Author: liufr 5 | # Github: https://github.com/Fengrui-Liu 6 | # LastEditTime: 2021-01-05 21:30:25 7 | # Copyright 2021 liufr 8 | # Description: 9 | # 10 | 11 | from . import base 12 | from . import model 13 | from . import util 14 | from .version import __version__ 15 | 16 | __all__ = ["__version__", "base", "model", "util"] 17 | -------------------------------------------------------------------------------- /streamad/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .detector import BaseDetector 2 | from .metrics import BaseMetrics 3 | 4 | 5 | __all__ = ["BaseDetector", "BaseMetrics"] 6 | -------------------------------------------------------------------------------- /streamad/base/detector.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import numpy as np 4 | from collections import deque 5 | 6 | 7 | class BaseDetector(ABC): 8 | """Abstract class for Detector, supporting for customize detector.""" 9 | 10 | def __init__( 11 | self, 12 | window_len: int = 50, 13 | detrend: bool = False, 14 | detrend_len: int = 10, 15 | data_type: str = "multivariate", 16 | score_first: bool = False, 17 | ): 18 | """Initialize the attributes of the BaseDetector class 19 | 20 | 21 | Args: 22 | window_len (int, optional): Length of window for observations. Defaults to 50. 
23 | detrend (bool, optional): Data is detrended by subtracting the mean. Defaults to False. 24 | detrend_len (int, optional): Length of data for reference to detrend. Defaults to 10. 25 | data_type (str, optional): Multi/Univariate data type. Defaults to "multivariate". 26 | """ 27 | 28 | self.data_type = data_type 29 | self.index = -1 30 | self.detrend = detrend 31 | self.window_len = window_len 32 | self.detrend_len = detrend_len 33 | self.window = deque(maxlen=self.window_len) 34 | self.detrend_window = deque(maxlen=self.detrend_len) 35 | self.score_first = score_first 36 | 37 | def _check(self, X) -> bool: 38 | """Check whether the detector can handle the data.""" 39 | x_shape = X.shape[0] 40 | 41 | if self.data_type == "univariate": 42 | assert x_shape == 1, "The data is not univariate." 43 | elif self.data_type == "multivariate": 44 | assert x_shape >= 1, "The data is not univariate or multivariate." 45 | 46 | if np.isnan(X).any(): 47 | return False 48 | self.index += 1 49 | return True 50 | 51 | def _detrend(self, X: np.ndarray) -> np.ndarray: 52 | """Detrend the data by subtracting the mean. 53 | 54 | Args: 55 | X (np.ndarray): Data of current observation. 56 | 57 | Returns: 58 | np.ndarray: Detrended data. 59 | """ 60 | 61 | self.detrend_window.append(X) 62 | 63 | return X - np.mean(self.detrend_window, axis=0) 64 | 65 | @abstractmethod 66 | def fit(self, X: np.ndarray, timestamp: int = None): 67 | raise NotImplementedError 68 | 69 | @abstractmethod 70 | def score(self, X: np.ndarray, timestamp: int = None) -> float: 71 | raise NotImplementedError 72 | 73 | def fit_score(self, X: np.ndarray, timestamp: int = None) -> float: 74 | """Fit one observation and calculate its anomaly score. 75 | 76 | Args: 77 | X (np.ndarray): Data of current observation. 78 | 79 | Returns: 80 | float: Anomaly score. A high score indicates a high degree of anomaly. 81 | """ 82 | 83 | check_flag = self._check(X) 84 | if not check_flag: 85 | return None 86 | X = self._detrend(X) if self.detrend else X 87 | 88 | if self.index < self.window_len: 89 | self.fit(X, timestamp) 90 | return None 91 | 92 | if self.score_first: 93 | score = self.score(X, timestamp) 94 | self.fit(X, timestamp) 95 | else: 96 | score = self.fit(X, timestamp).score(X, timestamp) 97 | 98 | return float(abs(score)) 99 | -------------------------------------------------------------------------------- /streamad/base/metrics.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import numpy as np 3 | 4 | 5 | class BaseMetrics(ABC): 6 | """ 7 | Abstract class for evaluation metrics, supporting customized evaluation.
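Subclasses override ``evaluate``; calling ``super().evaluate(y_true, y_pred)`` first replaces ``None`` scores (emitted while a detector's window is still filling) with 0 and casts both arrays to integers before storing them on the instance.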
8 | """ 9 | 10 | def __init__(self) -> None: 11 | super().__init__() 12 | self.y_pred = None 13 | self.y_true = None 14 | 15 | @abstractmethod 16 | def evaluate(self, y_true: np.ndarray, y_pred: np.ndarray): 17 | y_pred = np.array(y_pred) 18 | y_pred[y_pred == None] = 0 19 | self.y_true = y_true.astype(int) 20 | self.y_pred = y_pred.astype(int) 21 | return 22 | -------------------------------------------------------------------------------- /streamad/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .point_aware_metrics import PointAwareMetircs 2 | from .series_aware_metrics import SeriesAwareMetircs 3 | from .numenta_aware_metrics import NumentaAwareMetircs 4 | 5 | __all__ = [ 6 | "PointAwareMetircs", 7 | "SeriesAwareMetircs", 8 | "NumentaAwareMetircs", 9 | ] 10 | -------------------------------------------------------------------------------- /streamad/evaluate/numenta_aware_metrics.py: -------------------------------------------------------------------------------- 1 | from streamad.base import BaseMetrics 2 | from streamad.evaluate.ts_metrics import TSMetric 3 | import numpy as np 4 | 5 | 6 | class NumentaAwareMetircs(BaseMetrics): 7 | def __init__(self, anomaly_threshold: float = 0.8, beta: float = 1.0): 8 | """Numenta metrics calculation methods. :cite:`DBLP:journals/ijon/AhmadLPA17`. 9 | 10 | Args: 11 | anomaly_threshold (float, optional): A threshold to determine the anomalies; it converts the anomaly scores to binary (0/1) indicators. Defaults to 0.8. 12 | beta (float, optional): F-beta score, like a F1-score. Defaults to 1.0. 13 | """ 14 | super().__init__() 15 | self.threshold = anomaly_threshold 16 | self.beta = beta 17 | self.precision = None 18 | self.recall = None 19 | self.Fbeta = None 20 | 21 | def evaluate(self, y_true: np.ndarray, y_pred: np.ndarray) -> tuple: 22 | super().evaluate(y_true, y_pred) 23 | 24 | select = self.y_pred > self.threshold 25 | self.y_pred[select] = 1 26 | self.y_pred[~select] = 0 27 | 28 | metric = TSMetric( 29 | metric_option="numenta", 30 | beta=self.beta, 31 | alpha_r=0.0, 32 | cardinality="one", 33 | bias_p="flat", 34 | bias_r="flat", 35 | ) 36 | self.precision, self.recall, self.Fbeta = metric.score( 37 | self.y_true, self.y_pred 38 | ) 39 | 40 | return self.precision, self.recall, self.Fbeta 41 | -------------------------------------------------------------------------------- /streamad/evaluate/point_aware_metrics.py: -------------------------------------------------------------------------------- 1 | from streamad.base import BaseMetrics 2 | from streamad.evaluate.ts_metrics import TSMetric 3 | import numpy as np 4 | 5 | 6 | class PointAwareMetircs(BaseMetrics): 7 | def __init__(self, anomaly_threshold: float = 0.8, beta: float = 1.0): 8 | """Classic point-wise metrics :cite:`enwiki:1089762876`. 9 | 10 | Args: 11 | anomaly_threshold (float, optional): A threshold to determine the anomalies; it converts the anomaly scores to binary (0/1) indicators. Defaults to 0.8. 12 | beta (float, optional): F-beta score, like a F1-score. Defaults to 1.0.
13 | """ 14 | super().__init__() 15 | self.threshold = anomaly_threshold 16 | self.beta = beta 17 | self.precision = None 18 | self.recall = None 19 | self.Fbeta = None 20 | 21 | def evaluate(self, y_true: np.ndarray, y_pred: np.ndarray) -> tuple: 22 | super().evaluate(y_true, y_pred) 23 | 24 | select = self.y_pred > self.threshold 25 | self.y_pred[select] = 1 26 | self.y_pred[~select] = 0 27 | 28 | metric = TSMetric( 29 | metric_option="classic", 30 | beta=self.beta, 31 | alpha_r=0.0, 32 | cardinality="one", 33 | bias_p="flat", 34 | bias_r="flat", 35 | ) 36 | self.precision, self.recall, self.Fbeta = metric.score( 37 | self.y_true, self.y_pred 38 | ) 39 | 40 | return self.precision, self.recall, self.Fbeta 41 | -------------------------------------------------------------------------------- /streamad/evaluate/series_aware_metrics.py: -------------------------------------------------------------------------------- 1 | from streamad.base import BaseMetrics 2 | from streamad.evaluate.ts_metrics import TSMetric 3 | import numpy as np 4 | 5 | 6 | class SeriesAwareMetircs(BaseMetrics): 7 | def __init__( 8 | self, 9 | anomaly_threshold: float = 0.8, 10 | beta: float = 1.0, 11 | bias_p: str = "flat", 12 | bias_r: str = "flat", 13 | ): 14 | """Time series aware metrics :cite:`DBLP:conf/nips/TatbulLZAG18` 15 | 16 | Args: 17 | anomaly_threshold (float, optional): A threshold to determine the anomalies, it can covert the anomaly scores to binary (0/1) indicators. Defaults to 0.8. 18 | beta (float, optional): F-beta score, like a F1-score. Defaults to 1.0. 19 | bias_p (str, optional): Bias for precision. Optionals are "flat", "front", "middle", "back". Defaults to "flat". 20 | bias_r (str, optional): Bias for recall. Optionals are "flat", "front", "middle", "back". Defaults to "flat". 21 | """ 22 | super().__init__() 23 | self.threshold = anomaly_threshold 24 | self.beta = beta 25 | self.bias_p = bias_p 26 | self.bias_r = bias_r 27 | self.precision = None 28 | self.recall = None 29 | self.Fbeta = None 30 | 31 | def evaluate(self, y_true: np.ndarray, y_pred: np.ndarray) -> tuple: 32 | super().evaluate(y_true, y_pred) 33 | 34 | select = self.y_pred > self.threshold 35 | self.y_pred[select] = 1 36 | self.y_pred[~select] = 0 37 | 38 | metric = TSMetric( 39 | metric_option="time-series", 40 | beta=self.beta, 41 | alpha_r=0.0, 42 | cardinality="reciprocal", 43 | bias_p=self.bias_p, 44 | bias_r=self.bias_r, 45 | ) 46 | self.precision, self.recall, self.Fbeta = metric.score( 47 | self.y_true, self.y_pred 48 | ) 49 | 50 | return self.precision, self.recall, self.Fbeta 51 | -------------------------------------------------------------------------------- /streamad/evaluate/ts_metrics.py: -------------------------------------------------------------------------------- 1 | # This script is from https://github.com/KurochkinAlexey/Time-series-precision-recall, Thanks! 
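# Summary of the scoring model implemented below (added for orientation; it restates
# the code, it is not part of the upstream script). Following Tatbul et al.,
# "Precision and Recall for Time Series" (NeurIPS 2018), each anomaly range R earns
#     reward(R) = alpha * existence_reward
#                 + (1 - alpha) * gamma(#overlaps) * sum of positionally biased overlap fractions
# where alpha is alpha_p (fixed to 0) for precision and alpha_r for recall, the
# positional bias is one of "flat" / "front" / "middle" / "back", and gamma penalises
# a range matched by many fragments. The "classic" option treats every labelled point
# as its own range (ordinary point-wise precision/recall), while the "numenta" option
# keeps predictions as points and real anomalies as windows.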
2 | 3 | import numpy as np 4 | 5 | 6 | class TSMetric: 7 | def __init__( 8 | self, 9 | metric_option="classic", 10 | beta=1.0, 11 | alpha_r=0.0, 12 | cardinality="one", 13 | bias_p="flat", 14 | bias_r="flat", 15 | ): 16 | 17 | assert (alpha_r >= 0) & (alpha_r <= 1) 18 | assert metric_option in ["classic", "time-series", "numenta"] 19 | assert beta > 0 20 | assert cardinality in ["one", "reciprocal", "udf_gamma"] 21 | assert bias_p in ["flat", "front", "middle", "back"] 22 | assert bias_r in ["flat", "front", "middle", "back"] 23 | 24 | self.metric_option = metric_option 25 | self.beta = beta 26 | self.alpha_r = alpha_r 27 | self.alpha_p = 0 28 | self.cardinality = cardinality 29 | self.bias_p = bias_p 30 | self.bias_r = bias_r 31 | 32 | def _udf_gamma(self, overlap, task_type): 33 | """ 34 | user defined gamma 35 | """ 36 | return 1.0 37 | 38 | def _gamma_select(self, gamma, overlap, task_type): 39 | if gamma == "one": 40 | return 1.0 41 | elif gamma == "reciprocal": 42 | if overlap > 1: 43 | return 1.0 / overlap 44 | else: 45 | return 1.0 46 | elif gamma == "udf_gamma_def": 47 | if overlap > 1: 48 | return 1.0 / self._udf_gamma(overlap, task_type) 49 | else: 50 | return 1.0 51 | 52 | def _gamma_function(self, overlap_count, task_type): 53 | overlap = overlap_count[0] 54 | if task_type == 0: 55 | return self._gamma_select(self.cardinality, overlap, task_type) 56 | elif task_type == 1: 57 | return self._gamma_select(self.cardinality, overlap, task_type) 58 | else: 59 | raise Exception("invalid argument in gamma function") 60 | 61 | def _compute_omega_reward(self, r1, r2, overlap_count, task_type): 62 | if r1[1] < r2[0] or r1[0] > r2[1]: 63 | return 0 64 | else: 65 | overlap_count[0] += 1 66 | overlap = np.zeros(r1.shape) 67 | overlap[0] = max(r1[0], r2[0]) 68 | overlap[1] = min(r1[1], r2[1]) 69 | return self._omega_function(r1, overlap, task_type) 70 | 71 | def _omega_function(self, rrange, overlap, task_type): 72 | anomaly_length = rrange[1] - rrange[0] + 1 73 | my_positional_bias = 0 74 | max_positional_bias = 0 75 | temp_bias = 0 76 | for i in range(1, anomaly_length + 1): 77 | temp_bias = self._delta_function(i, anomaly_length, task_type) 78 | max_positional_bias += temp_bias 79 | j = rrange[0] + i - 1 80 | if j >= overlap[0] and j <= overlap[1]: 81 | my_positional_bias += temp_bias 82 | if max_positional_bias > 0: 83 | res = my_positional_bias / max_positional_bias 84 | return res 85 | else: 86 | return 0 87 | 88 | def _delta_function(self, t, anomaly_length, task_type): 89 | if task_type == 0: 90 | return self._delta_select(self.bias_p, t, anomaly_length, task_type) 91 | elif task_type == 1: 92 | return self._delta_select(self.bias_r, t, anomaly_length, task_type) 93 | else: 94 | raise Exception("Invalid task type in delta function") 95 | 96 | def _delta_select(self, delta, t, anomaly_length, task_type): 97 | if delta == "flat": 98 | return 1.0 99 | elif delta == "front": 100 | return float(anomaly_length - t + 1.0) 101 | elif delta == "middle": 102 | if t <= anomaly_length / 2.0: 103 | return float(t) 104 | else: 105 | return float(anomaly_length - t + 1.0) 106 | elif delta == "back": 107 | return float(t) 108 | elif delta == "udf_delta": 109 | return self._udf_delta(t, anomaly_length, task_type) 110 | else: 111 | raise Exception("Invalid positional bias value") 112 | 113 | def _udf_delta(self, t, anomaly_length, task_type): 114 | """ 115 | user defined delta function 116 | """ 117 | return 1.0 118 | 119 | def _update_precision(self, real_anomalies, predicted_anomalies): 120 | 
precision = 0 121 | if len(predicted_anomalies) == 0: 122 | return 0 123 | for i in range(len(predicted_anomalies)): 124 | range_p = predicted_anomalies[i, :] 125 | omega_reward = 0 126 | overlap_count = [0] 127 | for j in range(len(real_anomalies)): 128 | range_r = real_anomalies[j, :] 129 | omega_reward += self._compute_omega_reward( 130 | range_p, range_r, overlap_count, 0 131 | ) 132 | overlap_reward = ( 133 | self._gamma_function(overlap_count, 0) * omega_reward 134 | ) 135 | if overlap_count[0] > 0: 136 | existence_reward = 1 137 | else: 138 | existence_reward = 0 139 | 140 | precision += ( 141 | self.alpha_p * existence_reward 142 | + (1 - self.alpha_p) * overlap_reward 143 | ) 144 | precision /= len(predicted_anomalies) 145 | return precision 146 | 147 | def _update_recall(self, real_anomalies, predicted_anomalies): 148 | recall = 0 149 | if len(real_anomalies) == 0: 150 | return 0 151 | for i in range(len(real_anomalies)): 152 | omega_reward = 0 153 | overlap_count = [0] 154 | range_r = real_anomalies[i, :] 155 | for j in range(len(predicted_anomalies)): 156 | range_p = predicted_anomalies[j, :] 157 | omega_reward += self._compute_omega_reward( 158 | range_r, range_p, overlap_count, 1 159 | ) 160 | overlap_reward = ( 161 | self._gamma_function(overlap_count, 1) * omega_reward 162 | ) 163 | 164 | if overlap_count[0] > 0: 165 | existence_reward = 1 166 | else: 167 | existence_reward = 0 168 | 169 | recall += ( 170 | self.alpha_r * existence_reward 171 | + (1 - self.alpha_r) * overlap_reward 172 | ) 173 | recall /= len(real_anomalies) 174 | return recall 175 | 176 | def _shift(self, arr, num, fill_value=np.nan): 177 | arr = np.roll(arr, num) 178 | if num < 0: 179 | arr[num:] = fill_value 180 | elif num > 0: 181 | arr[:num] = fill_value 182 | return arr 183 | 184 | def _prepare_data(self, values_real, values_pred): 185 | 186 | assert len(values_real) == len(values_pred) 187 | 188 | if self.metric_option == "classic": 189 | real_anomalies = np.argwhere(values_real == 1).repeat(2, axis=1) 190 | predicted_anomalies = np.argwhere(values_pred == 1).repeat( 191 | 2, axis=1 192 | ) 193 | 194 | elif self.metric_option == "time-series": 195 | predicted_anomalies_ = np.argwhere(values_pred == 1).ravel() 196 | predicted_anomalies_shift_forward = self._shift( 197 | predicted_anomalies_, 1, fill_value=predicted_anomalies_[0] 198 | ) 199 | predicted_anomalies_shift_backward = self._shift( 200 | predicted_anomalies_, -1, fill_value=predicted_anomalies_[-1] 201 | ) 202 | predicted_anomalies_start = np.argwhere( 203 | (predicted_anomalies_shift_forward - predicted_anomalies_) != -1 204 | ).ravel() 205 | predicted_anomalies_finish = np.argwhere( 206 | (predicted_anomalies_ - predicted_anomalies_shift_backward) 207 | != -1 208 | ).ravel() 209 | predicted_anomalies = np.hstack( 210 | [ 211 | predicted_anomalies_[predicted_anomalies_start].reshape( 212 | -1, 1 213 | ), 214 | predicted_anomalies_[predicted_anomalies_finish].reshape( 215 | -1, 1 216 | ), 217 | ] 218 | ) 219 | 220 | real_anomalies_ = np.argwhere(values_real == 1).ravel() 221 | real_anomalies_shift_forward = self._shift( 222 | real_anomalies_, 223 | 1, 224 | fill_value=real_anomalies_[0] if len(real_anomalies_) else 0, 225 | ) 226 | real_anomalies_shift_backward = self._shift( 227 | real_anomalies_, 228 | -1, 229 | fill_value=real_anomalies_[-1] if len(real_anomalies_) else 0, 230 | ) 231 | real_anomalies_start = np.argwhere( 232 | (real_anomalies_shift_forward - real_anomalies_) != -1 233 | ).ravel() 234 | real_anomalies_finish = 
np.argwhere( 235 | (real_anomalies_ - real_anomalies_shift_backward) != -1 236 | ).ravel() 237 | real_anomalies = np.hstack( 238 | [ 239 | real_anomalies_[real_anomalies_start].reshape(-1, 1), 240 | real_anomalies_[real_anomalies_finish].reshape(-1, 1), 241 | ] 242 | ) 243 | 244 | elif self.metric_option == "numenta": 245 | predicted_anomalies = np.argwhere(values_pred == 1).repeat( 246 | 2, axis=1 247 | ) 248 | real_anomalies_ = np.argwhere(values_real == 1).ravel() 249 | real_anomalies_shift_forward = self._shift( 250 | real_anomalies_, 251 | 1, 252 | fill_value=real_anomalies_[0] if len(real_anomalies_) else 0, 253 | ) 254 | real_anomalies_shift_backward = self._shift( 255 | real_anomalies_, 256 | -1, 257 | fill_value=real_anomalies_[-1] if len(real_anomalies_) else 0, 258 | ) 259 | real_anomalies_start = np.argwhere( 260 | (real_anomalies_shift_forward - real_anomalies_) != -1 261 | ).ravel() 262 | real_anomalies_finish = np.argwhere( 263 | (real_anomalies_ - real_anomalies_shift_backward) != -1 264 | ).ravel() 265 | real_anomalies = np.hstack( 266 | [ 267 | real_anomalies_[real_anomalies_start].reshape(-1, 1), 268 | real_anomalies_[real_anomalies_finish].reshape(-1, 1), 269 | ] 270 | ) 271 | return real_anomalies, predicted_anomalies 272 | 273 | def score(self, values_real, values_predicted): 274 | assert isinstance(values_real, np.ndarray) 275 | assert isinstance(values_predicted, np.ndarray) 276 | 277 | if not values_predicted.any(): 278 | if not values_real.any(): 279 | return 1.0, 1.0, 1.0 280 | else: 281 | return 0.0, 0.0, 0.0 282 | 283 | real_anomalies, predicted_anomalies = self._prepare_data( 284 | values_real, values_predicted 285 | ) 286 | precision = self._update_precision(real_anomalies, predicted_anomalies) 287 | recall = self._update_recall(real_anomalies, predicted_anomalies) 288 | if precision + recall != 0: 289 | Fbeta = ( 290 | (1 + self.beta**2) 291 | * precision 292 | * recall 293 | / (self.beta**2 * precision + recall) 294 | ) 295 | else: 296 | Fbeta = 0 297 | 298 | return precision, recall, Fbeta 299 | -------------------------------------------------------------------------------- /streamad/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "streamad" %} 2 | {% set version = "0.3.0" %} 3 | 4 | package: 5 | name: {{ name|lower }} 6 | version: {{ version }} 7 | 8 | source: 9 | url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/streamad-{{ version }}.tar.gz 10 | sha256: b775e2ca53f296e7a6d3c117b7becc263146b01e57e7326296564931c0b4ef9c 11 | 12 | build: 13 | noarch: python 14 | script: {{ PYTHON }} -m pip install . -vv 15 | number: 0 16 | 17 | requirements: 18 | host: 19 | - pip 20 | - python >= 3.7 21 | - setuptools ==58.2.0 22 | run: 23 | - mmh3 >=3.0.0 24 | - numpy >=1.19 25 | - pandas >=1.3.0 26 | - plotly 27 | - python >=3.7 28 | - scikit-learn >=1.0.0 29 | - scipy >=1.7.0 30 | 31 | test: 32 | imports: 33 | - streamad 34 | commands: 35 | - pip check 36 | requires: 37 | - pip 38 | 39 | about: 40 | home: https://github.com/Fengrui-Liu/StreamAD 41 | summary: An anomaly detection package for data streams. 
42 | license: Apache-2.0 43 | license_file: LICENSE 44 | 45 | extra: 46 | recipe-maintainers: 47 | - Fengrui-Liu 48 | -------------------------------------------------------------------------------- /streamad/model/KNN_Detector.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | from copy import deepcopy 3 | 4 | import numpy as np 5 | from scipy.spatial.distance import cdist 6 | from streamad.base import BaseDetector 7 | 8 | 9 | class KNNDetector(BaseDetector): 10 | def __init__(self, k_neighbor: int = 5, **kwargs): 11 | """Univariate KNN-CAD model with Mahalanobis distance :cite:`DBLP:journals/corr/BurnaevI16`. 12 | 13 | Args: 14 | k_neighbor (int, optional): The number of nearest neighbors over which distances are accumulated. Defaults to 5. 15 | """ 16 | super().__init__(data_type="univariate", **kwargs) 17 | self.window = deque(maxlen=int(np.sqrt(self.window_len))) 18 | self.buffer = deque(maxlen=self.window_len - self.window.maxlen) 19 | 20 | assert ( 21 | k_neighbor < self.buffer.maxlen 22 | ), "k_neighbor must be less than the length of buffer" 23 | 24 | self.k = k_neighbor 25 | 26 | def fit(self, X: np.ndarray, timestamp: int = None): 27 | 28 | self.window.append(X[0]) 29 | 30 | if len(self.window) == self.window.maxlen: 31 | self.buffer.append(deepcopy(self.window)) 32 | 33 | return self 34 | 35 | def score(self, X: np.ndarray, timestamp: int = None) -> float: 36 | 37 | window = deepcopy(self.window) 38 | window.pop() 39 | window.append(X[0]) 40 | 41 | try: 42 | dist = cdist(np.array([window]), self.buffer, metric="mahalanobis")[ 43 | 0 44 | ] 45 | except Exception: # fall back to a pseudo-inverse when the covariance estimate is singular 46 | dist = cdist( 47 | np.array([window]), 48 | self.buffer, 49 | metric="mahalanobis", 50 | VI=np.linalg.pinv(self.buffer), 51 | )[0] 52 | score = np.sum(np.partition(np.array(dist), self.k + 1)[1 : self.k + 1]) 53 | 54 | return float(score) 55 | -------------------------------------------------------------------------------- /streamad/model/Mad_Dectector.py: -------------------------------------------------------------------------------- 1 | from streamad.base import BaseDetector 2 | import numpy as np 3 | from collections import deque 4 | 5 | 6 | class MadDetector(BaseDetector): 7 | def __init__(self, **kwargs): 8 | """Median Absolute Deviation Detector :cite:`InfluxDB:MAD`. 9 | 10 | Args: 11 | window_len (int, optional): Length of sliding window. Defaults to 50. 12 | threshold (float, optional): Threshold to decide an anomalous observation. Defaults to 3.0. 13 | 14 | Attributes: 15 | scale_factor : Ratio of the standard deviation to the median absolute deviation under a normal distribution (about 1.4826).
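The resulting anomaly score of the latest observation ``x`` is ``|x - median(window)| / (scale_factor * median(|window - median(window)|))``, i.e. the deviation from the window median expressed in robust, MAD-based standard deviations.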
16 | 17 | """ 18 | super().__init__(data_type="univariate", **kwargs) 19 | self.scale_factor = 1.4826 20 | 21 | def fit(self, X: np.ndarray, timestamp: int = None): 22 | self.window.append(X[0]) 23 | 24 | return self 25 | 26 | def score(self, X: np.ndarray, timestamp: int = None): 27 | ori_median = np.median(self.window) 28 | abs_diff = np.abs(self.window - ori_median) 29 | mad = self.scale_factor * np.median(abs_diff) 30 | score = np.divide( 31 | abs_diff[-1], mad, out=np.array(abs_diff[-1] / 1e-5), where=mad != 0 32 | ) 33 | 34 | return score 35 | -------------------------------------------------------------------------------- /streamad/model/OCSVM_Detector.py: -------------------------------------------------------------------------------- 1 | from sklearn.svm import OneClassSVM 2 | import numpy as np 3 | from streamad.base.detector import BaseDetector 4 | from collections import deque 5 | from typing import Literal 6 | 7 | 8 | class OCSVMDetector(BaseDetector): 9 | def __init__( 10 | self, 11 | nu: float = 0.5, 12 | kernel: Literal[ 13 | "linear", "poly", "rbf", "sigmoid", "precomputed" 14 | ] = "rbf", 15 | **kwargs 16 | ): 17 | """One-Class SVM Detector :cite:`enwiki:1098733917`. 18 | 19 | Args: 20 | nu (float, optional): An upper bound on the fraction of training errors and a lower bound of the fraction of support vectors. Defaults to 0.5. 21 | kernel (str, optional): The kernel type to be used in the algorithm. Defaults to "rbf". 22 | """ 23 | super().__init__(data_type="multivariate", **kwargs) 24 | self.nu = nu 25 | self.kernel = kernel 26 | self.model = None 27 | 28 | def fit(self, X: np.ndarray, timestamp: int = None): 29 | 30 | self.window.append(X) 31 | if self.index >= self.window_len: 32 | self.model = OneClassSVM( 33 | gamma="scale", nu=self.nu, kernel=self.kernel 34 | ) 35 | self.model.fit(list(self.window)) 36 | 37 | return self 38 | 39 | def score(self, X: np.ndarray, timestamp: int = None): 40 | 41 | score = self.model.decision_function(X.reshape(1, -1)) 42 | return abs(score) 43 | -------------------------------------------------------------------------------- /streamad/model/SArima_Detector.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import warnings 3 | 4 | import numpy as np 5 | import statsmodels.api as sm 6 | from streamad.base.detector import BaseDetector 7 | 8 | warnings.filterwarnings("ignore") 9 | 10 | 11 | class SArimaDetector(BaseDetector): 12 | def __init__(self, **kwargs): 13 | """Auto Regressive Integrated Moving Averages Detector :cite:`durbin2012time` 14 | 15 | Args: 16 | window_len (int, optional): Length of sliding window. Defaults to 50. 
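Note: once the first window is full, the (p, d, q) and seasonal (P, D, Q, 12) orders are selected by an AIC grid search over {0, 1}; subsequent observations are appended to the fitted model and scored against its one-step-ahead forecast confidence interval.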
17 | """ 18 | super().__init__(data_type="univariate", **kwargs) 19 | self.best_result = None 20 | self.best_model = None 21 | self.best_order = None 22 | self.best_seasonal_order = None 23 | 24 | def _init_fit(self): 25 | best_aic = float("inf") 26 | p = d = q = range(0, 2) 27 | pdq = list(itertools.product(p, d, q)) 28 | seasonal_pdq = [ 29 | (x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q)) 30 | ] 31 | for param in pdq: 32 | for param_seasonal in seasonal_pdq: 33 | model = sm.tsa.statespace.SARIMAX( 34 | list(self.window), 35 | order=param, 36 | seasonal_order=param_seasonal, 37 | enforce_stationarity=False, 38 | enforce_invertibility=False, 39 | ) 40 | result = model.fit(disp=0) 41 | aic = result.aic 42 | if aic < best_aic: 43 | self.best_model = model 44 | best_aic = aic 45 | self.best_order = param 46 | self.best_seasonal_order = param_seasonal 47 | 48 | self.best_result = self.best_model.fit(disp=0) 49 | 50 | def fit(self, X: np.ndarray, timestamp: int = None): 51 | self.window.append(X[0]) 52 | if self.index == self.window_len: 53 | self._init_fit() 54 | 55 | if self.index > self.window_len: 56 | self.best_result = self.best_result.append(X) 57 | 58 | return self 59 | 60 | def score(self, X: np.ndarray, timestamp: int = None): 61 | pred_uc = self.best_result.get_forecast(steps=1) 62 | 63 | pred_ci = pred_uc.conf_int() 64 | pred_mid = (pred_ci[0, 0] + pred_ci[0, 1]) / 2 65 | pred_range = pred_ci[0, 1] - pred_ci[0, 0] 66 | 67 | if pred_ci[0, 0] > X: 68 | score = abs((X - pred_mid) / pred_range) 69 | return score 70 | elif X > pred_ci[0, 1]: 71 | score = abs((X - pred_mid) / pred_range) 72 | return score 73 | else: 74 | score = 0 75 | return float(score) 76 | -------------------------------------------------------------------------------- /streamad/model/SR_Detector.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from streamad.base import BaseDetector 3 | from collections import deque 4 | from copy import deepcopy 5 | 6 | EPS = 1e-8 7 | 8 | 9 | class SRDetector(BaseDetector): 10 | def __init__( 11 | self, 12 | extend_len: int = 5, 13 | ahead_len: int = 10, 14 | mag_num: int = 5, 15 | **kwargs 16 | ): 17 | """Spectral Residual Detector :cite:`DBLP:conf/kdd/RenXWYHKXYTZ19`. 18 | 19 | Args: 20 | window_len (int, optional): Length of sliding window. Defaults to 50. 21 | extend_len (int, optional): Length to be extended, for FFT transforme. Defaults to 5. 22 | ahead_len (int, optional): Length to look ahead for references. Defaults to 10. 23 | mag_num (int, optional): Number of FFT magnitude. Defaults to 5. 
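Note: scoring extends the current window with ``extend_len`` copies of an estimated next value, applies the FFT-based spectral residual transform, and returns the saliency score of the last real observation.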
24 | """ 25 | super().__init__(data_type="univariate", **kwargs) 26 | self.extend_len = extend_len 27 | assert ahead_len > 1, "ahead_len must be greater than 1" 28 | self.ahead_len = ahead_len 29 | self.mag_num = mag_num 30 | 31 | def fit(self, X: np.ndarray, timestamp: int = None): 32 | self.window.append(X[0]) 33 | 34 | return self 35 | 36 | def score(self, X: np.ndarray, timestamp: int = None) -> float: 37 | window = deepcopy(self.window) 38 | 39 | window.pop() 40 | window.append(X[0]) 41 | 42 | extended_window = self._extend_window(window) 43 | 44 | mags = self._sr_transform(extended_window) 45 | anomaly_scores = self._spectral_score(mags) 46 | 47 | return anomaly_scores[-1 - self.extend_len] 48 | 49 | def _spectral_score(self, mags): 50 | avg_mag = self._average_filter(mags, n=self.mag_num * 10) 51 | safeDivisors = np.clip(avg_mag, EPS, avg_mag.max()) 52 | 53 | raw_scores = np.divide( 54 | np.abs(mags - avg_mag), 55 | safeDivisors, 56 | out=np.zeros_like(mags), 57 | where=safeDivisors != 0, 58 | ) 59 | scores = np.clip(raw_scores / 10.0, 0, 1.0) 60 | 61 | return scores 62 | 63 | def _sr_transform(self, window): 64 | trans = np.fft.fft(window) 65 | mag = np.sqrt(trans.real**2 + trans.imag**2) 66 | eps_index = np.where(mag <= EPS)[0] 67 | mag[eps_index] = EPS 68 | 69 | mag_log = np.log(mag) 70 | mag_log[eps_index] = 0 71 | 72 | spectral = np.exp( 73 | mag_log - self._average_filter(mag_log, n=self.mag_num) 74 | ) 75 | 76 | trans.real = trans.real * spectral / mag 77 | trans.imag = trans.imag * spectral / mag 78 | 79 | trans.real[eps_index] = 0 80 | trans.imag[eps_index] = 0 81 | 82 | wave_r = np.fft.ifft(trans) 83 | 84 | mag = np.sqrt(wave_r.real**2 + wave_r.imag**2)  # magnitude (saliency) of the inverse FFT 85 | 86 | return mag 87 | 88 | def _average_filter(self, values, n=3): 89 | if n >= len(values): 90 | n = len(values) 91 | 92 | res = np.cumsum(values, dtype=float) 93 | res[n:] = res[n:] - res[:-n] 94 | res[n:] = res[n:] / n 95 | 96 | for i in range(1, n): 97 | res[i] /= i + 1 98 | 99 | return res 100 | 101 | def _extend_window(self, window): 102 | predicted_window = [ 103 | self._predict_next(list(window)[-self.ahead_len : -1]) 104 | ] * self.extend_len 105 | 106 | extended_window = np.concatenate((window, predicted_window), axis=0) 107 | 108 | return extended_window 109 | 110 | def _predict_next(self, ahead_window): 111 | assert ( 112 | len(ahead_window) > 1 113 | ), "ahead window must have at least 2 elements" 114 | 115 | ele_last = ahead_window[-1] 116 | n = len(ahead_window) 117 | 118 | slopes = [ 119 | (ele_last - ele) / (n - 1 - i) 120 | for i, ele in enumerate(ahead_window[:-1]) 121 | ] 122 | 123 | return ahead_window[1] + sum(slopes) 124 | -------------------------------------------------------------------------------- /streamad/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .KNN_Detector import KNNDetector 2 | from .xStream_Detector import xStreamDetector 3 | from .spot_Detector import SpotDetector 4 | from .rshash_Detector import RShashDetector 5 | from .random_Detector import RandomDetector 6 | from .SR_Detector import SRDetector 7 | from .rrcf_Detector import RrcfDetector 8 | from .hstree_Detector import HSTreeDetector 9 | from .zscore_Detector import ZScoreDetector 10 | from .loda_Detector import LodaDetector 11 | from .OCSVM_Detector import OCSVMDetector 12 | from .Mad_Dectector import MadDetector 13 | from .SArima_Detector import SArimaDetector 14 | from .zspot_Detector import ZSpotDetector 15 | 16 | __all__ = [ 17 | "KNNDetector", 18 |
"xStreamDetector", 19 | "SpotDetector", 20 | "RandomDetector", 21 | "RShashDetector", 22 | "SRDetector", 23 | "RrcfDetector", 24 | "HSTreeDetector", 25 | "ZScoreDetector", 26 | "LodaDetector", 27 | "OCSVMDetector", 28 | "MadDetector", 29 | "SArimaDetector", 30 | "ZSpotDetector" 31 | ] 32 | -------------------------------------------------------------------------------- /streamad/model/hstree_Detector.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from streamad.base import BaseDetector 3 | from streamad.util import StreamStatistic 4 | 5 | 6 | class Leaf: 7 | def __init__( 8 | self, 9 | left=None, 10 | right=None, 11 | depth=0, 12 | ): 13 | self.left = left 14 | self.right = right 15 | self.r = 0 16 | self.l = 0 17 | self.split_attrib = 0 18 | self.split_value = 0.0 19 | self.k = depth 20 | 21 | 22 | class HSTreeDetector(BaseDetector): 23 | def __init__(self, tree_height: int = 10, tree_num: int = 20, **kwargs): 24 | """Half space tree detectors. :cite:`DBLP:conf/ijcai/TanTL11`. 25 | 26 | Args: 27 | tree_height (int, optional): Height of a half space tree. Defaults to 10. 28 | tree_num (int, optional): Totla number of the trees. Defaults to 20. 29 | """ 30 | super().__init__(data_type="multivariate", **kwargs) 31 | self.tree_height = tree_height 32 | self.tree_num = tree_num 33 | self.forest = [] 34 | self.data_stats = StreamStatistic() 35 | 36 | self.dimensions = None 37 | 38 | def _generate_max_min(self): 39 | max_arr = np.zeros(self.dimensions) 40 | min_arr = np.zeros(self.dimensions) 41 | for q in range(self.dimensions): 42 | s_q = np.random.random_sample() 43 | max_value = max(s_q, 1 - s_q) 44 | max_arr[q] = s_q + max_value 45 | min_arr[q] = s_q - max_value 46 | 47 | return max_arr, min_arr 48 | 49 | def _init_a_tree(self, max_arr, min_arr, k): 50 | if k == self.tree_height: 51 | return Leaf(depth=k) 52 | 53 | leaf = Leaf() 54 | q = np.random.randint(self.dimensions) 55 | p = (max_arr[q] + min_arr[q]) / 2.0 56 | temp = max_arr[q] 57 | max_arr[q] = p 58 | leaf.left = self._init_a_tree(max_arr, min_arr, k + 1) 59 | max_arr[q] = temp 60 | min_arr[q] = p 61 | leaf.right = self._init_a_tree(max_arr, min_arr, k + 1) 62 | leaf.split_attrib = q 63 | leaf.split_value = p 64 | leaf.k = k 65 | return leaf 66 | 67 | def _update_tree_mass(self, tree, X, is_ref_window): 68 | if tree: 69 | if tree.k != 0: 70 | if is_ref_window: 71 | tree.r += 1 72 | 73 | tree.l += 1 74 | if X[tree.split_attrib] > tree.split_value: 75 | tree_new = tree.right 76 | else: 77 | tree_new = tree.left 78 | self._update_tree_mass(tree_new, X, is_ref_window) 79 | 80 | def _reset_tree(self, tree): 81 | if tree: 82 | tree.r = tree.l 83 | tree.l = 0 84 | self._reset_tree(tree.left) 85 | self._reset_tree(tree.right) 86 | 87 | def fit(self, X: np.ndarray, timestamp: int = None): 88 | self.data_stats.update(X) 89 | 90 | X_normalized = np.divide( 91 | X - self.data_stats.get_min(), 92 | self.data_stats.get_max() - self.data_stats.get_min(), 93 | out=np.zeros_like(X, dtype=float), 94 | where=self.data_stats.get_max() - self.data_stats.get_min() != 0, 95 | dtype=float, 96 | ) 97 | X_normalized[np.abs(X_normalized) == np.inf] = 0 98 | 99 | if self.dimensions is None: 100 | self.dimensions = len(X) 101 | for _ in range(self.tree_num): 102 | max_arr, min_arr = self._generate_max_min() 103 | tree = self._init_a_tree(max_arr, min_arr, 0) 104 | self.forest.append(tree) 105 | 106 | if self.index < self.window_len: 107 | for tree in self.forest: 108 | self._update_tree_mass(tree, 
X_normalized, True) 109 | else: 110 | if self.index % self.window_len == 0: 111 | for tree in self.forest: 112 | self._reset_tree(tree) 113 | 114 | for tree in self.forest: 115 | self._update_tree_mass(tree, X_normalized, False) 116 | 117 | return self 118 | 119 | def score(self, X: np.ndarray, timestamp: int = None) -> float: 120 | score = 0.0 121 | 122 | X_normalized = np.divide( 123 | X - self.data_stats.get_min(), 124 | self.data_stats.get_max() - self.data_stats.get_min(), 125 | out=np.zeros_like(X, dtype=float), 126 | where=self.data_stats.get_max() - self.data_stats.get_min() != 0, 127 | ) 128 | X_normalized[np.abs(X_normalized) == np.inf] = 0 129 | 130 | for tree in self.forest: 131 | score += self._score_tree(tree, X_normalized, 0) 132 | 133 | score = score / self.tree_num 134 | 135 | return float(score) 136 | 137 | def _score_tree(self, tree, X, k): 138 | s = 0 139 | if not tree: 140 | return s 141 | 142 | s += tree.r * (2**k) 143 | 144 | if X[tree.split_attrib] > tree.split_value: 145 | tree_new = tree.right 146 | else: 147 | tree_new = tree.left 148 | 149 | s += self._score_tree(tree_new, X, k + 1) 150 | 151 | return s 152 | -------------------------------------------------------------------------------- /streamad/model/loda_Detector.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | 3 | import numpy as np 4 | from streamad.base import BaseDetector 5 | from fast_histogram import histogram1d 6 | 7 | 8 | class LodaDetector(BaseDetector): 9 | def __init__(self, random_cuts_num: int = 10, **kwargs): 10 | """Multivariate LODA Detector :cite:`DBLP:journals/ml/Pevny16`. 11 | 12 | Args: 13 | window_len (int, optional): The length of window. Defaults to 50. 14 | random_cuts_num (int, optional): The number of random experiments. Defaults to 10. 
15 | """ 16 | super().__init__(data_type="multivariate", **kwargs) 17 | 18 | self.random_cuts_num = random_cuts_num 19 | self.bins_num = int( 20 | 1 * (self.window_len**1) * (np.log(self.window_len) ** -1) 21 | ) 22 | self._weights = np.ones(random_cuts_num) / random_cuts_num 23 | self.components_num = None 24 | self.nonzero_components_num = None 25 | self.zero_components_num = None 26 | self._projections = None 27 | self._histograms = None 28 | self._limits = None 29 | 30 | def fit(self, X: np.ndarray, timestamp: int = None): 31 | self.window.append(X) 32 | if self.index == 0: 33 | self.components_num = len(X) 34 | self.nonzero_components_num = int(np.sqrt(self.components_num)) 35 | self.zero_components_num = ( 36 | self.components_num - self.nonzero_components_num 37 | ) 38 | 39 | elif len(self.window) == self.window.maxlen: 40 | self._projections = np.random.randn( 41 | self.random_cuts_num, self.components_num 42 | ) 43 | self._histograms = np.zeros([self.random_cuts_num, self.bins_num]) 44 | self._limits = np.zeros([self.random_cuts_num, self.bins_num + 1]) 45 | 46 | for i in range(self.random_cuts_num): 47 | rands = np.random.permutation(self.components_num)[ 48 | : self.zero_components_num 49 | ] 50 | self._projections[i, rands] = 0.0 51 | projected_data = self._projections[i, :].dot( 52 | np.array(self.window).T 53 | ) 54 | 55 | try: 56 | self._histograms[i, :] = ( 57 | histogram1d( 58 | projected_data, 59 | range=( 60 | projected_data.min(), 61 | projected_data.max() + 1e-12, 62 | ), 63 | bins=self.bins_num, 64 | ) 65 | + 1e-12 66 | ) 67 | except: 68 | self._histograms[i, :] = ( 69 | histogram1d( 70 | projected_data, 71 | range=( 72 | projected_data.min(), 73 | projected_data.max() + 1e-5, 74 | ), 75 | bins=self.bins_num, 76 | ) 77 | + 1e-12 78 | ) 79 | self._limits[i, :] = np.linspace( 80 | projected_data.min(), 81 | projected_data.max() + 1e-12, 82 | num=self.bins_num + 1, 83 | ) 84 | 85 | self._histograms[i, :] /= np.sum(self._histograms[i, :]) 86 | 87 | return self 88 | 89 | def score(self, X: np.ndarray, timestamp: int = None): 90 | score = 0 91 | 92 | for i in range(self.random_cuts_num): 93 | projected_data = self._projections[i, :].dot(np.array(X).T) 94 | inds = np.searchsorted( 95 | self._limits[i, : self.bins_num - 1], 96 | projected_data, 97 | side="left", 98 | ) 99 | score += -self._weights[i] * np.log(self._histograms[i, inds]) 100 | 101 | score = score / self.random_cuts_num 102 | return float(score) 103 | 104 | 105 | if __name__ == "__main__": 106 | import cProfile 107 | import resource 108 | 109 | # from line_profiler import LineProfiler 110 | 111 | # lp = LineProfiler() 112 | 113 | model = LodaDetector() 114 | 115 | # lp.add_function(model.fit) 116 | # lp.add_function(model.score) 117 | # lp_wrapper = lp(model.fit_score) 118 | import sys 119 | 120 | for i in range(1500): 121 | # lp_wrapper(np.array([i])) 122 | model.fit_score(np.array([i * 10])) 123 | 124 | r = sys.getsizeof(model) 125 | # r = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss 126 | print(r) 127 | 128 | # lp.print_stats() 129 | -------------------------------------------------------------------------------- /streamad/model/random_Detector.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | from streamad.base import BaseDetector 5 | 6 | 7 | class RandomDetector(BaseDetector): 8 | """Return random anomaly score. 
A minimum score for benchmark.""" 9 | 10 | def __init__(self, **kwargs): 11 | super().__init__(data_type="multivariate", **kwargs) 12 | 13 | def fit(self, X: np.ndarray, timestamp: int = None): 14 | return self 15 | 16 | def score(self, X: np.ndarray, timestamp: int = None): 17 | 18 | return random.random() 19 | -------------------------------------------------------------------------------- /streamad/model/rrcf_Detector.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | 3 | import numpy as np 4 | import rrcf 5 | from streamad.base import BaseDetector 6 | from copy import deepcopy 7 | 8 | 9 | class RrcfDetector(BaseDetector): 10 | def __init__(self, num_trees=10, tree_size=12, **kwargs): 11 | """Rrcf detector :cite:`DBLP:conf/icml/GuhaMRS16`. 12 | 13 | Args: 14 | window_len (int, optional): Length of sliding window. Defaults to 50. 15 | num_trees (int, optional): Number of trees. Defaults to 10. 16 | tree_size (int, optional): Size of each tree. Defaults to 12. 17 | """ 18 | 19 | super().__init__(data_type="multivariate", **kwargs) 20 | self.num_trees = num_trees 21 | self.tree_size = tree_size 22 | self.forest = [] 23 | for _ in range(num_trees): 24 | tree = rrcf.RCTree() 25 | self.forest.append(tree) 26 | self.avg_codisp = {} 27 | 28 | self.shingle = deque(maxlen=int(np.sqrt(self.window_len))) 29 | 30 | def fit(self, X: np.ndarray, timestamp: int = None): 31 | self.shingle.append(X) 32 | 33 | if not self.forest[0].ndim: 34 | dim = X.shape[0] 35 | for tree in self.forest: 36 | tree.ndim = dim 37 | 38 | if self.shingle.maxlen == len(self.shingle): 39 | if self.index > (self.shingle.maxlen + self.tree_size): 40 | list( 41 | map( 42 | lambda x: x.forget_point(self.index - self.tree_size), 43 | self.forest, 44 | ) 45 | ) 46 | 47 | list( 48 | map( 49 | lambda x: x.insert_point(self.shingle, self.index), 50 | self.forest, 51 | ) 52 | ) 53 | 54 | return self 55 | 56 | def score(self, X: np.ndarray, timestamp: int = None): 57 | score_list = list(map(lambda x: x.codisp(self.index), self.forest)) 58 | 59 | score = sum(score_list) / self.num_trees 60 | 61 | return float(score) 62 | 63 | 64 | if __name__ == "__main__": 65 | import cProfile 66 | from line_profiler import LineProfiler 67 | 68 | lp = LineProfiler() 69 | 70 | model = RrcfDetector() 71 | 72 | # lp.add_function(_Chain.fit) 73 | # lp.add_function(_Chain.score) 74 | # lp.add_function(_Chain.bincount) 75 | lp.add_function(model.fit) 76 | lp.add_function(model.score) 77 | lp_wrapper = lp(model.fit_score) 78 | 79 | for i in range(1500): 80 | lp_wrapper(np.array([i])) 81 | # model.fit_score(np.array([i])) 82 | 83 | lp.print_stats() 84 | -------------------------------------------------------------------------------- /streamad/model/rshash_Detector.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from streamad.base import BaseDetector 3 | from streamad.util import StreamStatistic 4 | from collections import deque 5 | 6 | 7 | class RShashDetector(BaseDetector): 8 | def __init__( 9 | self, decay=0.015, components_num=10, hash_num: int = 10, **kwargs 10 | ): 11 | """Multivariate RSHashDetector :cite:`DBLP:conf/icdm/SatheA16`. 12 | 13 | Args: 14 | window_len (int, optional): Length of data to burn in/init. Defaults to 50. 15 | decay (float, optional): Decay ratio. Defaults to 0.015. 16 | components_num (int, optional): Number of components. Defaults to 10. 17 | hash_num (int, optional): Number of hash functions. 
Defaults to 10. 18 | """ 19 | super().__init__(data_type="multivariate", **kwargs) 20 | 21 | self.decay = decay 22 | self.data_stats = StreamStatistic() 23 | 24 | self.hash_num = hash_num 25 | self.components_num = components_num 26 | self.cmsketches = [{} for _ in range(hash_num)] 27 | 28 | self.alpha = None 29 | 30 | self.effective_s = max(1000, 1.0 / (1 - np.power(2, -self.decay))) 31 | self.f = np.random.uniform( 32 | low=1.0 / np.sqrt(self.effective_s), 33 | high=1 - (1.0 / np.sqrt(self.effective_s)), 34 | size=self.components_num, 35 | ) 36 | 37 | def _burn_in(self): 38 | # Normalized the init data 39 | buffer = np.array(self.window) 40 | buffer_normalized = np.divide( 41 | buffer - self.data_stats.get_min(), 42 | self.data_stats.get_max() - self.data_stats.get_min(), 43 | out=np.zeros_like(buffer).astype(float), 44 | where=self.data_stats.get_max() - self.data_stats.get_min() != 0, 45 | ) 46 | buffer_normalized[np.abs(buffer_normalized) == np.inf] = 0 47 | 48 | for r in range(self.components_num): 49 | for i in range(buffer.shape[0]): 50 | Y = np.floor( 51 | (buffer_normalized[i, :] + np.array(self.alpha[r])) 52 | / self.f[r] 53 | ) 54 | 55 | # mod_entry = np.insert(Y, 0, r) 56 | mod_entry = np.concatenate(([r], Y)) 57 | mod_entry = tuple(mod_entry.astype(int)) 58 | 59 | for w in range(self.hash_num): 60 | try: 61 | value = self.cmsketches[w][mod_entry] 62 | except KeyError: 63 | value = (0, 0) 64 | 65 | value = (0, value[1] + 1) 66 | self.cmsketches[w][mod_entry] = value 67 | 68 | def fit(self, X: np.ndarray, timestamp: int = None): 69 | if self.index == 0: 70 | self.alpha = [ 71 | np.random.uniform(low=0, high=self.f[r], size=len(X)) 72 | for r in range(self.components_num) 73 | ] 74 | 75 | self.data_stats.update(X) 76 | 77 | if self.index == self.window.maxlen - 1: 78 | self._burn_in() 79 | 80 | if len(self.window) < self.window.maxlen: 81 | self.window.append(X) 82 | return self 83 | 84 | return self 85 | 86 | def score(self, X: np.ndarray, timestamp: int = None) -> float: 87 | X_normalized = np.divide( 88 | X - self.data_stats.get_min(), 89 | self.data_stats.get_max() - self.data_stats.get_min(), 90 | out=np.zeros_like(X).astype(float), 91 | where=self.data_stats.get_max() - self.data_stats.get_min() != 0, 92 | ) 93 | X_normalized[np.abs(X_normalized) == np.inf] = 0 94 | 95 | score_instance = 0 96 | 97 | for r in range(self.components_num): 98 | Y = np.floor((X_normalized + np.array(self.alpha[r])) / self.f[r]) 99 | # mod_entry = np.insert(Y, 0, r) 100 | mod_entry = np.concatenate(([r], Y)) 101 | mod_entry = tuple(mod_entry.astype(int)) 102 | 103 | c = [] 104 | 105 | for w in range(len(self.cmsketches)): 106 | try: 107 | value = self.cmsketches[w][mod_entry] 108 | except KeyError: 109 | value = (self.index, 0) 110 | 111 | tstamp = value[0] 112 | wt = value[1] 113 | new_wt = wt * np.power(2, -self.decay * (self.index - tstamp)) 114 | c.append(new_wt) 115 | new_tstamp = self.index 116 | self.cmsketches[w][mod_entry] = (new_tstamp, new_wt + 1) 117 | 118 | min_c = min(c) 119 | c = np.log(1 + min_c) 120 | score_instance += c 121 | 122 | score = score_instance / self.components_num 123 | 124 | return float(score) 125 | -------------------------------------------------------------------------------- /streamad/model/spot_Detector.py: -------------------------------------------------------------------------------- 1 | from streamad.base import BaseDetector 2 | import numpy as np 3 | from math import log 4 | from scipy.optimize import minimize 5 | from collections import deque 6 | import 
heapq 7 | 8 | np.seterr(divide="ignore", invalid="ignore") 9 | 10 | 11 | class SpotDetector(BaseDetector): 12 | def __init__( 13 | self, 14 | prob: float = 1e-4, 15 | back_mean_len: int = 20, 16 | num_threshold_up: int = 20, 17 | num_threshold_down: int = 20, 18 | deviance_ratio: float = 0.01, 19 | global_memory: bool = True, 20 | **kwargs 21 | ): 22 | """Univariate Spot model :cite:`DBLP:conf/kdd/SifferFTL17`. 23 | 24 | Args: 25 | prob (float, optional): Threshold for the probability of anomalies, a small float value. Defaults to 1e-4.. Defaults to 1e-4. 26 | back_mean_len (int, optional): The length of backward window to calculate the first-order difference. Defaults to 20. 27 | num_threshold_up (int, optional): Number of peaks over upper threshold to estimate distribution. Defaults to 20. 28 | num_threshold_down (int, optional): Number of peaks over lower threshold to estimate distribution. Defaults to 20. 29 | deviance_ratio (float, optional): Deviance ratio aginest the absolute value of data, which is useful when the value is very large and deviances are small. Defaults to 0.01. 30 | window_len (int, optional): Length of the window for reference. Defaults to 200. 31 | """ 32 | 33 | super().__init__(data_type="univariate", **kwargs) 34 | 35 | self.prob = prob 36 | self.deviance_ratio = deviance_ratio 37 | self.global_memory = global_memory 38 | # self.window = deque(maxlen=self.window_len) 39 | self.back_mean_len = back_mean_len 40 | self.back_mean_window = deque(maxlen=self.back_mean_len) 41 | # self.window_len = self.window_len - self.back_mean_len 42 | assert ( 43 | self.window_len > 0 44 | ), "window_len is too small, default value is 200" 45 | 46 | self.num_threshold = { 47 | "up": num_threshold_up, 48 | "down": num_threshold_down, 49 | } 50 | 51 | nonedict = {"up": None, "down": None} 52 | 53 | self.extreme_quantile = dict.copy(nonedict) 54 | self.init_threshold = dict.copy(nonedict) 55 | self.peaks = dict.copy(nonedict) 56 | self.history_peaks = {"up": [], "down": []} 57 | # self.peaks = {'up':deque(maxlen=20),'down':deque(maxlen=20)} 58 | self.gamma = dict.copy(nonedict) 59 | self.sigma = dict.copy(nonedict) 60 | self.normal_X = None 61 | 62 | # self.thup = [] 63 | # self.thdown = [] 64 | 65 | def _grimshaw(self, side, epsilon=1e-8, n_points=10): 66 | def u(s): 67 | return 1 + np.log(s).mean() 68 | 69 | def v(s): 70 | return np.mean(1 / s) 71 | 72 | def w(Y, t): 73 | s = 1 + t * Y 74 | us = u(s) 75 | vs = v(s) 76 | return us * vs - 1 77 | 78 | def jac_w(Y, t): 79 | s = 1 + t * Y 80 | us = u(s) 81 | vs = v(s) 82 | jac_us = np.divide( 83 | 1, t, out=np.array(1 / epsilon), where=t != 0 84 | ) * (1 - vs) 85 | jac_vs = np.divide( 86 | 1, t, out=np.array(1 / epsilon), where=t != 0 87 | ) * (-vs + np.mean(1 / s**2)) 88 | return us * jac_vs + vs * jac_us 89 | 90 | self.peaks[side][self.peaks[side] == 0] = epsilon 91 | Ym = self.peaks[side].min() 92 | YM = self.peaks[side].max() 93 | Ymean = self.peaks[side].mean() 94 | 95 | a = np.divide(-1, YM, out=np.array(-epsilon), where=YM != 0) 96 | if abs(a) < 2 * epsilon: 97 | epsilon = abs(a) / n_points 98 | 99 | # a = a + epsilon 100 | b = 2 * np.divide( 101 | (Ymean - Ym), 102 | (Ymean * Ym), 103 | out=np.array((Ymean - Ym) / epsilon - epsilon), 104 | where=(Ymean * Ym) != 0, 105 | ) 106 | c = 2 * np.divide( 107 | Ymean - Ym, 108 | Ym**2, 109 | out=np.array((Ymean - Ym) / epsilon + epsilon), 110 | where=Ym != 0, 111 | ) 112 | 113 | d = a + epsilon 114 | e = -epsilon 115 | 116 | left_zeros = self._rootsFinder( 117 | lambda t: 
w(self.peaks[side], t), 118 | lambda t: jac_w(self.peaks[side], t), 119 | (d, e) if d < e else (e, d), 120 | n_points, 121 | "regular", 122 | ) 123 | 124 | right_zeros = self._rootsFinder( 125 | lambda t: w(self.peaks[side], t), 126 | lambda t: jac_w(self.peaks[side], t), 127 | (b, c) if b < c else (c, b), 128 | n_points, 129 | "regular", 130 | ) 131 | 132 | # all the possible roots 133 | zeros = np.concatenate((left_zeros, right_zeros)) 134 | 135 | # 0 is always a solution so we initialize with it 136 | gamma_best = 0 137 | sigma_best = Ymean 138 | ll_best = self._log_likelihood(self.peaks[side], gamma_best, sigma_best) 139 | 140 | # we look for better candidates 141 | for z in zeros: 142 | gamma = u(1 + z * self.peaks[side]) - 1 143 | sigma = np.divide( 144 | gamma, z, out=np.array(gamma / epsilon), where=z != 0 145 | ) 146 | ll = self._log_likelihood(self.peaks[side], gamma, sigma) 147 | if ll > ll_best: 148 | gamma_best = gamma 149 | sigma_best = sigma 150 | ll_best = ll 151 | 152 | return gamma_best, sigma_best, ll_best 153 | 154 | def _rootsFinder(self, fun, jac, bounds, npoints, method): 155 | """ 156 | Find possible roots of a scalar function 157 | 158 | Parameters 159 | ---------- 160 | fun : function 161 | scalar function 162 | jac : function 163 | first order derivative of the function 164 | bounds : tuple 165 | (min,max) interval for the roots search 166 | npoints : int 167 | maximum number of roots to output 168 | method : str 169 | 'regular' : regular sample of the search interval, 'random' : uniform (distribution) sample of the search interval 170 | 171 | Returns 172 | ---------- 173 | numpy.array 174 | possible roots of the function 175 | """ 176 | if method == "regular": 177 | step = (bounds[1] - bounds[0]) / (npoints + 1) 178 | try: 179 | X0 = np.arange(bounds[0] + step, bounds[1], step) 180 | except: 181 | X0 = np.random.uniform(bounds[0], bounds[1], npoints) 182 | elif method == "random": 183 | X0 = np.random.uniform(bounds[0], bounds[1], npoints) 184 | 185 | def objFun(X, f, jac): 186 | g = 0 187 | j = np.zeros(X.shape) 188 | i = 0 189 | for x in X: 190 | fx = f(x) 191 | g = g + fx**2 192 | j[i] = 2 * fx * jac(x) 193 | i = i + 1 194 | return g, j 195 | 196 | opt = minimize( 197 | lambda X: objFun(X, fun, jac), 198 | X0, 199 | method="L-BFGS-B", 200 | jac=True, 201 | bounds=[bounds] * len(X0), 202 | ) 203 | 204 | X = opt.x 205 | np.round(X, decimals=5) 206 | return np.unique(X) 207 | 208 | def _log_likelihood(self, Y, gamma, sigma): 209 | """ 210 | Compute the log-likelihood for the Generalized Pareto Distribution (μ=0) 211 | 212 | Parameters 213 | ---------- 214 | Y : numpy.array 215 | observations 216 | gamma : float 217 | GPD index parameter 218 | sigma : float 219 | GPD scale parameter (>0) 220 | 221 | Returns 222 | ---------- 223 | float 224 | log-likelihood of the sample Y to be drawn from a GPD(γ,σ,μ=0) 225 | """ 226 | n = Y.size 227 | if gamma != 0: 228 | tau = gamma / sigma 229 | L = ( 230 | -n * log(sigma) 231 | - (1 + (1 / gamma)) * (np.log(1 + tau * Y)).sum() 232 | ) 233 | else: 234 | L = n * (1 + log(abs(Y.mean()) + 1e-8)) 235 | 236 | return L 237 | 238 | def _quantile(self, side, gamma, sigma): 239 | if side == "up": 240 | r = self.window_len * self.prob / self.num_threshold[side] 241 | # r = 1000 * self.prob 242 | 243 | if gamma != 0: 244 | return self.init_threshold["up"] + (sigma / gamma) * ( 245 | pow(r, -gamma) - 1 246 | ) 247 | else: 248 | return self.init_threshold["up"] - sigma * log(r) 249 | elif side == "down": 250 | r = self.window_len * 
self.prob / self.num_threshold[side] 251 | # r = 1000 * self.prob 252 | 253 | if gamma != 0: 254 | return self.init_threshold["down"] - (sigma / gamma) * ( 255 | pow(r, -gamma) - 1 256 | ) 257 | else: 258 | return self.init_threshold["down"] + sigma * log(r) 259 | else: 260 | raise ValueError("The side is not right") 261 | 262 | def _init_drift(self, verbose=False): 263 | for side in ["up", "down"]: 264 | self._update_one_side(side) 265 | 266 | return self 267 | 268 | def _update_one_side(self, side: str): 269 | if side == "up": 270 | candidates = ( 271 | list(self.window) + self.history_peaks[side] 272 | if self.global_memory 273 | else list(self.window) 274 | ) 275 | 276 | self.history_peaks[side] = heapq.nlargest( 277 | self.num_threshold[side], 278 | candidates, 279 | ) 280 | self.init_threshold[side] = self.history_peaks[side][-1] 281 | self.peaks[side] = np.array(self.history_peaks[side]) - np.array( 282 | self.init_threshold[side] 283 | ) 284 | elif side == "down": 285 | candidates = ( 286 | list(self.window) + self.history_peaks[side] 287 | if self.global_memory 288 | else list(self.window) 289 | ) 290 | 291 | self.history_peaks[side] = heapq.nsmallest( 292 | self.num_threshold[side], 293 | candidates, 294 | ) 295 | self.init_threshold[side] = self.history_peaks[side][-1] 296 | self.peaks[side] = np.array(self.init_threshold[side]) - np.array( 297 | self.history_peaks[side] 298 | ) 299 | 300 | # remove the largest incase the first anomaly change the threshold 301 | # self.peaks[side] = self.peaks[side][1:] 302 | gamma, sigma, _ = self._grimshaw(side) 303 | self.extreme_quantile[side] = self._quantile(side, gamma, sigma) 304 | self.gamma[side] = gamma 305 | self.sigma[side] = sigma 306 | 307 | def _cal_back_mean(self, X): 308 | back_mean = ( 309 | np.mean(self.back_mean_window) 310 | if self.back_mean_window.maxlen > 0 311 | else np.array(0.0) 312 | ) 313 | 314 | return X - back_mean 315 | 316 | def fit(self, X: np.ndarray, timestamp: int = None): 317 | X = float(X[0]) 318 | 319 | self.back_mean_window.append(X) 320 | 321 | if self.index >= self.back_mean_len: 322 | self.normal_X = self._cal_back_mean(X) 323 | self.window.append(self.normal_X) 324 | 325 | if self.index == self.window_len: 326 | self._init_drift() 327 | 328 | if self.index >= self.window_len: 329 | last_X = ( 330 | self.window[-2] 331 | if self.back_mean_len == 0 332 | else (X - self.window[-1]) 333 | ) 334 | 335 | if ( 336 | abs( 337 | np.divide( 338 | X - last_X, last_X, np.array(X), where=last_X != 0 339 | ) 340 | ) 341 | < self.deviance_ratio 342 | ): 343 | return self 344 | 345 | if self.normal_X > self.init_threshold["up"]: 346 | self._update_one_side("up") 347 | 348 | elif self.normal_X < self.init_threshold["down"]: 349 | self._update_one_side("down") 350 | 351 | return self 352 | 353 | def score(self, X: np.ndarray, timestamp: int = None): 354 | X = float(X[0]) 355 | 356 | # if self.score_first: 357 | # last_X = self._cal_back_mean(X) 358 | # else: 359 | last_X = ( 360 | self.window[-2] 361 | if self.back_mean_len == 0 362 | else (X - self.window[-1]) 363 | ) 364 | 365 | if ( 366 | abs(np.divide(X - last_X, last_X, np.array(X), where=last_X != 0)) 367 | < self.deviance_ratio 368 | ): 369 | score = 0.0 370 | 371 | elif ( 372 | self.normal_X > self.extreme_quantile["up"] 373 | or self.normal_X < self.extreme_quantile["down"] 374 | ): 375 | score = 1.0 376 | 377 | elif self.normal_X > self.init_threshold["up"]: 378 | side = "up" 379 | score = np.divide( 380 | self.normal_X - self.init_threshold[side], 381 | 
(self.extreme_quantile[side] - self.init_threshold[side]), 382 | np.array(0.5), 383 | where=( 384 | self.extreme_quantile[side] - self.init_threshold[side] != 0 385 | ), 386 | ) 387 | 388 | elif self.normal_X < self.init_threshold["down"]: 389 | side = "down" 390 | score = np.divide( 391 | self.init_threshold[side] - self.normal_X, 392 | (self.init_threshold[side] - self.extreme_quantile[side]), 393 | np.array(0.5), 394 | where=( 395 | self.init_threshold[side] - self.extreme_quantile[side] != 0 396 | ), 397 | ) 398 | else: 399 | score = 0.0 400 | 401 | # self.thup.append(self.extreme_quantile["up"] + hist_mean) 402 | # self.thdown.append(self.extreme_quantile["down"] + hist_mean) 403 | 404 | return float(score) 405 | -------------------------------------------------------------------------------- /streamad/model/xStream_Detector.py: -------------------------------------------------------------------------------- 1 | from streamad.base import BaseDetector 2 | import numpy as np 3 | import mmh3 4 | from math import floor 5 | 6 | 7 | class xStreamDetector(BaseDetector): 8 | def __init__( 9 | self, 10 | n_components: int = 8, 11 | n_chains: int = 8, 12 | depth: int = 8, 13 | **kwargs, 14 | ): 15 | """Multivariate xStreamDetector :cite:`DBLP:conf/kdd/ManzoorLA18`. 16 | 17 | Args: 18 | n_components (int, optional): Number of streamhash projections, similar to the number of features. Defaults to 8. 19 | n_chains (int, optional): Number of half-space chains. Defaults to 8. 20 | depth (int, optional): Maximum depth for each chain. Defaults to 8. 21 | """ 22 | 23 | super().__init__(data_type="multivariate", **kwargs) 24 | self.projector = StreamhashProjector( 25 | num_components=n_components, density=1 / 3.0 26 | ) 27 | self.cur_window = [] 28 | self.ref_window = [] 29 | 30 | delta = np.ones(n_components) * 0.5 31 | self.hs_chains = _hsChains( 32 | deltamax=delta, n_chains=n_chains, depth=depth 33 | ) 34 | 35 | def fit(self, X: np.ndarray, timestamp: int = None): 36 | projected_X = self.projector.transform(X) 37 | self.cur_window.append(projected_X) 38 | self.hs_chains.fit(projected_X) 39 | 40 | self.ref_window = self.cur_window 41 | self.cur_window = [] 42 | 43 | deltamax = np.ptp(self.ref_window, axis=0) / 2.0 44 | deltamax[np.abs(deltamax) <= 0.0001] = 1.0 45 | 46 | self.hs_chains.set_deltamax(deltamax=deltamax) 47 | 48 | return self 49 | 50 | def score(self, X: np.ndarray, timestamp: int = None): 51 | projected_X = self.projector.transform(X) 52 | 53 | score = -1.0 * self.hs_chains.score_chains(projected_X) 54 | 55 | return score 56 | 57 | 58 | class _Chain: 59 | def __init__(self, deltamax, depth): 60 | self.depth = depth 61 | self.deltamax = deltamax 62 | self.rand = np.random.rand(len(deltamax)) 63 | self.rand_shift = self.rand * deltamax 64 | self.cmsketch_ref = [{} for _ in range(depth)] 65 | self.is_first_window = True 66 | self.fs = [np.random.randint(0, len(deltamax)) for _ in range(depth)] 67 | 68 | @staticmethod 69 | def float_to_int(x): 70 | return x // 1 71 | 72 | def bincount(self, X): 73 | scores = np.zeros(self.depth) 74 | prebins = np.zeros(X.shape[0], dtype=float) 75 | depthcount = np.zeros(len(self.deltamax), dtype=int) 76 | for depth in range(self.depth): 77 | f = self.fs[depth] 78 | depthcount[f] += 1 79 | if depthcount[f] == 1: 80 | prebins[f] = (X[f] + self.rand_shift[f]) / self.deltamax[f]  # same binning as fit(): shift first, then scale 81 | else: 82 | prebins[f] = ( 83 | 2.0 * prebins[f] - self.rand_shift[f] / self.deltamax[f] 84 | ) 85 | 86 | cmsketch = self.cmsketch_ref[depth] 87 | 88 | l = tuple(map(floor, prebins))
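            # The floored prebins tuple is the bin id at this depth; its count in the reference-window sketch (0 if the bin was never seen) becomes this depth's raw score.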
89 | 90 | if l in cmsketch: 91 | scores[depth] = cmsketch[l] 92 | else: 93 | scores[depth] = 0.0 94 | 95 | return scores 96 | 97 | def score(self, X): 98 | scores = self.bincount(X) 99 | 100 | depths = np.arange(1, self.depth + 1) 101 | 102 | scores = np.log2(1.0 + scores) + depths 103 | return np.min(scores) 104 | 105 | def fit(self, X): 106 | prebins = np.zeros(X.shape, dtype=float) 107 | depthcount = np.zeros(len(self.deltamax), dtype=int) 108 | for depth in range(self.depth): 109 | f = self.fs[depth] 110 | depthcount[f] += 1 111 | 112 | if depthcount[f] == 1: 113 | prebins[f] = (X[f] + self.rand_shift[f]) / self.deltamax[f] 114 | else: 115 | prebins[f] = ( 116 | 2.0 * prebins[f] - self.rand_shift[f] / self.deltamax[f] 117 | ) 118 | 119 | if self.is_first_window: 120 | cmsketch = self.cmsketch_ref[depth] 121 | 122 | l = tuple(map(floor, prebins)) 123 | 124 | if l not in cmsketch: 125 | cmsketch[l] = 0 126 | cmsketch[l] += 1 127 | 128 | self.cmsketch_ref[depth] = cmsketch 129 | else: 130 | cmsketch = self.cmsketch_ref[depth] 131 | 132 | l = tuple(map(floor, prebins)) 133 | 134 | if l not in cmsketch: 135 | cmsketch[l] = 0 136 | cmsketch[l] += 1 137 | self.cmsketch_ref[depth] = cmsketch 138 | 139 | return self 140 | 141 | 142 | class _hsChains: 143 | def __init__(self, deltamax, n_chains: int = 100, depth: int = 25) -> None: 144 | self.nchains = n_chains 145 | self.depth = depth 146 | self.chains = [_Chain(deltamax, depth) for _ in range(n_chains)] 147 | 148 | def score_chains(self, X): 149 | scores = 0 150 | for chain in self.chains: 151 | scores += chain.score(X) 152 | 153 | scores = float(scores) / float(self.nchains) 154 | 155 | return scores 156 | 157 | def fit(self, X): 158 | # for chain in self.chains: 159 | # chain.fit(X) 160 | list(map(lambda x: x.fit(X), self.chains)) 161 | 162 | def set_deltamax(self, deltamax): 163 | # list(map(lambda x: x.deltamax = deltamax, self.chains)) 164 | # list(map(lambda x: x.rand_shift = x.rand * deltamax, self.chains)) 165 | for chain in self.chains: 166 | chain.deltamax = deltamax 167 | chain.rand_shift = chain.rand * deltamax 168 | 169 | 170 | class StreamhashProjector: 171 | def __init__(self, num_components, density=1 / 3.0): 172 | self.keys = np.arange(0, num_components, 1) 173 | self.constant = np.sqrt(1.0 / density) / np.sqrt(num_components) 174 | self.density = density 175 | self.n_components = num_components 176 | 177 | def transform(self, X): 178 | """Projects particular (next) timestep's vector to (possibly) lower dimensional space. 179 | 180 | Args: 181 | X (float array of shape (num_features,)): Input feature vector. 182 | 183 | Returns: 184 | projected_X (float array of shape (num_components,)): Projected feature vector. 
185 | """ 186 | ndim = X.shape[0] 187 | 188 | feature_names = [str(i) for i in range(ndim)] 189 | 190 | R = np.array( 191 | [ 192 | [self._hash_string(k, f) for f in feature_names] 193 | for k in self.keys 194 | ] 195 | ) 196 | 197 | Y = np.dot(X, R.T).squeeze() 198 | 199 | return Y 200 | 201 | def _hash_string(self, k, s): 202 | hash_value = int(mmh3.hash(s, signed=False, seed=k)) / (2.0**32 - 1) 203 | s = self.density 204 | if hash_value <= s / 2.0: 205 | return -1 * self.constant 206 | elif hash_value <= s: 207 | return self.constant 208 | else: 209 | return 0 210 | 211 | 212 | if __name__ == "__main__": 213 | import cProfile 214 | from line_profiler import LineProfiler 215 | 216 | lp = LineProfiler() 217 | 218 | model = xStreamDetector() 219 | 220 | lp.add_function(_Chain.fit) 221 | lp.add_function(_Chain.score) 222 | lp.add_function(_Chain.bincount) 223 | # lp.add_function(model.fit) 224 | # lp.add_function(model.score) 225 | lp_wrapper = lp(model.fit_score) 226 | 227 | for i in range(1500): 228 | # lp_wrapper(np.array([i])) 229 | model.fit_score(np.array([i])) 230 | 231 | lp.print_stats() 232 | -------------------------------------------------------------------------------- /streamad/model/zscore_Detector.py: -------------------------------------------------------------------------------- 1 | from streamad.base import BaseDetector 2 | import numpy as np 3 | from streamad.util import StreamStatistic 4 | 5 | 6 | class ZScoreDetector(BaseDetector): 7 | def __init__(self, is_global: bool = False, **kwargs): 8 | """Univariate Z-Score Detecto :cite:`enwiki:1086685336` 9 | 10 | Args: 11 | window_len (int, optional): Length of the window for reference. Defaults to 50. 12 | is_global (bool, optional): Whether to detect anomalies from a global view. Defaults to False. 
13 | """ 14 | super().__init__(data_type="univariate", **kwargs) 15 | 16 | self.stat = StreamStatistic( 17 | is_global=is_global, window_len=self.window_len 18 | ) 19 | 20 | def fit(self, X: np.ndarray, timestamp: int = None): 21 | self.stat.update(X[0]) 22 | return self 23 | 24 | def score(self, X: np.ndarray, timestamp: int = None): 25 | mean = self.stat.get_mean() 26 | std = self.stat.get_std() 27 | 28 | score = np.divide( 29 | (X[0] - mean), std, out=np.zeros_like(X[0]), where=std != 0 30 | ) 31 | 32 | return score 33 | -------------------------------------------------------------------------------- /streamad/model/zspot_Detector.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import heapq 3 | from collections import deque 4 | from copy import deepcopy 5 | 6 | import numpy as np 7 | from streamad.base import BaseDetector 8 | 9 | 10 | class ZSpotDetector(BaseDetector): 11 | def __init__( 12 | self, 13 | back_mean_len: int = 20, 14 | num_over_threshold: int = 30, 15 | deviance_ratio: float = 0.01, 16 | z: int = 2, 17 | expire_days: int = 14, 18 | ignore_n: int = 10, 19 | **kwargs 20 | ): 21 | 22 | super().__init__(data_type="univariate", **kwargs) 23 | 24 | self.deviance_ratio = deviance_ratio 25 | 26 | self.back_mean_len = back_mean_len 27 | self.back_mean_window = deque(maxlen=max(self.back_mean_len, 2)) 28 | 29 | self.num_over_threshold = num_over_threshold 30 | 31 | nonedict = {"up": None, "down": None} 32 | 33 | self.extreme_quantile = dict.copy(nonedict) 34 | self.local_init_threshold = dict.copy(nonedict) 35 | self.global_init_threshold = dict.copy(nonedict) 36 | 37 | self.last_date = None 38 | 39 | self.date = deque(maxlen=expire_days) 40 | self.date_peaks = deque(maxlen=expire_days) 41 | 42 | self.history_peaks = {"up": [], "down": []} 43 | self.normal_X = None 44 | self.time_X = None 45 | self.z = z 46 | self.ignore_n = ignore_n 47 | 48 | assert self.window_len > self.ignore_n + self.back_mean_len, "window_len must be larger than (ignore_n + back_mean_len)" 49 | 50 | def _update_oneside(self, side: str, init: bool = False): 51 | if side == "up": 52 | if init is False: 53 | self.local_init_threshold[side] = heapq.heappushpop( 54 | self.history_peaks[side], self.normal_X 55 | ) 56 | else: 57 | self.local_init_threshold[side] = self.history_peaks[side][0] 58 | 59 | peaks = deepcopy(self.history_peaks[side]) 60 | for i in self.date_peaks: 61 | peaks.extend(i[side]) 62 | 63 | selected_peaks = heapq.nlargest(self.num_over_threshold, peaks) 64 | self.global_init_threshold[side] = selected_peaks[-1] 65 | selected_peaks = np.array(selected_peaks) - np.array( 66 | self.global_init_threshold[side] 67 | ) 68 | std = np.sqrt( 69 | np.sum([i**2 for i in selected_peaks]) 70 | / self.num_over_threshold 71 | ) 72 | self.extreme_quantile[side] = ( 73 | self.global_init_threshold[side] + self.z * std 74 | ) 75 | elif side == "down": 76 | 77 | if init is False: 78 | self.local_init_threshold[side] = -heapq.heappushpop( 79 | self.history_peaks[side], -self.normal_X 80 | ) 81 | else: 82 | self.local_init_threshold[side] = -self.history_peaks[side][0] 83 | 84 | peaks = deepcopy(self.history_peaks[side]) 85 | for i in self.date_peaks: 86 | peaks.extend(i[side]) 87 | 88 | selected_peaks = heapq.nlargest(self.num_over_threshold, peaks) 89 | self.global_init_threshold[side] = -selected_peaks[-1] 90 | selected_peaks = np.array(selected_peaks) + np.array( 91 | self.global_init_threshold[side] 92 | ) 93 | std = np.sqrt( 94 | np.sum([i**2 for i in 
selected_peaks]) 95 | / self.num_over_threshold 96 | ) 97 | self.extreme_quantile[side] = ( 98 | self.global_init_threshold[side] - self.z * std 99 | ) 100 | 101 | else: 102 | raise NotImplementedError 103 | 104 | def _cal_back_mean(self, X): 105 | 106 | back_mean = np.array(0) 107 | 108 | if self.back_mean_len == 1: 109 | # least back_mean_window is 2 110 | back_mean = self.back_mean_window[-1] 111 | elif self.back_mean_len > 1: 112 | back_mean = np.mean(self.back_mean_window) 113 | 114 | return X - back_mean 115 | 116 | def fit(self, X: np.ndarray, timestamp=None): 117 | """Fit the data to the detector. 118 | 119 | Args: 120 | X (np.ndarray): Data of current observation. 121 | """ 122 | X = float(X[0]) 123 | 124 | if self.index >= self.back_mean_len + self.ignore_n: 125 | self.normal_X = self._cal_back_mean(X) 126 | self.time_X = datetime.datetime.fromtimestamp(timestamp) 127 | 128 | if self.last_date is None: 129 | self.last_date = self.time_X.date() 130 | self.history_peaks["up"] = [self.normal_X] 131 | self.history_peaks["down"] = [-self.normal_X] 132 | 133 | elif self.last_date != self.time_X.date(): 134 | self.date.append(self.last_date) 135 | self.date_peaks.append(deepcopy(self.history_peaks)) 136 | self.last_date = self.time_X.date() 137 | self.history_peaks["up"] = [self.normal_X] 138 | self.history_peaks["down"] = [-self.normal_X] 139 | 140 | elif self.last_date == self.time_X.date(): 141 | if len(self.history_peaks["up"]) < self.num_over_threshold: 142 | heapq.heappush(self.history_peaks["up"], self.normal_X) 143 | # We use negative x to simulate a maxheap 144 | heapq.heappush(self.history_peaks["down"], -self.normal_X) 145 | 146 | # if len(self.history_peaks["up"]) == self.num_over_threshold: 147 | self._update_oneside("up", init=True) 148 | self._update_oneside("down", init=True) 149 | 150 | elif self.normal_X > self.local_init_threshold["up"]: 151 | self._update_oneside("up") 152 | elif self.normal_X < self.local_init_threshold["down"]: 153 | self._update_oneside("down") 154 | 155 | if self.index >= self.ignore_n: 156 | self.back_mean_window.append(X) 157 | return self 158 | 159 | def score(self, X: np.ndarray, timestamp=None) -> float: 160 | 161 | curr_X = self.back_mean_window[-1] 162 | last_X = self.back_mean_window[-2] 163 | 164 | if ( 165 | abs( 166 | np.divide( 167 | curr_X - last_X, last_X, np.array(curr_X), where=last_X != 0 168 | ) 169 | ) 170 | < self.deviance_ratio 171 | ): 172 | score = 0.0 173 | 174 | elif ( 175 | self.normal_X > self.extreme_quantile["up"] 176 | or self.normal_X < self.extreme_quantile["down"] 177 | ): 178 | score = 1.0 179 | 180 | elif self.normal_X > self.global_init_threshold["up"]: 181 | side = "up" 182 | score = np.divide( 183 | self.normal_X - self.global_init_threshold[side], 184 | ( 185 | self.extreme_quantile[side] 186 | - self.global_init_threshold[side] 187 | ), 188 | np.array(0.9), 189 | where=( 190 | self.extreme_quantile[side] 191 | - self.global_init_threshold[side] 192 | != 0 193 | ), 194 | ) 195 | elif self.normal_X < self.global_init_threshold["down"]: 196 | side = "down" 197 | score = np.divide( 198 | self.global_init_threshold[side] - self.normal_X, 199 | ( 200 | self.global_init_threshold[side] 201 | - self.extreme_quantile[side] 202 | ), 203 | np.array(0.5), 204 | where=( 205 | self.global_init_threshold[side] 206 | - self.extreme_quantile[side] 207 | != 0 208 | ), 209 | ) 210 | else: 211 | score = 0.0 212 | 213 | return float(score) 214 | -------------------------------------------------------------------------------- 
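# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not a repository file): a minimal example of how
# the detectors above are typically wired to the calibrators defined below,
# assuming only the APIs visible in this dump: StreamGenerator.iter_item(),
# the detectors' fit_score(), ZScoreCalibrator.normalize(), and util.plot().
# That fit_score() may return None while a detector is still warming up is an
# assumption, inferred from the None-guards in the calibrators and ensembles.
import numpy as np

from streamad.model import SpotDetector
from streamad.process import ZScoreCalibrator
from streamad.util import StreamGenerator, UnivariateDS, plot

ds = UnivariateDS()                      # bundled univariate dataset
stream = StreamGenerator(ds.data)        # replay the static data as a stream
detector = SpotDetector()
calibrator = ZScoreCalibrator(sigma=3, extreme_sigma=5)

scores = []
for x in stream.iter_item():
    raw = detector.fit_score(x)          # raw anomaly score, may be None during warm-up
    scores.append(calibrator.normalize(raw))

fig = plot(
    data=ds.data,
    scores=np.array(scores, dtype=float),  # None values become NaN and show as gaps
    date=ds.date,
    features=ds.features,
    label=ds.label,
)
fig.show()
# ---------------------------------------------------------------------------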
/streamad/process/__init__.py: -------------------------------------------------------------------------------- 1 | from .zscore_calibrator import ZScoreCalibrator 2 | from .tdigest_calibrator import TDigestCalibrator 3 | from .weight_ensemble import WeightEnsemble 4 | from .vote_ensemble import VoteEnsemble 5 | 6 | __all__ = [ 7 | "ZScoreCalibrator", 8 | "TDigestCalibrator", 9 | "WeightEnsemble", 10 | "VoteEnsemble", 11 | ] 12 | -------------------------------------------------------------------------------- /streamad/process/tdigest_calibrator.py: -------------------------------------------------------------------------------- 1 | from tdigest import TDigest 2 | from collections import deque 3 | 4 | 5 | class TDigestCalibrator: 6 | def __init__( 7 | self, 8 | percentile_up: float = 95, 9 | percentile_down: float = 5, 10 | is_global: bool = True, 11 | window_len: int = 100, 12 | ) -> None: 13 | """A calibrator which can filter out outliers using t-digest, and normalize the anomaly scores into [0,1] :cite:`DBLP:journals/simpa/Dunning21`. 14 | 15 | Args: 16 | percentile_up (float, optional): We regard the scores above `percentile_up` as anomalies. Defaults to 95. 17 | percentile_down (float, optional): We regard the scores below `percentile_down` as anomalies. Defaults to 5. 18 | is_global (bool, optional): Whether to record scores globally or over a rolling window. Defaults to True. 19 | window_len (int, optional): The length of rolling window, ignore this when `is_global=True`. Defaults to 100. 20 | """ 21 | self.percentile_up = percentile_up 22 | self.percentile_down = percentile_down 23 | self.init_data = [] 24 | self.init_flag = False 25 | 26 | assert ( 27 | percentile_up >= 0 28 | and percentile_up <= 100 29 | and percentile_down >= 0 30 | and percentile_down <= 100 31 | ), "percentile must be between 0 and 100" 32 | 33 | self.is_global = is_global 34 | self.score_stats = TDigest() 35 | self.score_deque = deque(maxlen=window_len) 36 | 37 | def normalize(self, score: float) -> float: 38 | if score is None: 39 | return None 40 | 41 | self.score_deque.append(score) 42 | 43 | if self.is_global: 44 | self.score_stats.update(score) 45 | else: 46 | self.score_stats = TDigest() 47 | self.score_stats.batch_update(self.score_deque) 48 | if self.score_deque.maxlen != len(self.score_deque): 49 | return None 50 | 51 | percentile_up = self.score_stats.percentile(self.percentile_up) 52 | percentile_down = self.score_stats.percentile(self.percentile_down) 53 | 54 | if score > percentile_up or score < percentile_down: 55 | score = 1.0 56 | else: 57 | score = 0.0 58 | 59 | return score 60 | -------------------------------------------------------------------------------- /streamad/process/vote_ensemble.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class VoteEnsemble: 5 | def __init__(self, threshold: float = 0.8): 6 | """Anomaly scores ensemble with votes. 7 | 8 | Args: 9 | threshold (float, optional): Anomaly scores over the threshold are regarded as votes. Defaults to 0.8. 10 | """ 11 | self.thredshold = threshold 12 | 13 | def ensemble(self, scores: list): 14 | """Ensemble anomaly scores from ordered detectors. 15 | 16 | Args: 17 | scores (list): A list of anomaly scores with orders. 18 | 19 | Returns: 20 | float: Ensembled anomaly scores.
21 | """ 22 | 23 | assert ( 24 | type(scores) == list or type(scores) == np.ndarray 25 | ), "Unsupported score types, it should be list or numpy.ndarray" 26 | 27 | if (np.array(scores) == None).any(): 28 | return None 29 | 30 | assert ( 31 | (np.array(scores) >= 0) & (np.array(scores) <= 1) 32 | ).all(), ( 33 | "Scores should be in [0,1], you can call calibrator before ensemble" 34 | ) 35 | 36 | votes = np.array(scores) >= self.thredshold 37 | 38 | if sum(votes) > len(votes) / 2: 39 | return 1.0 40 | 41 | return 0.0 42 | -------------------------------------------------------------------------------- /streamad/process/weight_ensemble.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class WeightEnsemble: 5 | def __init__(self, ensemble_weights: list = None): 6 | """Anomaly scores ensemble with weighted average. 7 | 8 | Args: 9 | ensemble_weights (list, optional): Weights for scores with orders, we use equal weights/mean to recalculate the scores when it is None. Defaults to None. 10 | """ 11 | 12 | assert ( 13 | ensemble_weights is None 14 | or isinstance(ensemble_weights, (list, np.ndarray)) 15 | ) 16 | 17 | self.weights = ensemble_weights 18 | self.sum_weights = np.sum(self.weights) if ensemble_weights is not None else None 19 | 20 | def ensemble(self, scores: list) -> float: 21 | """Ensemble anomaly scores from ordered detectors. 22 | 23 | Args: 24 | scores (list): A list of anomaly scores with orders. 25 | 26 | Returns: 27 | float: Ensembled anomaly scores. 28 | """ 29 | 30 | assert ( 31 | type(scores) == list or type(scores) == np.ndarray 32 | ), "Unsupported score types, it should be list or numpy.ndarray" 33 | 34 | assert self.weights is None or len(scores) == len( 35 | self.weights 36 | ), "Inconsistent weights and scores length" 37 | 38 | if (np.array(scores) == None).any(): 39 | return None 40 | 41 | assert ( 42 | (np.array(scores) >= 0) & (np.array(scores) <= 1) 43 | ).all(), ( 44 | "Scores should be in [0,1], you can call calibrator before ensemble" 45 | ) 46 | 47 | if self.weights is None: 48 | return np.mean(scores) 49 | 50 | return np.dot(scores, self.weights) / self.sum_weights 51 | -------------------------------------------------------------------------------- /streamad/process/zscore_calibrator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from streamad.util import StreamStatistic 3 | 4 | 5 | class ZScoreCalibrator: 6 | def __init__( 7 | self, 8 | sigma: int = 3, 9 | extreme_sigma: int = 5, 10 | is_global: bool = True, 11 | window_len: int = 100, 12 | ) -> None: 13 | """A calibrator which can filter out outliers using z-score, and normalize the anomaly scores into [0,1]. 14 | 15 | Args: 16 | sigma (int, optional): Zscore threshold, we regard the scores out of sigma as potential anomalies. Defaults to 3. 17 | extreme_sigma (int, optional): Zscore threshold for extreme values, we regard the scores out of extreme_sigma as extreme anomalies. Defaults to 5. 18 | is_global (bool, optional): Whether to record scores globally or over a rolling window. Defaults to True. 19 | window_len (int, optional): The length of rolling window, ignore this when `is_global=True`. Defaults to 100.
20 | """ 21 | self.sigma = sigma 22 | self.extreme_sigma = extreme_sigma 23 | self.init_data = [] 24 | self.init_flag = False 25 | self.score_stats = StreamStatistic( 26 | is_global=is_global, window_len=window_len 27 | ) 28 | 29 | def normalize(self, score: float) -> float: 30 | 31 | if score is None: 32 | return None 33 | 34 | self.score_stats.update(score) 35 | 36 | if ( 37 | self.score_stats._window.maxlen != len(self.score_stats._window) 38 | and self.score_stats._window.maxlen >= self.score_stats._num_items 39 | ): 40 | return None 41 | 42 | score_mean = self.score_stats.get_mean() 43 | score_std = self.score_stats.get_std() 44 | 45 | sigma = np.divide( 46 | (score - score_mean), 47 | score_std, 48 | out=np.array((score - score_mean) / 1e-5), 49 | where=score_std != 0, 50 | ) 51 | sigma = abs(sigma) 52 | 53 | if sigma > self.extreme_sigma: 54 | return 1.0 55 | elif sigma > self.sigma: 56 | score_max = self.score_stats.get_max() 57 | score_min = self.score_stats.get_min() 58 | score = np.divide( 59 | (score - score_min), 60 | (score_max - score_min), 61 | out=min(np.array((score - score_min) / 1e-5), np.array(1.0)), 62 | where=score_max != score_min, 63 | ) 64 | score = abs(score) 65 | else: 66 | return 0.0 67 | 68 | return score 69 | -------------------------------------------------------------------------------- /streamad/util/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # 4 | # Author: liufr 5 | # Github: https://github.com/Fengrui-Liu 6 | # LastEditTime: 2021-01-11 14:35:09 7 | # Copyright 2021 liufr 8 | # Description: 9 | # 10 | 11 | from .stream_generator import StreamGenerator 12 | from .math_toolkit import StreamStatistic 13 | from .dataset import MultivariateDS, UnivariateDS, CustomDS 14 | from .plot import plot 15 | 16 | 17 | __all__ = [ 18 | "StreamGenerator", 19 | "StreamStatistic", 20 | "MultivariateDS", 21 | "UnivariateDS", 22 | "CustomDS", 23 | "plot", 24 | ] 25 | -------------------------------------------------------------------------------- /streamad/util/dataset.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from os.path import dirname, join 3 | from typing import Union 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | warnings.simplefilter(action="ignore", category=FutureWarning) 9 | 10 | 11 | class DS: 12 | def __init__(self) -> None: 13 | 14 | self.data = None 15 | self.date = None 16 | self.label = None 17 | self.features = None 18 | self.names = None 19 | 20 | def preprocess(self) -> None: 21 | self.preprocess_data() 22 | self.preprocess_timestamp() 23 | self.preprocess_label() 24 | self.preprocess_feature() 25 | 26 | def preprocess_data(self) -> None: 27 | if type(self.path) == str: 28 | try: 29 | self.data = pd.read_csv(self.path) 30 | except FileNotFoundError: 31 | print("Cannot read this file:", self.path) 32 | elif type(self.path) == np.ndarray: 33 | self.data = pd.DataFrame(self.path) 34 | elif type(self.path) == pd.DataFrame: 35 | self.data = self.path 36 | self.names = self.data.columns.values 37 | 38 | def preprocess_timestamp(self) -> None: 39 | if "timestamp" in self.names.tolist(): 40 | self.date = self.data["timestamp"].values 41 | else: 42 | self.date = self.data.index.values 43 | 44 | def preprocess_label(self) -> None: 45 | if "label" in self.names.tolist(): 46 | self.label = np.array(self.data["label"].values) 47 | 48 | def preprocess_feature(self) -> None: 49 | self.features =
np.setdiff1d( 50 | self.names, np.array(["label", "timestamp"]) 51 | ) 52 | self.data = np.array(self.data[self.features]) 53 | 54 | 55 | class MultivariateDS(DS): 56 | """ 57 | Load multivariate dataset. 58 | """ 59 | 60 | def __init__(self, has_names=False) -> None: 61 | super().__init__() 62 | module_path = dirname(__file__) 63 | self.path = join(module_path, "data", "multiDS.csv") 64 | self.preprocess() 65 | 66 | 67 | class UnivariateDS(DS): 68 | """ 69 | Load univariate dataset. 70 | """ 71 | 72 | def __init__(self) -> None: 73 | super().__init__() 74 | module_path = dirname(__file__) 75 | self.path = join(module_path, "data", "uniDS.csv") 76 | self.preprocess() 77 | 78 | 79 | class CustomDS(DS): 80 | """ 81 | Load custom dataset. 82 | Args: 83 | f_path (Union[str, np.ndarray]): Dataset or its path. 84 | label (np.ndarray, optional): Anomaly labels for dataset. Defaults to None. 85 | """ 86 | 87 | def __init__( 88 | self, f_path: Union[str, np.ndarray], label: np.ndarray = None 89 | ): 90 | 91 | super().__init__() 92 | self.path = f_path 93 | self.label = label 94 | self.preprocess() 95 | -------------------------------------------------------------------------------- /streamad/util/math_toolkit.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | from collections import deque, defaultdict 5 | 6 | 7 | class StreamStatistic: 8 | """Data statistics for the streaming data, with supporting max, min, sum, mean, sum of squares, var, std and standard scaler.""" 9 | 10 | def __init__(self, is_global: bool = True, window_len: int = 10): 11 | """Statistics for the streaming data, with supporting max, min, sum, mean, sum of squares, var, std and standard scaler. 12 | 13 | Args: 14 | is_global (bool, optional): For whole stream or a windowed stream. Defaults to True. 15 | window_len (int, optional): Rolloing window length. Only works when is_global is False. Defaults to 10. 
16 | """ 17 | self._is_uni = False 18 | self._is_global = is_global 19 | self._window = deque(maxlen=window_len) 20 | self._num_items = 0 21 | 22 | self._max = defaultdict(lambda: -math.inf) 23 | self._min = defaultdict(lambda: math.inf) 24 | self._sum = defaultdict(float) 25 | self._mean = defaultdict(float) 26 | self._sum_squares = defaultdict(float) 27 | self._var = defaultdict(float) 28 | self._std = defaultdict(float) 29 | 30 | def update(self, X: np.ndarray): 31 | """Update a pd.Series to stream 32 | 33 | Args: 34 | X (np.ndarray): An item from StreamGenerator 35 | 36 | """ 37 | 38 | self._num_items += 1 39 | 40 | if isinstance(X, int) or isinstance(X, float): 41 | X = np.array([X]) 42 | self._is_uni = True 43 | elif isinstance(X, np.ndarray): 44 | X = np.array([X]).flatten() 45 | if len(X) == 1: 46 | self._is_uni = True 47 | else: 48 | self._is_uni = False 49 | else: 50 | raise NotImplementedError("Only support int, float and np.ndarray") 51 | 52 | if self._is_global: 53 | 54 | tmp = defaultdict(float) 55 | 56 | for index, item in enumerate(X): 57 | self._max[index] = ( 58 | self._max[index] if self._max[index] > item else item 59 | ) 60 | self._min[index] = ( 61 | self._min[index] if self._min[index] < item else item 62 | ) 63 | self._sum[index] += X[index] 64 | old_mean = self._mean[index] 65 | tmp[index] = item - self._mean[index] 66 | self._mean[index] = self._sum[index] / self._num_items 67 | self._sum_squares[index] += (X[index] - old_mean) * ( 68 | X[index] - self._mean[index] 69 | ) 70 | self._var[index] = self._sum_squares[index] / self._num_items 71 | self._std[index] = math.sqrt(self._var[index]) 72 | else: 73 | self._window.append(X) 74 | 75 | def get_max(self): 76 | """ 77 | Get max statistic. 78 | """ 79 | 80 | if self._is_global: 81 | result = [_ for _ in self._max.values()] 82 | else: 83 | result = np.max(self._window, axis=0) 84 | 85 | return result[0] if self._is_uni else np.array(result) 86 | 87 | def get_min(self): 88 | """ 89 | Get min statistic. 90 | """ 91 | 92 | if self._is_global: 93 | result = [_ for _ in self._min.values()] 94 | else: 95 | result = np.min(self._window, axis=0) 96 | 97 | return result[0] if self._is_uni else np.array(result) 98 | 99 | def get_mean(self): 100 | """ 101 | Get mean statistic. 102 | """ 103 | 104 | if self._is_global: 105 | result = [_ for _ in self._mean.values()] 106 | else: 107 | result = np.mean(self._window, axis=0) 108 | 109 | return result[0] if self._is_uni else np.array(result) 110 | 111 | def get_std(self): 112 | """ 113 | Get max statistic. 114 | """ 115 | 116 | if self._is_global: 117 | result = [_ for _ in self._std.values()] 118 | else: 119 | result = np.std(self._window, axis=0) 120 | 121 | return result[0] if self._is_uni else np.array(result) 122 | 123 | def get_sum(self): 124 | """ 125 | Get sum statistic. 126 | """ 127 | 128 | if self._is_global: 129 | result = [_ for _ in self._sum.values()] 130 | else: 131 | result = np.sum(self._window, axis=0) 132 | 133 | return result[0] if self._is_uni else np.array(result) 134 | 135 | def get_var(self): 136 | """ 137 | Get var statistic. 
138 | """ 139 | 140 | if self._is_global: 141 | result = [_ for _ in self._var.values()] 142 | else: 143 | result = np.var(self._window, axis=0) 144 | 145 | return result[0] if self._is_uni else np.array(result) 146 | 147 | 148 | class SDFT: 149 | def __init__(self, window_len) -> None: 150 | self.window_len = window_len 151 | self.window = deque(maxlen=window_len) 152 | self.coefficients = deque(maxlen=window_len) 153 | 154 | def update(self, X: np.ndarray): 155 | # def _get_coefficients(coeff, diff, i): 156 | # self.coefficients[i] = (coeff + diff) * np.exp( 157 | # 2j * np.pi * i / self.window_len 158 | # ) 159 | 160 | # return 161 | 162 | if len(self.window) < self.window_len - 1: 163 | self.window.append(X) 164 | elif len(self.window) == self.window_len - 1: 165 | self.window.append(X) 166 | self.coefficients.extend(np.fft.fft(self.window)) 167 | else: 168 | diff = X - self.window[0] 169 | 170 | for i, c in enumerate(self.coefficients): 171 | self.coefficients[i] = (c + diff) * np.exp( 172 | 2j * np.pi * i / self.window_len 173 | ) 174 | 175 | # This vectorize seems to be slower than the loop above 176 | # vfunc = np.vectorize(_get_coefficients) 177 | # vfunc( 178 | # self.coefficients, diff, [i for i in range(self.window_len)] 179 | # ) 180 | self.window.append(X) 181 | 182 | return self 183 | -------------------------------------------------------------------------------- /streamad/util/plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import plotly.graph_objects as go 3 | from plotly.subplots import make_subplots 4 | 5 | 6 | def plot( 7 | data: np.ndarray, 8 | scores: np.ndarray, 9 | date: np.ndarray = None, 10 | features: np.ndarray = None, 11 | label: np.ndarray = None, 12 | ): 13 | """Plot data, score and ground truth (if exists). 14 | 15 | Args: 16 | data (np.array): Original data stream. 17 | scores (np.array): Anomaly scores of the data stream. 18 | date (np.array, optional): Timestamp of the data. Defaults to None. 19 | features (np.array, optional): Features name. Defaults to None. 20 | label (np.array, optional): Ground truth. Defaults to None. 21 | """ 22 | 23 | if features is None: 24 | features = ["f" + str(i) for i in range(np.array(data).shape[1])] 25 | else: 26 | assert ( 27 | len(features) == data.shape[1] 28 | ), "Number of features must match data dimension." 29 | 30 | if date is None: 31 | date = [i for i in range(np.array(data).shape[0])] 32 | else: 33 | assert ( 34 | len(date) == data.shape[0] 35 | ), "Number of date must match data dimension." 
36 | 37 | height = 100 * len(features) + 80 38 | row_heights = [100 / height for _ in range(len(features))] 39 | row_heights.append(80 / height) 40 | 41 | fig = make_subplots( 42 | rows=len(features) + 1, 43 | cols=1, 44 | shared_xaxes=True, 45 | vertical_spacing=20 / height, 46 | row_heights=row_heights, 47 | ) 48 | 49 | # Plot data by features 50 | for i, feature in enumerate(features): 51 | anomalies = np.where(label == 1)[0] if label is not None else [] 52 | fig.add_trace( 53 | go.Scatter( 54 | x=date, 55 | y=data[:, i], 56 | mode="lines+markers", 57 | name=str(feature), 58 | selectedpoints=anomalies, 59 | selected=dict(marker=dict(color="red", size=5)), 60 | unselected=dict(marker=dict(size=0)), 61 | ), 62 | row=i + 1, 63 | col=1, 64 | ) 65 | 66 | # Plot score 67 | fig.add_trace( 68 | go.Scatter(x=date, y=scores, name="anomaly score", marker_color="red"), 69 | row=len(features) + 1, 70 | col=1, 71 | ) 72 | # fig.update_xaxes(rangeslider={"visible": True}, row=2, col=1) 73 | fig.update_layout( 74 | margin=dict(l=10, r=10, t=10, b=10), 75 | legend=dict( 76 | orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1 77 | ), 78 | height=height, 79 | ) 80 | return fig 81 | -------------------------------------------------------------------------------- /streamad/util/stream_generator.py: -------------------------------------------------------------------------------- 1 | from typing import Generator 2 | 3 | import numpy as np 4 | 5 | 6 | class StreamGenerator: 7 | """Load a static dataset and generate one observation at a time. 8 | 9 | Args: 10 | X (np.ndarray): Original static dataset. 11 | 12 | Raises: 13 | TypeError: Unexpected input data type. 14 | """ 15 | 16 | def __init__( 17 | self, X: np.ndarray, 18 | ): 19 | 20 | if isinstance(X, np.ndarray): 21 | self.X = X 22 | else: 23 | raise TypeError("Unexpected input data type, expected np.ndarray.") 24 | 25 | def iter_item(self) -> Generator: 26 | """Iterate over the dataset, yielding one item at a time. 27 | 28 | Yields: 29 | Generator: One observation from the dataset.
30 | """ 31 | 32 | for i in range(len(self.X)): 33 | yield self.X[i] 34 | -------------------------------------------------------------------------------- /streamad/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.3.1" # pragma: no cover 2 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fengrui-Liu/StreamAD/d2e38f4c35349b05c9bbd3ac753efc9a96e0ab05/test/__init__.py -------------------------------------------------------------------------------- /test/test_OCSVM.py: -------------------------------------------------------------------------------- 1 | from streamad.util import StreamGenerator, UnivariateDS, MultivariateDS 2 | from streamad.model import OCSVMDetector 3 | 4 | 5 | def test_score(): 6 | ds = UnivariateDS() 7 | stream = StreamGenerator(ds.data) 8 | detector = OCSVMDetector() 9 | for x in stream.iter_item(): 10 | score = detector.fit_score(x) 11 | 12 | if score is not None: 13 | assert type(score) is float 14 | 15 | 16 | def test_multi_score(): 17 | ds = MultivariateDS() 18 | stream = StreamGenerator(ds.data) 19 | detector = OCSVMDetector() 20 | for x in stream.iter_item(): 21 | score = detector.fit_score(x) 22 | 23 | if score is not None: 24 | assert type(score) is float 25 | -------------------------------------------------------------------------------- /test/test_calibrator.py: -------------------------------------------------------------------------------- 1 | from streamad.util import StreamGenerator, UnivariateDS 2 | from streamad.model import KNNDetector 3 | from streamad.process import ZScoreCalibrator, TDigestCalibrator 4 | 5 | 6 | def test_ZScoreCalibrator(): 7 | ds = UnivariateDS() 8 | stream = StreamGenerator(ds.data) 9 | detector = KNNDetector() 10 | calibrator = ZScoreCalibrator(sigma=2, extreme_sigma=3) 11 | 12 | for x in stream.iter_item(): 13 | score = detector.fit_score(x) 14 | score = calibrator.normalize(score) 15 | if score is not None: 16 | assert 0 <= score <= 1 17 | 18 | def test_ZScoreCalibrator_global(): 19 | ds = UnivariateDS() 20 | stream = StreamGenerator(ds.data) 21 | detector = KNNDetector() 22 | calibrator = ZScoreCalibrator(sigma=2, is_global=True) 23 | 24 | for x in stream.iter_item(): 25 | score = detector.fit_score(x) 26 | score = calibrator.normalize(score) 27 | if score is not None: 28 | assert 0 <= score <= 1 29 | 30 | 31 | def test_TDigestCalibrator(): 32 | ds = UnivariateDS() 33 | stream = StreamGenerator(ds.data) 34 | detector = KNNDetector() 35 | calibrator = TDigestCalibrator(percentile_up=93, percentile_down=0) 36 | 37 | for x in stream.iter_item(): 38 | score = detector.fit_score(x) 39 | normalized_score = calibrator.normalize(score) 40 | if normalized_score is not None: 41 | assert 0 <= normalized_score <= 1 42 | 43 | 44 | def test_TDigestCalibrator_global(): 45 | ds = UnivariateDS() 46 | stream = StreamGenerator(ds.data) 47 | detector = KNNDetector() 48 | calibrator = TDigestCalibrator( 49 | percentile_up=93, percentile_down=0, is_global=True 50 | ) 51 | 52 | for x in stream.iter_item(): 53 | score = detector.fit_score(x) 54 | score = calibrator.normalize(score) 55 | if score is not None: 56 | assert 0 <= score <= 1 57 | -------------------------------------------------------------------------------- /test/test_ensemble.py: -------------------------------------------------------------------------------- 1 
| from streamad.util import StreamGenerator, UnivariateDS 2 | from streamad.model import KNNDetector, SpotDetector 3 | from streamad.process import ZScoreCalibrator, VoteEnsemble, WeightEnsemble 4 | 5 | 6 | def test_VoteEnsemble(): 7 | 8 | ds = UnivariateDS() 9 | stream = StreamGenerator(ds.data) 10 | knn_detector = KNNDetector() 11 | spot_detector = SpotDetector() 12 | knn_calibrator = ZScoreCalibrator(sigma=2) 13 | spot_calibrator = ZScoreCalibrator(sigma=2) 14 | ensemble = VoteEnsemble(threshold=0.8) 15 | 16 | for x in stream.iter_item(): 17 | 18 | knn_score = knn_detector.fit_score(x) 19 | spot_score = spot_detector.fit_score(x) 20 | 21 | knn_normalized_score = knn_calibrator.normalize(knn_score) 22 | spot_normalized_score = spot_calibrator.normalize(spot_score) 23 | 24 | score = ensemble.ensemble([knn_normalized_score, spot_normalized_score]) 25 | if score is not None: 26 | assert 0 <= score <= 1 27 | 28 | 29 | def test_WeightEnsemble(): 30 | 31 | ds = UnivariateDS() 32 | stream = StreamGenerator(ds.data) 33 | knn_detector = KNNDetector() 34 | spot_detector = SpotDetector() 35 | knn_calibrator = ZScoreCalibrator(sigma=3) 36 | spot_calibrator = ZScoreCalibrator(sigma=3) 37 | ensemble = WeightEnsemble(ensemble_weights=[0.6, 0.4]) 38 | 39 | for x in stream.iter_item(): 40 | knn_score = knn_detector.fit_score(x) 41 | spot_score = spot_detector.fit_score(x) 42 | 43 | knn_normalized_score = knn_calibrator.normalize(knn_score) 44 | spot_normalized_score = spot_calibrator.normalize(spot_score) 45 | 46 | score = ensemble.ensemble([knn_normalized_score, spot_normalized_score]) 47 | 48 | if score is not None: 49 | assert 0 <= score <= 1 50 | -------------------------------------------------------------------------------- /test/test_evaluate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from streamad.evaluate import ( 3 | NumentaAwareMetircs, 4 | PointAwareMetircs, 5 | SeriesAwareMetircs, 6 | ) 7 | 8 | 9 | def test_point_aware_metrics(): 10 | values_real = np.array([0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]) 11 | values_pred = np.array([0, 0, 0, None, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]) 12 | 13 | metric = PointAwareMetircs(anomaly_threshold=0.8) 14 | 15 | (precision, recall, f1,) = metric.evaluate(values_real, values_pred) 16 | assert 0.0 <= precision <= 1.0 17 | assert 0.0 <= recall <= 1.0 18 | assert 0.0 <= f1 <= 1.0 19 | 20 | 21 | def test_series_aware_metrics(): 22 | values_real = np.array([0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]) 23 | values_pred = np.array([0, 0, 0, None, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]) 24 | 25 | # Flat bias 26 | metric = SeriesAwareMetircs( 27 | anomaly_threshold=0.8, bias_p="flat", bias_r="flat" 28 | ) 29 | 30 | (precision, recall, f1,) = metric.evaluate(values_real, values_pred) 31 | assert 0.0 <= precision <= 1.0 32 | assert 0.0 <= recall <= 1.0 33 | assert 0.0 <= f1 <= 1.0 34 | 35 | # Front bias 36 | metric = SeriesAwareMetircs( 37 | anomaly_threshold=0.8, bias_p="flat", bias_r="front" 38 | ) 39 | 40 | (precision, recall, f1,) = metric.evaluate(values_real, values_pred) 41 | assert 0.0 <= precision <= 1.0 42 | assert 0.0 <= recall <= 1.0 43 | assert 0.0 <= f1 <= 1.0 44 | 45 | # Middle bias 46 | metric = SeriesAwareMetircs( 47 | anomaly_threshold=0.8, bias_p="flat", bias_r="middle" 48 | ) 49 | 50 | (precision, recall, f1,) = metric.evaluate(values_real, values_pred) 51 | assert 0.0 <= precision <= 1.0 52 | assert 0.0 <= recall <= 1.0 53 | assert 0.0 <= f1 <= 1.0 54 | 55 | # Back bias 
56 | metric = SeriesAwareMetircs( 57 | anomaly_threshold=0.8, bias_p="flat", bias_r="back" 58 | ) 59 | 60 | (precision, recall, f1,) = metric.evaluate(values_real, values_pred) 61 | assert 0.0 <= precision <= 1.0 62 | assert 0.0 <= recall <= 1.0 63 | assert 0.0 <= f1 <= 1.0 64 | 65 | 66 | def test_numenta_aware_metrics(): 67 | values_real = np.array([0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]) 68 | values_pred = np.array([0, 0, 0, None, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]) 69 | 70 | metric = NumentaAwareMetircs(anomaly_threshold=0.8) 71 | 72 | (precision, recall, f1,) = metric.evaluate(values_real, values_pred) 73 | assert 0.0 <= precision <= 1.0 74 | assert 0.0 <= recall <= 1.0 75 | assert 0.0 <= f1 <= 1.0 76 | -------------------------------------------------------------------------------- /test/test_hstree.py: -------------------------------------------------------------------------------- 1 | from streamad.util import StreamGenerator, UnivariateDS, MultivariateDS 2 | from streamad.model import HSTreeDetector 3 | 4 | 5 | def test_score(): 6 | ds = UnivariateDS() 7 | stream = StreamGenerator(ds.data) 8 | detector = HSTreeDetector() 9 | for x in stream.iter_item(): 10 | score = detector.fit_score(x) 11 | 12 | if score is not None: 13 | assert type(score) is float 14 | 15 | 16 | def test_multi_score(): 17 | ds = MultivariateDS() 18 | stream = StreamGenerator(ds.data) 19 | detector = HSTreeDetector() 20 | for x in stream.iter_item(): 21 | score = detector.fit_score(x) 22 | 23 | if score is not None: 24 | assert type(score) is float 25 | -------------------------------------------------------------------------------- /test/test_knncad.py: -------------------------------------------------------------------------------- 1 | from streamad.util import StreamGenerator, UnivariateDS 2 | from streamad.model import KNNDetector 3 | 4 | 5 | def test_score(): 6 | ds = UnivariateDS() 7 | stream = StreamGenerator(ds.data) 8 | detector = KNNDetector() 9 | for x in stream.iter_item(): 10 | score = detector.fit_score(x) 11 | 12 | if score is not None: 13 | assert type(score) is float 14 | -------------------------------------------------------------------------------- /test/test_loda.py: -------------------------------------------------------------------------------- 1 | from streamad.util import StreamGenerator, UnivariateDS, MultivariateDS 2 | from streamad.model import LodaDetector 3 | 4 | 5 | def test_score(): 6 | ds = UnivariateDS() 7 | stream = StreamGenerator(ds.data) 8 | detector = LodaDetector() 9 | for x in stream.iter_item(): 10 | score = detector.fit_score(x) 11 | 12 | if score is not None: 13 | assert type(score) is float 14 | 15 | 16 | def test_multi_score(): 17 | ds = MultivariateDS() 18 | stream = StreamGenerator(ds.data) 19 | detector = LodaDetector() 20 | for x in stream.iter_item(): 21 | score = detector.fit_score(x) 22 | 23 | if score is not None: 24 | assert type(score) is float 25 | -------------------------------------------------------------------------------- /test/test_mad.py: -------------------------------------------------------------------------------- 1 | from streamad.util import StreamGenerator, UnivariateDS 2 | from streamad.model import MadDetector 3 | 4 | 5 | def test_score(): 6 | ds = UnivariateDS() 7 | stream = StreamGenerator(ds.data) 8 | detector = MadDetector() 9 | for x in stream.iter_item(): 10 | score = detector.fit_score(x) 11 | 12 | if score is not None: 13 | assert type(score) is float 14 | 
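The detector tests above (KNN, HSTree, Loda, MAD, and the ensemble tests before them) all repeat the same loop: stream a dataset, call fit_score on every item, and assert that any returned score is a float. The parametrized sketch below expresses that shared contract once; it is an illustrative consolidation rather than a file from this repository, and it only uses detector classes that the tests above already import from streamad.model.

import pytest

from streamad.model import KNNDetector, LodaDetector, MadDetector, SpotDetector
from streamad.util import StreamGenerator, UnivariateDS


# Hypothetical consolidated test: every detector must return either None
# (e.g. while it is still warming up) or a plain Python float per streamed item.
@pytest.mark.parametrize(
    "detector_cls", [KNNDetector, LodaDetector, MadDetector, SpotDetector]
)
def test_fit_score_contract(detector_cls):
    ds = UnivariateDS()
    stream = StreamGenerator(ds.data)
    detector = detector_cls()
    for x in stream.iter_item():
        score = detector.fit_score(x)
        if score is not None:
            assert type(score) is float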
-------------------------------------------------------------------------------- /test/test_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from streamad.util import StreamGenerator, CustomDS, plot 3 | from streamad.model import ZScoreDetector 4 | 5 | 6 | def test_plot(): 7 | n, A, center, phi = 730, 50, 100, 30 8 | T = 2 * np.pi / 100 9 | t = np.arange(n) 10 | ds = A * np.sin(T * t - phi * T) + center 11 | ds[235:255] = 80 12 | label = np.array([0] * n) 13 | label[235:255] = 1 14 | 15 | ds = CustomDS(ds, label) # You can also use a file path here 16 | stream = StreamGenerator(ds.data) 17 | model = ZScoreDetector() 18 | 19 | scores = [] 20 | 21 | for x in stream.iter_item(): 22 | score = model.fit_score(x) 23 | scores.append(score) 24 | # print("\r Anomaly score: {}".format(score), end="", flush="True") 25 | 26 | data, label, date, features = ds.data, ds.label, ds.date, ds.features 27 | plot(data=data, scores=scores, date=date, features=features, label=label) 28 | -------------------------------------------------------------------------------- /test/test_random.py: -------------------------------------------------------------------------------- 1 | from streamad.util import StreamGenerator, UnivariateDS 2 | from streamad.model import RandomDetector 3 | 4 | 5 | def test_score(): 6 | ds = UnivariateDS() 7 | stream = StreamGenerator(ds.data) 8 | detector = RandomDetector() 9 | for x in stream.iter_item(): 10 | score = detector.fit_score(x) 11 | 12 | if score is not None: 13 | assert type(score) is float 14 | -------------------------------------------------------------------------------- /test/test_rrcf.py: -------------------------------------------------------------------------------- 1 | from streamad.util import StreamGenerator, UnivariateDS 2 | from streamad.model import RrcfDetector 3 | 4 | 5 | def test_score(): 6 | ds = UnivariateDS() 7 | stream = StreamGenerator(ds.data) 8 | detector = RrcfDetector() 9 | for x in stream.iter_item(): 10 | score = detector.fit_score(x) 11 | 12 | if score is not None: 13 | assert type(score) is float 14 | -------------------------------------------------------------------------------- /test/test_rshash.py: -------------------------------------------------------------------------------- 1 | from streamad.util import StreamGenerator, UnivariateDS, MultivariateDS 2 | from streamad.model import RShashDetector 3 | 4 | 5 | def test_score(): 6 | ds = UnivariateDS() 7 | stream = StreamGenerator(ds.data) 8 | detector = RShashDetector() 9 | for x in stream.iter_item(): 10 | score = detector.fit_score(x) 11 | 12 | if score is not None: 13 | assert type(score) is float 14 | 15 | 16 | def test_multi_score(): 17 | ds = MultivariateDS() 18 | stream = StreamGenerator(ds.data) 19 | detector = RShashDetector() 20 | for x in stream.iter_item(): 21 | score = detector.fit_score(x) 22 | 23 | if score is not None: 24 | assert type(score) is float 25 | -------------------------------------------------------------------------------- /test/test_sarima.py: -------------------------------------------------------------------------------- 1 | from streamad.util import StreamGenerator, UnivariateDS 2 | from streamad.model import SArimaDetector 3 | 4 | 5 | def test_sarima(): 6 | ds = UnivariateDS() 7 | stream = StreamGenerator(ds.data) 8 | detector = SArimaDetector() 9 | for x in stream.iter_item(): 10 | score = detector.fit_score(x) 11 | if score is not None: 12 | assert type(score) is float 13 | 
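test_plot.py earlier in this test suite only builds the figure; since streamad.util.plot returns a standard plotly Figure (see streamad/util/plot.py above), the same pipeline can also render or export the result. A minimal sketch under that assumption — the detector choice and the output file name are illustrative:

import numpy as np

from streamad.model import ZScoreDetector
from streamad.util import CustomDS, StreamGenerator, plot

# Synthetic sine wave with an injected anomalous segment, as in test_plot.py.
n = 730
t = np.arange(n)
values = 50 * np.sin(2 * np.pi / 100 * (t - 30)) + 100
values[235:255] = 80
label = np.zeros(n, dtype=int)
label[235:255] = 1

ds = CustomDS(values, label)
detector = ZScoreDetector()
scores = [detector.fit_score(x) for x in StreamGenerator(ds.data).iter_item()]

fig = plot(
    data=ds.data, scores=scores, date=ds.date, features=ds.features, label=ds.label
)
fig.write_html("zscore_anomalies.html")  # standard plotly export; fig.show() also works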
-------------------------------------------------------------------------------- /test/test_sdft.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from streamad.util.math_toolkit import SDFT 3 | 4 | 5 | def test_sdft(): 6 | X = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] 7 | 8 | window_size = 5 9 | sdft = SDFT(window_size) 10 | for i, x in enumerate(X): 11 | sdft = sdft.update(x) 12 | if i + 1 >= window_size: 13 | print("co:", sdft.coefficients) 14 | print("----") 15 | print("np:", np.fft.fft(X[i + 1 - window_size : i + 1])) 16 | print("----------------------") 17 | # assert np.allclose( 18 | # sdft.coefficients, np.fft.fft(X[i + 1 - window_size : i + 1]) 19 | # ) 20 | 21 | 22 | def test_dft_time(): 23 | import time 24 | 25 | X = np.random.randn(1000000) 26 | 27 | sdft = SDFT(10) 28 | start_time = time.time() 29 | for x in X: 30 | sdft = sdft.update(x) 31 | print("sdft", time.time() - start_time) 32 | 33 | start_time = time.time() 34 | for i in range(len(X) - 10): 35 | np.fft.fft(X[i : i + 10]) 36 | 37 | print("np", time.time() - start_time) 38 | -------------------------------------------------------------------------------- /test/test_spot.py: -------------------------------------------------------------------------------- 1 | from streamad.util import StreamGenerator, UnivariateDS 2 | from streamad.model import SpotDetector 3 | 4 | 5 | def test_score(): 6 | ds = UnivariateDS() 7 | stream = StreamGenerator(ds.data) 8 | detector = SpotDetector() 9 | for x in stream.iter_item(): 10 | score = detector.fit_score(x) 11 | 12 | if score is not None: 13 | assert type(score) is float 14 | -------------------------------------------------------------------------------- /test/test_sr.py: -------------------------------------------------------------------------------- 1 | from streamad.util import StreamGenerator, UnivariateDS 2 | from streamad.model import SRDetector 3 | 4 | 5 | def test_score(): 6 | ds = UnivariateDS() 7 | stream = StreamGenerator(ds.data) 8 | detector = SRDetector() 9 | for x in stream.iter_item(): 10 | score = detector.fit_score(x) 11 | 12 | if score is not None: 13 | assert type(score) is float 14 | -------------------------------------------------------------------------------- /test/test_stats.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from streamad.util import ( 3 | MultivariateDS, 4 | StreamGenerator, 5 | StreamStatistic, 6 | UnivariateDS, 7 | ) 8 | 9 | 10 | def test_uni_stats(): 11 | ds = UnivariateDS() 12 | data = ds.data 13 | stream = StreamGenerator(data) 14 | stats = StreamStatistic() 15 | 16 | for X in stream.iter_item(): 17 | stats.update(X) 18 | 19 | assert stats.get_max() == np.max(data) 20 | assert stats.get_min() == np.min(data) 21 | assert abs(stats.get_sum() - np.sum(data)) < 1e-5 22 | assert abs(stats.get_mean() - np.mean(data)) < 1e-5 23 | assert abs(stats.get_std() - np.std(data)) < 1e-5 24 | assert abs(stats.get_var() - np.var(data)) < 1e-5 25 | 26 | 27 | def test_multi_stats(): 28 | ds = MultivariateDS() 29 | data = ds.data 30 | stream = StreamGenerator(data) 31 | stats = StreamStatistic() 32 | 33 | for X in stream.iter_item(): 34 | stats.update(X) 35 | 36 | assert ( 37 | sum([abs(i - j) for i, j in zip(stats.get_max(), np.max(data, axis=0))]) 38 | < 1e-5 39 | ) 40 | 41 | assert ( 42 | sum([abs(i - j) for i, j in zip(stats.get_min(), np.min(data, axis=0))]) 43 | < 1e-5 44 | ) 45 | 46 | assert ( 47 | sum([abs(i - j) for i, j in 
zip(stats.get_sum(), np.sum(data, axis=0))]) 48 | < 1e-5 49 | ) 50 | 51 | assert ( 52 | sum( 53 | [ 54 | abs(i - j) 55 | for i, j in zip(stats.get_mean(), np.mean(data, axis=0)) 56 | ] 57 | ) 58 | < 1e-5 59 | ) 60 | 61 | assert ( 62 | sum([abs(i - j) for i, j in zip(stats.get_std(), np.std(data, axis=0))]) 63 | < 1e-5 64 | ) 65 | 66 | assert ( 67 | sum([abs(i - j) for i, j in zip(stats.get_var(), np.var(data, axis=0))]) 68 | < 1e-5 69 | ) 70 | 71 | 72 | def test_windowed_uni_stats(): 73 | 74 | ds = UnivariateDS() 75 | data = ds.data 76 | stream = StreamGenerator(data) 77 | stats = StreamStatistic(is_global=False, window_len=10) 78 | 79 | for X in stream.iter_item(): 80 | stats.update(X) 81 | 82 | assert stats.get_max() == np.max(data[-10:]) 83 | assert stats.get_min() == np.min(data[-10:]) 84 | assert stats.get_sum() == np.sum(data[-10:]) 85 | assert stats.get_mean() == np.mean(data[-10:]) 86 | assert stats.get_std() == np.std(data[-10:]) 87 | assert stats.get_var() == np.var(data[-10:]) 88 | 89 | 90 | def test_windows_multi_stats(): 91 | 92 | ds = MultivariateDS() 93 | data = ds.data 94 | stream = StreamGenerator(data) 95 | stats = StreamStatistic(is_global=False, window_len=10) 96 | 97 | for X in stream.iter_item(): 98 | stats.update(X) 99 | 100 | assert ( 101 | sum( 102 | [ 103 | abs(i - j) 104 | for i, j in zip(stats.get_max(), np.max(data[-10:], axis=0)) 105 | ] 106 | ) 107 | < 1e-5 108 | ) 109 | 110 | assert ( 111 | sum( 112 | [ 113 | abs(i - j) 114 | for i, j in zip(stats.get_min(), np.min(data[-10:], axis=0)) 115 | ] 116 | ) 117 | < 1e-5 118 | ) 119 | 120 | assert ( 121 | sum( 122 | [ 123 | abs(i - j) 124 | for i, j in zip(stats.get_sum(), np.sum(data[-10:], axis=0)) 125 | ] 126 | ) 127 | < 1e-5 128 | ) 129 | 130 | assert ( 131 | sum( 132 | [ 133 | abs(i - j) 134 | for i, j in zip(stats.get_mean(), np.mean(data[-10:], axis=0)) 135 | ] 136 | ) 137 | < 1e-5 138 | ) 139 | 140 | assert ( 141 | sum( 142 | [ 143 | abs(i - j) 144 | for i, j in zip(stats.get_std(), np.std(data[-10:], axis=0)) 145 | ] 146 | ) 147 | < 1e-5 148 | ) 149 | 150 | assert ( 151 | sum( 152 | [ 153 | abs(i - j) 154 | for i, j in zip(stats.get_var(), np.var(data[-10:], axis=0)) 155 | ] 156 | ) 157 | < 1e-5 158 | ) 159 | -------------------------------------------------------------------------------- /test/test_xstream.py: -------------------------------------------------------------------------------- 1 | from streamad.util import StreamGenerator, UnivariateDS, MultivariateDS 2 | from streamad.model import xStreamDetector 3 | 4 | 5 | def test_score(): 6 | ds = UnivariateDS() 7 | stream = StreamGenerator(ds.data) 8 | detector = xStreamDetector() 9 | for x in stream.iter_item(): 10 | score = detector.fit_score(x) 11 | 12 | if score is not None: 13 | assert type(score) is float 14 | 15 | 16 | def test_multi_score(): 17 | ds = MultivariateDS() 18 | stream = StreamGenerator(ds.data) 19 | detector = xStreamDetector() 20 | for x in stream.iter_item(): 21 | score = detector.fit_score(x) 22 | 23 | if score is not None: 24 | assert type(score) is float 25 | -------------------------------------------------------------------------------- /test/test_zscore.py: -------------------------------------------------------------------------------- 1 | from streamad.util import StreamGenerator, UnivariateDS 2 | from streamad.model import ZScoreDetector 3 | 4 | 5 | def test_score(): 6 | ds = UnivariateDS() 7 | stream = StreamGenerator(ds.data) 8 | detector = ZScoreDetector() 9 | for x in stream.iter_item(): 10 | score = 
detector.fit_score(x) 11 | 12 | if score is not None: 13 | assert type(score) is float 14 | --------------------------------------------------------------------------------
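Taken together, the tests above outline the intended end-to-end flow: a dataset is streamed item by item, a detector produces a raw anomaly score, and a calibrator normalizes it into [0, 1]. A minimal sketch of that flow, mirroring test_calibrator.py; the 0.9 alert threshold is an illustrative choice rather than a library default.

from streamad.model import KNNDetector
from streamad.process import ZScoreCalibrator
from streamad.util import StreamGenerator, UnivariateDS

ds = UnivariateDS()
detector = KNNDetector()
calibrator = ZScoreCalibrator(sigma=2, extreme_sigma=3)

for i, x in enumerate(StreamGenerator(ds.data).iter_item()):
    raw = detector.fit_score(x)        # may be None early in the stream
    score = calibrator.normalize(raw)  # calibrated anomaly score in [0, 1]
    if score is not None and score > 0.9:
        print(f"Possible anomaly at index {i}: score={score:.3f}")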