├── .gitattributes
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── SECURITY.md
├── aml_component
│   ├── README.md
│   ├── ad-pipeline.png
│   ├── ad_component.yaml
│   ├── conda.yaml
│   ├── constants.py
│   ├── error_messages.py
│   ├── invoker.py
│   ├── sr_detector.py
│   ├── tests
│   │   ├── __init__.py
│   │   ├── test_error_input.py
│   │   └── test_functionality.py
│   └── validation.py
├── main.py
├── msanomalydetector
│   ├── __init__.py
│   ├── _anomaly_kernel_cython.c
│   ├── _anomaly_kernel_cython.pyx
│   ├── boundary_utils.py
│   ├── spectral_residual.py
│   └── util.py
├── requirements.txt
├── samples
│   └── sample.csv
├── setup.py
├── srcnn
│   ├── competition_metric.py
│   ├── evalue.py
│   ├── generate_data.py
│   ├── net.py
│   ├── train.py
│   └── utils.py
├── tests
│   ├── __init__.py
│   ├── test_boundary_utils.py
│   └── test_spectral_residual.py
└── version.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Set the default behavior, in case people don't have core.autocrlf set.
2 | * text=auto
3 |
4 | # Use text conventions for commonly used text extensions.
5 | *.csv text
6 | *.ini text
7 | *.json text
8 | *.txt text
9 | *.xml text
10 |
11 | # Denote all files that are truly binary and should not be modified.
12 | *.dll binary
13 | *.exe binary
14 | *.gz binary
15 | *.ico binary
16 | *.jpg binary
17 | *.lib binary
18 | *.pdb binary
19 | *.pdf binary
20 | *.png binary
21 | *.wim binary
22 | *.zip binary
23 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Mac DS_Store files
2 | .DS_Store
3 |
4 | # Compiled class file
5 | *.class
6 |
7 | # Log file
8 | *.log
9 |
10 | # BlueJ files
11 | *.ctxt
12 |
13 | # Mobile Tools for Java (J2ME)
14 | .mtj.tmp/
15 |
16 | # Package Files #
17 | *.jar
18 | *.war
19 | *.nar
20 | *.ear
21 | *.zip
22 | *.tar.gz
23 | *.rar
24 |
25 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
26 | hs_err_pid*
27 |
28 | # Byte-compiled / optimized / DLL files
29 | __pycache__/
30 | *.py[cod]
31 | *$py.class
32 |
33 | # C extensions
34 | *.so
35 |
36 | # Distribution / packaging
37 | .Python
38 | build/
39 | develop-eggs/
40 | dist/
41 | downloads/
42 | eggs/
43 | .eggs/
44 | lib/
45 | lib64/
46 | parts/
47 | sdist/
48 | var/
49 | .idea/
50 | wheels/
51 | *.egg-info/
52 | .installed.cfg
53 | *.egg
54 | MANIFEST
55 |
56 | # PyInstaller
57 | # Usually these files are written by a python script from a template
58 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
59 | *.manifest
60 | *.spec
61 |
62 | # Installer logs
63 | pip-log.txt
64 | pip-delete-this-directory.txt
65 |
66 | # Unit test / coverage reports
67 | htmlcov/
68 | .tox/
69 | .coverage
70 | .coverage.*
71 | .cache
72 | nosetests.xml
73 | coverage.xml
74 | *.cover
75 | .hypothesis/
76 | .pytest_cache/
77 |
78 | # Translations
79 | *.mo
80 | *.pot
81 |
82 | # Django stuff:
83 | *.log
84 | local_settings.py
85 | db.sqlite3
86 |
87 | # Flask stuff:
88 | instance/
89 | .webassets-cache
90 |
91 | # Scrapy stuff:
92 | .scrapy
93 |
94 | # Sphinx documentation
95 | docs/_build/
96 |
97 | # PyBuilder
98 | target/
99 |
100 | # Jupyter Notebook
101 | .ipynb_checkpoints
102 |
103 | # pyenv
104 | .python-version
105 |
106 | # celery beat schedule file
107 | celerybeat-schedule
108 |
109 | # SageMath parsed files
110 | *.sage.py
111 |
112 | # Environments
113 | .env
114 | .venv
115 | env/
116 | venv/
117 | ENV/
118 | env.bak/
119 | venv.bak/
120 |
121 | # Spyder project settings
122 | .spyderproject
123 | .spyproject
124 |
125 | # Rope project settings
126 | .ropeproject
127 |
128 | # mkdocs documentation
129 | /site
130 |
131 | # mypy
132 | .mypy_cache/
133 |
134 | # Mac DS_Store files
135 | .DS_Store
136 | # VS code
137 | .vscode
138 |
139 | # Ev2 Generator binaries
140 | bin
141 | packages
142 | debug/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation. All rights reserved.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include version.py
2 | include setup.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Contributing
3 |
4 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
5 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
6 | the rights to use your contribution. For details, visit https://cla.microsoft.com.
7 |
8 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
9 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
10 | provided by the bot. You will only need to do this once across all repos using our CLA.
11 |
12 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
13 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
14 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
15 |
16 | Users can run SR by referring to the sample here:
17 |
18 | https://github.com/microsoft/anomalydetector/blob/master/main.py
19 | This sample runs SR only; for SR-CNN, please refer to the section below. Both SR and SR-CNN use the same evaluation in evalue.py.
20 |
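For quick reference, the snippet below sketches what that sample does (it mirrors main.py; the assumption is that each CSV under `samples/` has `timestamp` and `value` columns):

```python
import os
import pandas as pd
from msanomalydetector import SpectralResidual, THRESHOLD, MAG_WINDOW, SCORE_WINDOW, DetectMode

# Run the Spectral Residual detector over every CSV in the samples folder,
# the same way main.py does.
sample_dir = "samples"
for sample_file in os.listdir(sample_dir):
    series = pd.read_csv(os.path.join(sample_dir, sample_file))
    detector = SpectralResidual(series=series, threshold=THRESHOLD, mag_window=MAG_WINDOW,
                                score_window=SCORE_WINDOW, sensitivity=99,
                                detect_mode=DetectMode.anomaly_only)
    print(detector.detect())
```
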
21 | The SR-CNN project consists of three major parts.
22 | 1. generate_data.py preprocesses the data: the original continuous time series are split according to the window size, and artificial outliers are injected in proportion.
23 | `
24 | python generate_data.py --data <dataset>
25 | `
26 | where `<dataset>` is the file name in the data folder. If you want to change the default config, you can use the command line args:
27 | `
28 | python generate_data.py -data <dataset> --window 256 --step 128
29 | `
30 | 2. train.py is the network training module of SR-CNN. The SR transformation is applied to each time series before training.
31 | `
32 | python train.py -data <dataset>
33 | `
34 | 3. evalue.py is the evaluation module. As mentioned in our paper,
35 | `
36 | We evaluate our model from three aspects: accuracy, efficiency and generality. We use precision, recall and F1-score to indicate the accuracy of our model. In real applications, the human operators do not care about the point-wise metrics. It is acceptable for an algorithm to trigger an alert for any point in a contiguous anomaly segment if the delay is not too long. Thus, we adopt the evaluation strategy following [23]. We mark the whole segment of continuous anomalies as a positive sample, which means no matter how many anomalies have been detected in this segment, only one effective detection will be counted. If any point in an anomaly segment can be detected by the algorithm, and the delay of this point is no more than k from the start point of the anomaly segment, we say this segment is detected correctly. Thus, all points in this segment are treated as correct, and the points outside the anomaly segments are treated as normal.
37 | `
38 | We set different delays to verify whether a whole segment of anomalies can be detected in time. For example, when delay = 7, if the anomaly detector can issue an alarm within the first 7 points of an anomaly segment, the entire segment is considered to have been successfully detected; otherwise it is considered to have not been detected.
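The following sketch illustrates this delay-aware adjustment with a hypothetical helper (`adjust_prediction` is illustrative only and is not the code in evalue.py):

```python
import numpy as np

def adjust_prediction(label, pred, delay):
    """Mark a whole labeled anomaly segment as detected when any alert fires
    within the first `delay` points of the segment; otherwise mark it missed."""
    label = np.asarray(label, dtype=bool)
    adjusted = np.asarray(pred, dtype=bool).copy()
    i = 0
    while i < len(label):
        if label[i]:
            j = i
            while j < len(label) and label[j]:   # find the end of the anomaly segment
                j += 1
            hit = adjusted[i:min(i + delay, j)].any()
            adjusted[i:j] = hit                  # the whole segment is counted once
            i = j
        else:
            i += 1
    return adjusted
```

Precision, recall and F1-score are then computed on the adjusted predictions.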
39 | Run the code:
40 | `
41 | python evalue.py -data <dataset>
42 | `
43 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40 |
41 |
42 |
--------------------------------------------------------------------------------
/aml_component/README.md:
--------------------------------------------------------------------------------
1 | # Spectral Residual Anomaly Detection Component
2 |
3 | This folder specifies the Spectral Residual Anomaly Detection component that can be used in Azure Machine Learning designer. The details of the Spectral Residual algorithm can be found at https://arxiv.org/pdf/1906.03821.pdf.
4 |
5 | ## Component Specification
6 |
7 | This section describes the specification of [Spectral Residual Anomaly Detection Component](./ad_component.yaml).
8 |
9 | ### Input Specification
10 |
11 | * `Input`. AnyDirectory type means you need to register your dataset as a **File dataset** in the workspace. The dataset should contain at least 12 rows. Each row should contain a timestamp column and one or more columns that are to be detected.
12 | * `Detect Mode`. The following two detect modes are supported.
13 | 1. `AnomalyOnly`. In this mode, the module outputs columns `isAnomaly`, `mag` and `score`.
14 | 2. `AnomalyAndMargin`. In this mode, the module outputs columns `isAnomaly`, `mag`, `score`, `expectedValue`, `lowerBoundary`, `upperBoundary`.
15 | * `Timestamp Column`. The column that contains the timestamps. The timestamps should be in ascending order, and duplicate timestamps are not allowed.
16 | * `Value Column`. One or more columns that are to be detected. The data in these columns should be numeric. Absolute values greater than 1e100 are not allowed.
17 | * `Batch Size`. The number of rows to be detected in each batch. The batch size should be at least 12. Set this parameter to 0 or a negative number if you want to detect all rows in one batch.
18 | * `Threshold`. In AnomalyOnly mode, a point is detected as an anomaly if its `score` is greater than the threshold. In AnomalyAndMargin mode, this parameter and `Sensitivity` work together to filter anomalies.
19 | * `Sensitivity`. This parameter is used in AnomalyAndMargin mode to determine the range of the boundaries.
20 | * `Append result column to output`. If this parameter is set, the input data set will be output together with the results. Otherwise, only the results will be output.
21 |
22 | ### Output Specification
23 | The output data set will contain a subset of the following columns, depending on the `Detect Mode` parameter. If multiple value columns are selected, the value column name is appended to each result column name as a suffix.
24 | * `isAnomaly`. The anomaly result.
25 | * `mag`. The magnitude after spectral residual transformation.
26 | * `score`. A value that indicates the significance of the anomaly.
27 | In AnomalyAndMargin mode, the following columns will be output in addition to the above three columns.
28 | * `expectedValue`. The expected value of each point.
29 | * `lowerBoundary`. The lower boundary at each point that the algorithm can tolerate without flagging an anomaly.
30 | * `upperBoundary`. The upper boundary at each point that the algorithm can tolerate without flagging an anomaly.
31 |
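For illustration, the sketch below produces these columns outside the designer by calling the underlying detector directly, in the same way `sr_detector.py` does (the toy data frame and the non-default parameter values are assumptions):

```python
import pandas as pd
from msanomalydetector import SpectralResidual, DetectMode

# Four days of the 2-hourly pattern from the sample table later in this document:
# flat values of 22 with a daytime bump around 52 (48 points, above the 12-point minimum).
daily = [22, 22, 22, 22, 52.93, 52.82, 52.33, 52.82, 52.93, 22, 22, 22]
frame = pd.DataFrame({
    'timestamp': pd.date_range('2018-07-01', periods=48, freq='2H'),
    'value': daily * 4,
})

# Same constructor arguments that sr_detector.py passes to the detector;
# batch_size=0 runs all rows in a single batch.
model = SpectralResidual(frame, threshold=0.3, mag_window=3, score_window=40,
                         sensitivity=99, detect_mode=DetectMode.anomaly_and_margin,
                         batch_size=0)
result = model.detect()
print(result[['isAnomaly', 'mag', 'score',
              'expectedValue', 'lowerBoundary', 'upperBoundary']])
```

In AnomalyOnly mode, only `isAnomaly`, `mag` and `score` would be selected.
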
32 | ## How to create a new component in Azure Machine Learning
33 |
34 | Follow [this tutorial](https://github.com/Azure/AzureMachineLearningGallery/blob/main/tutorial/tutorial1-use-existing-components.md) to create a new component in your Azure Machine Learning workspace.
35 |
36 | After the component is created successfully, you can use it in Azure Machine Learning designer.
37 |
38 | ## How to build a pipeline in AML designer
39 |
40 | 1. Prepare input dataset for the component.
41 | Register this [sample AnomalyDetector-Manufacture dataset](https://github.com/microsoft/Cognitive-Samples-IntelligentKiosk/blob/master/Kiosk/Assets/AnomalyDetector/AnomalyDetector-Manufacture.csv) as **Tabular dataset** in your Azure Machine Learning workspace.
42 |
43 | The dataset above is a sample dataset. You can use your own dataset; make sure that it is registered as a Tabular dataset, and you can also preprocess your dataset using the Designer's built-in modules. Make sure that the input dataset of **Spectral Residual Anomaly Detection** has the following format, and that the time series contains more than 12 points:
44 |
45 | |Timestamp|Value|
46 | |---|---|
47 | |2018/7/1 0:00|22|
48 | |2018/7/1 2:00|22|
49 | |2018/7/1 4:00|22|
50 | |2018/7/1 6:00|22|
51 | |2018/7/1 8:00|52.93218322|
52 | |2018/7/1 10:00|52.81943684|
53 | |2018/7/1 12:00|52.33277765|
54 | |2018/7/1 14:00|52.82106858|
55 | |2018/7/1 16:00|52.93218322|
56 | |2018/7/1 18:00|22|
57 | |2018/7/1 20:00|22|
58 | |2018/7/1 22:00|22|
59 | |2018/7/2 0:00|22|
60 | |2018/7/2 2:00|22|
61 | |2018/7/2 4:00|22|
62 | |2018/7/2 6:00|22|
63 |
64 | 1. Open AML designer, create a new pipeline draft and drag the registered dataset to canvas.
65 |
66 | Add **Spectral Residual Anomaly Detection** to the canvas, connect it to the dataset, and configure the parameters. The pipeline graph looks like the following:
67 |
68 | 
69 |
70 | 1. Submit the pipeline.
71 | 1. When the pipeline run completes, you can click the **Visualize** icon in the **Outputs+logs** tab in the right panel of the **Spectral Residual Anomaly Detection** module, or right-click the module and select **Visualize**.
72 |
73 |
--------------------------------------------------------------------------------
/aml_component/ad-pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/anomalydetector/a3260ea0ddfb868986b924a245e003a97143f9df/aml_component/ad-pipeline.png
--------------------------------------------------------------------------------
/aml_component/ad_component.yaml:
--------------------------------------------------------------------------------
1 | $schema: http://azureml/sdk-2-0/CommandComponent.json
2 | name: microsoft.com.office.spectral.residual.anomaly.detection
3 | version: 1.1.1
4 | display_name: Spectral Residual Anomaly Detection
5 | is_deterministic: True
6 | type: CommandComponent
7 | description: This module implements the spectral residual anomaly detection algorithm for time-series.
8 | tags:
9 | time series: ''
10 | anomaly detection: ''
11 | inputs:
12 | dataset:
13 | type: DataFrameDirectory
14 | optional: False
15 | detect_mode:
16 | type: Enum
17 | optional: False
18 | default: AnomalyOnly
19 | description: Specify the detection mode.
20 | enum:
21 | - AnomalyOnly
22 | - AnomalyAndMargin
23 | timestamp_column:
24 | type: String
25 | optional: False
26 | description: Choose the column that contains timestamps.
27 | value_column:
28 | type: String
29 | optional: False
30 | description: Choose the column that contains values.
31 | batch_size:
32 | type: Integer
33 | optional: False
34 | default: 2000
35 |     description: This parameter specifies the size of each batch on which detection is performed; 0 indicates to run all data in a single batch.
36 | min: 0
37 | threshold:
38 | type: Float
39 | optional: False
40 | default: 0.3
41 |     description: This parameter specifies the anomaly score threshold above which a point is judged to be an anomaly.
42 | min: 0.0
43 | max: 1.0
44 | sensitivity:
45 | type: Float
46 | optional: False
47 | default: 99
48 | description: This parameter is used in AnomalyAndMargin mode to control the width of margin.
49 | min: 0.0
50 | max: 100.0
51 | append_result_columns_to_output:
52 | type: Boolean
53 | optional: False
54 | default: True
55 | description: Append result columns to the original columns as output
56 | compute_stats_in_visualization:
57 | type: Boolean
58 | optional: False
59 | default: True
60 | description: Compute stats in visualization
61 | outputs:
62 | output_port:
63 | type: DataFrameDirectory
64 | environment:
65 | conda:
66 | conda_dependencies:
67 | name: project_environment
68 | channels:
69 | - defaults
70 | dependencies:
71 | - python=3.6.8
72 | - cython=0.29.2
73 | - numpy=1.18.1
74 | - pip=20.0
75 | - pip:
76 | - azureml-sdk==0.1.0.*
77 | - azureml-designer-core==0.0.31
78 | - --index-url https://azuremlsdktestpypi.azureedge.net/dev/aml/office/134157926D8F
79 | - --extra-index-url https://pypi.org/simple
80 | - pandas==0.25.3
81 | - pyarrow==0.16.0
82 | - matplotlib==3.1.0
83 | - git+https://github.com/microsoft/anomalydetector.git@1.1
84 | docker:
85 | image: mcr.microsoft.com/azureml/base:intelmpi2018.3-ubuntu16.04
86 | os: Linux
87 | command: python invoker.py --input {inputs.dataset} --detect-mode {inputs.detect_mode} --timestamp-column {inputs.timestamp_column} --value-column {inputs.value_column} --batch-size {inputs.batch_size} --threshold {inputs.threshold} --sensitivity {inputs.sensitivity} --append-mode {inputs.append_result_columns_to_output} --compute_stats_in_visualization {inputs.compute_stats_in_visualization} --output {outputs.output_port}
88 | ...
--------------------------------------------------------------------------------
/aml_component/conda.yaml:
--------------------------------------------------------------------------------
1 | name: project_environment
2 | channels:
3 | - defaults
4 | dependencies:
5 | - python=3.6.8
6 | - cython=0.29.2
7 | - numpy=1.18.1
8 | - pip:
9 | - azureml-sdk==0.1.0.*
10 | - --index-url https://azuremlsdktestpypi.azureedge.net/dev/aml/office/134157926D8F
11 | - --extra-index-url https://pypi.org/simple
12 | - pandas==0.25.3
13 | - pyarrow==0.16.0
14 | - matplotlib==3.1.0
15 | - git+https://github.com/microsoft/anomalydetector.git@1.1
16 |
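# Usage note (assumption, not part of the component spec): this environment can be created
# locally with `conda env create -f conda.yaml` and activated with `conda activate project_environment`.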
--------------------------------------------------------------------------------
/aml_component/constants.py:
--------------------------------------------------------------------------------
1 | VALUE_LOWER_BOUND = -1.0e100
2 | VALUE_UPPER_BOUND = 1.0e100
3 | MIN_POINTS = 12
4 |
--------------------------------------------------------------------------------
/aml_component/error_messages.py:
--------------------------------------------------------------------------------
1 | InvalidTimestamps = '''The timestamp column specified is malformed.'''
2 | InvalidSeriesOrder = '''The timestamp column specified is not in ascending order.'''
3 | DuplicateSeriesTimestamp = '''The timestamp column specified has duplicated timestamps.'''
4 | InvalidValueFormat = '''The data in column "{0}" can not be parsed as float values.'''
5 | InvalidSeriesValue = '''The data in column "{0}" contains nan values.'''
6 | ValueOverflow = '''The magnitude of data in column "{0}" exceeds limitation.'''
7 | NotEnoughPoints = '''The dataset should contain at least {0} points to run this module.'''
8 | InvalidBatchSize = '''The "batchSize" parameter should be at least {0} or 0 ''' \
9 | '''that indicates to run all data in a batch.'''
10 | ColumnNotFoundError = '''Column with name or index "{0}" not found.'''
11 |
--------------------------------------------------------------------------------
/aml_component/invoker.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os
4 | import pathlib
5 | import sr_detector
6 | import numpy as np
7 | import pandas as pd
8 | from error_messages import *
9 | from constants import *
10 | from azureml.studio.core.io.data_frame_directory import load_data_frame_from_directory, save_data_frame_to_directory
11 |
12 | PACKAGE_NAME = 'spectral_residual_anomaly_detection_module'
13 | VERSION = '1.0.0'
14 |
15 |
16 | def str2bool(v):
17 | if isinstance(v, bool):
18 | return v
19 | if v.lower() in ('yes', 'true', 't', 'y', '1'):
20 | return True
21 | elif v.lower() in ('no', 'false', 'f', 'n', '0'):
22 | return False
23 | else:
24 | raise argparse.ArgumentTypeError('Boolean value expected.')
25 |
26 |
27 | def is_timestamp_ascending(timestamps):
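    # Returns 0 when the timestamps are strictly ascending, -1 when they are out of order,
    # and -2 when duplicate timestamps exist.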
28 | count = len(timestamps)
29 |
30 | for i in range(count - 1):
31 | if timestamps[i] > timestamps[i + 1]:
32 | return -1
33 | elif timestamps[i] == timestamps[i + 1]:
34 | return -2
35 | return 0
36 |
37 |
38 | def invoke(input_path, detect_mode, timestamp_column, value_column, batch_size, threshold, sensitivity,
39 | appendMode, compute_stats_in_visualization, output_path):
40 | df = load_data_frame_from_directory(input_path).data
41 | logging.info(f"Shape of loaded DataFrame: {df.shape}")
42 |
43 | if df.shape[0] < MIN_POINTS:
44 | raise Exception(NotEnoughPoints.format(MIN_POINTS))
45 |
46 | if 0 < batch_size < MIN_POINTS:
47 | raise Exception(InvalidBatchSize.format(MIN_POINTS))
48 |
49 | if timestamp_column not in list(df.columns):
50 | raise Exception(ColumnNotFoundError.format(timestamp_column))
51 |
52 | if value_column not in list(df.columns):
53 | raise Exception(ColumnNotFoundError.format(value_column))
54 |
55 | timestamp = pd.DataFrame(df, columns=[timestamp_column])
56 | timestamps = pd.to_datetime(timestamp.iloc[:, 0].values)
57 |
58 | if np.any(np.isnat(timestamps)):
59 | raise Exception(InvalidTimestamps)
60 |
61 | res = is_timestamp_ascending(timestamps)
62 |
63 | if res == -1:
64 | raise Exception(InvalidSeriesOrder)
65 | elif res == -2:
66 | raise Exception(DuplicateSeriesTimestamp)
67 |
68 | data_columns = pd.DataFrame(df, columns=[value_column])
69 |
70 | for col in data_columns:
71 | try:
72 | float_data = data_columns[col].apply(float)
73 | except Exception as e:
74 | raise Exception(InvalidValueFormat.format(col))
75 |
76 | if not np.all(np.isfinite(float_data)):
77 | raise Exception(InvalidSeriesValue.format(col))
78 |
79 | if np.any(np.less(float_data, VALUE_LOWER_BOUND)) or np.any(np.greater(float_data, VALUE_UPPER_BOUND)):
80 | raise Exception(ValueOverflow.format(col))
81 |
82 | data_columns[col] = float_data
83 |
84 | result = sr_detector.detect(timestamps, data_columns, detect_mode=detect_mode,
85 | batch_size=batch_size, threshold=threshold, sensitivity=sensitivity)
86 |
87 | if appendMode is True:
88 | result = pd.merge(df, result, left_index=True, right_index=True)
89 |
90 | save_data_frame_to_directory(output_path, result, compute_stats_in_visualization=compute_stats_in_visualization)
91 |
92 | def main():
93 | parser = argparse.ArgumentParser()
94 |
95 | parser.add_argument(
96 | '--input-path',
97 | help='Input Dataframe path'
98 | )
99 |
100 | parser.add_argument(
101 | '--detect-mode',
102 | choices=['AnomalyOnly', 'AnomalyAndMargin'],
103 | help='Specify the detect mode.'
104 | )
105 |
106 | parser.add_argument(
107 | '--timestamp-column',
108 | help='This parameter specifies the column that contains timestamps.'
109 | )
110 |
111 | parser.add_argument(
112 | '--value-column',
113 | help='This parameter specifies the column that contains values.'
114 | )
115 |
116 | parser.add_argument(
117 | '--batch-size', type=int,
118 |         help='This parameter specifies the size of each batch on which detection is performed.'
119 | )
120 |
121 | parser.add_argument(
122 | '--threshold', type=float,
123 | help='This parameter specifies the threshold anomaly score that a point is judged as anomaly.'
124 | )
125 |
126 | parser.add_argument(
127 | '--sensitivity', type=float,
128 | help='This parameter is used in AnomalyAndMargin mode to control the width of margin.'
129 | )
130 |
131 | parser.add_argument(
132 | '--append-mode', type=str2bool, default=False,
133 |         help='If set to True, the result columns are appended to the input columns in the output.'
134 | )
135 |
136 | parser.add_argument(
137 | '--compute-stats-in-visualization', type=str2bool, default=True,
138 | help='Enable this parameter to get stats visualization.'
139 | )
140 |
141 | parser.add_argument(
142 | '--output-path',
143 | help='Output Dataframe path'
144 | )
145 |
146 | args, _ = parser.parse_known_args()
147 |
148 | logging.info(f"Hello world from {PACKAGE_NAME} {VERSION}")
149 |
150 | logging.debug("Received parameters:")
151 | logging.debug(f"input: {args.input_path}")
152 | logging.debug(f"detect mode: {args.detect_mode}")
153 | logging.debug(f"timestamp column: {args.timestamp_column}")
154 | logging.debug(f"value column: {args.value_column}")
155 | logging.debug(f"batch size: {args.batch_size}")
156 | logging.debug(f"threshold: {args.threshold}")
157 | logging.debug(f"sensitivity: {args.sensitivity}")
158 | logging.debug(f"appendMode: {args.append_mode}")
159 | logging.debug(f"output path: {args.output_path}")
160 |
161 | invoke(args.input_path, args.detect_mode, args.timestamp_column, args.value_column,
162 | args.batch_size, args.threshold, args.sensitivity, args.append_mode,
163 | args.compute_stats_in_visualization, args.output_path)
164 |
165 |
166 | if __name__ == '__main__':
167 | main()
168 |
--------------------------------------------------------------------------------
/aml_component/sr_detector.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from msanomalydetector import SpectralResidual, DetectMode
3 | import matplotlib
4 | import matplotlib.pyplot as plt
5 | import logging
6 | from azureml.core.run import Run
7 | import os
8 |
9 |
10 | def log_plot_result(input_df, output_df, col_name, mode):
11 | fig = plt.figure(figsize=(20, 10))
12 | ax1 = fig.add_subplot(211)
13 | if mode == 'AnomalyAndMargin':
14 | ax1.fill_between(output_df.index, output_df['lowerBoundary'], output_df['upperBoundary'], color='grey', alpha=0.2, zorder=1)
15 | ax1.plot(output_df.index, output_df['expectedValue'], alpha=0.5, label='expected value', zorder=8)
16 | ax1.plot(input_df.index, input_df['value'], label='value', zorder=5)
17 | ax1.legend()
18 | anomalies = input_df[output_df['isAnomaly']]
19 | ax1.scatter(anomalies.index, anomalies['value'], c='red', zorder=10)
20 | ax1.set_title(col_name)
21 |
22 | ax2 = fig.add_subplot(212)
23 | ax2.plot(output_df.index, output_df['mag'])
24 | ax2.set_title('mag')
25 |
26 | run = Run.get_context()
27 | run.log_image(col_name, plot=plt)
28 |
29 |
30 | def sr_detect(frame, detect_mode, batch_size, threshold, sensitivity):
31 | model = SpectralResidual(frame, threshold=threshold, mag_window=3, score_window=40,
32 | sensitivity=sensitivity, detect_mode=DetectMode(detect_mode), batch_size=batch_size)
33 | result = model.detect()
34 |
35 | if detect_mode == DetectMode.anomaly_and_margin.value:
36 | return result[['isAnomaly', 'mag', 'score', 'expectedValue', 'lowerBoundary', 'upperBoundary']]
37 | return result[['isAnomaly', 'mag', 'score']]
38 |
39 |
40 | def detect(timestamp, data_to_detect, detect_mode, batch_size, threshold=0.3, sensitivity=99):
41 |
42 | column_length = len(data_to_detect.columns)
43 | if column_length == 1:
44 | logging.debug('single column to detect')
45 |
46 | frame = pd.DataFrame(columns=['timestamp', 'value'])
47 | frame['timestamp'] = timestamp
48 | frame['value'] = data_to_detect.iloc[:, 0]
49 | output = sr_detect(frame, detect_mode, batch_size, threshold, sensitivity)
50 | log_plot_result(frame, output, data_to_detect.columns[0], detect_mode)
51 | else:
52 | logging.debug(f'detect {column_length} columns')
53 | output = pd.DataFrame()
54 |
55 | for col in data_to_detect.columns:
56 | frame = pd.DataFrame(columns=['timestamp', 'value'])
57 | frame['timestamp'] = timestamp
58 | frame['value'] = data_to_detect[col]
59 | result = sr_detect(frame, detect_mode, batch_size, threshold, sensitivity)
60 | log_plot_result(frame, result, col, detect_mode)
61 | result.columns = [f'{rc}_{col}' for rc in result.columns]
62 | output = pd.concat((output, result), axis=1)
63 |
64 | return output
65 |
--------------------------------------------------------------------------------
/aml_component/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/anomalydetector/a3260ea0ddfb868986b924a245e003a97143f9df/aml_component/tests/__init__.py
--------------------------------------------------------------------------------
/aml_component/tests/test_error_input.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append('../')
3 |
4 | import unittest
5 | import numpy as np
6 | import pandas as pd
7 | import shutil
8 | import os
9 | import invoker
10 |
11 |
12 | class TestErrorInput(unittest.TestCase):
13 | def setUp(self):
14 | self.__input_path = './error_test_input_file.csv'
15 | self.__detect_mode = 'AnomalyOnly'
16 | self.__timestamp_column = 'timestamp'
17 | self.__value_column = 'value'
18 | self.__batch_size = 2000
19 | self.__threshold = 0.3
20 | self.__sensitivity = 99
21 | self.__append_mode = True
22 | self.compute_stats_in_visualization = False
23 | self.__output_path = './error_test_output_directory'
24 |
25 | def tearDown(self):
26 | self.deleteDataFrameDirectory()
27 |
28 | def deleteDataFrameDirectory(self):
29 | if os.path.exists(self.__input_path):
30 | os.remove(self.__input_path)
31 |
32 | if os.path.exists(self.__output_path):
33 | shutil.rmtree(self.__output_path)
34 |
35 | def test_empty_input(self):
36 | df = pd.DataFrame()
37 | df.to_csv(self.__input_path)
38 | self.assertRaisesRegexp(Exception, "The dataset should contain at least 12 points to run this module.",
39 | invoker.invoke,
40 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column,
41 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode,
42 | self.__output_path)
43 |
44 | def test_invalid_timestamp(self):
45 | df = pd.DataFrame()
46 | df['timestamp'] = 'invalid'
47 | df['value'] = np.ones(20)
48 | df.to_csv(self.__input_path)
49 | self.assertRaisesRegexp(Exception, "The timestamp column specified is malformed.",
50 | invoker.invoke,
51 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column,
52 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode,
53 | self.__output_path)
54 |
55 | def test_invalid_series_order(self):
56 | df = pd.DataFrame()
57 | timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D')[::-1]
58 | df['timestamp'] = timestamps
59 | df['value'] = np.ones(20)
60 | df.to_csv(self.__input_path)
61 | self.assertRaisesRegexp(Exception, "The timestamp column specified is not in ascending order.",
62 | invoker.invoke,
63 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column,
64 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode,
65 | self.__output_path)
66 |
67 |     def test_duplicate_series(self):
68 | df = pd.DataFrame()
69 | df['value'] = np.ones(20)
70 | df['timestamp'] = '2020-01-01'
71 | df.to_csv(self.__input_path)
72 | self.assertRaisesRegexp(Exception, "The timestamp column specified has duplicated timestamps.",
73 | invoker.invoke,
74 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column,
75 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode,
76 | self.__output_path)
77 |
78 | def test_invalid_value_format(self):
79 | df = pd.DataFrame()
80 | timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D')
81 | df['timestamp'] = timestamps
82 | df['value'] = 'invalid'
83 | df.to_csv(self.__input_path)
84 | self.assertRaisesRegexp(Exception, 'The data in column "value" can not be parsed as float values.',
85 | invoker.invoke,
86 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column,
87 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode,
88 | self.__output_path)
89 |
90 | def test_invalid_series_value(self):
91 | df = pd.DataFrame()
92 | timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D')
93 | df['timestamp'] = timestamps
94 | df['value'] = np.nan
95 | df.to_csv(self.__input_path)
96 | self.assertRaisesRegexp(Exception, 'The data in column "value" contains nan values.',
97 | invoker.invoke,
98 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column,
99 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode,
100 | self.__output_path)
101 |
102 | def test_value_overflow(self):
103 | df = pd.DataFrame()
104 | timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D')
105 | df['timestamp'] = timestamps
106 | df['value'] = 1e200
107 | df.to_csv(self.__input_path)
108 | self.assertRaisesRegexp(Exception, 'The magnitude of data in column "value" exceeds limitation.',
109 | invoker.invoke,
110 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column,
111 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode,
112 | self.__output_path)
113 |
114 | def test_not_enough_points(self):
115 | df = pd.DataFrame()
116 | timestamps = pd.date_range(start='2020-01-01', periods=10, freq='1D')
117 | df['timestamp'] = timestamps
118 | df['value'] = np.sin(np.linspace(1, 10, 10))
119 | df.to_csv(self.__input_path)
120 | self.assertRaisesRegexp(Exception, "The dataset should contain at least 12 points to run this module.",
121 | invoker.invoke,
122 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column,
123 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode,
124 | self.__output_path)
125 |
126 | def test_invalid_batch_size(self):
127 | df = pd.DataFrame()
128 | timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D')
129 | df['timestamp'] = timestamps
130 | df['value'] = np.sin(np.linspace(1, 10, 20))
131 | df.to_csv(self.__input_path)
132 | self.assertRaisesRegexp(Exception, 'The "batchSize" parameter should be at least 12 or 0 that indicates to run all data in a batch',
133 | invoker.invoke,
134 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column,
135 | 5, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path)
136 |
137 | def test_timestamp_column_missing(self):
138 | df = pd.DataFrame()
139 | timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D')
140 | df['time'] = timestamps
141 | df['value'] = np.sin(np.linspace(1, 10, 20))
142 | df.to_csv(self.__input_path)
143 | self.assertRaisesRegexp(Exception, 'Column with name or index "timestamp" not found.',
144 | invoker.invoke,
145 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column,
146 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode,
147 | self.__output_path)
148 |
149 | def test_value_column_missing(self):
150 | df = pd.DataFrame()
151 | timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D')
152 | df['timestamp'] = timestamps
153 | df['missed'] = np.sin(np.linspace(1, 10, 20))
154 | df.to_csv(self.__input_path)
155 | self.assertRaisesRegexp(Exception, 'Column with name or index "value" not found.',
156 | invoker.invoke,
157 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column,
158 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode,
159 | self.__output_path)
160 |
161 |
162 | if __name__ == '__main__':
163 | unittest.main()
164 |
--------------------------------------------------------------------------------
/aml_component/tests/test_functionality.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append('../')
3 |
4 | import unittest
5 | import numpy as np
6 | import pandas as pd
7 | import shutil
8 | import os
9 | import invoker
10 |
11 |
12 | class TestErrorInput(unittest.TestCase):
13 | def setUp(self):
14 | self.__input_path = './functional_test_input_folder'
15 | self.__input_csv_file = './functional_test_input_file.csv'
16 | self.__input_parquet_file = './functional_test_input_file.parquet'
17 | self.__detect_mode = 'AnomalyOnly'
18 | self.__timestamp_column = 'timestamp'
19 | self.__value_column = 'value'
20 | self.__batch_size = 2000
21 | self.__threshold = 0.3
22 | self.__sensitivity = 99
23 | self.__append_mode = True
24 | self.__output_path = './functional_test_output_directory'
25 |
26 | def tearDown(self):
27 | self.deleteDataFrameDirectory()
28 |
29 | def deleteDataFrameDirectory(self):
30 | if os.path.exists(self.__input_path):
31 | shutil.rmtree(self.__input_path)
32 |
33 | if os.path.exists(self.__input_csv_file):
34 | os.remove(self.__input_csv_file)
35 |
36 | if os.path.exists(self.__input_parquet_file):
37 | os.remove(self.__input_parquet_file)
38 |
39 | if os.path.exists(self.__output_path):
40 | shutil.rmtree(self.__output_path)
41 |
42 | def generate_input_data_frame(self, start_date: str = '2020-01-01'):
43 | df = pd.DataFrame()
44 | df['timestamp'] = pd.date_range(start=start_date, periods=200, freq='1D')
45 | df['value'] = np.sin(np.linspace(1, 20, 200))
46 | return df
47 |
48 | def generate_input_folder(self, file_type: str = 'csv'):
49 | if not os.path.isdir(self.__input_path):
50 | os.mkdir(self.__input_path)
51 | start_dates = ['2018-01-01', '2019-01-01', '2020-01-01']
52 | for start_date in start_dates:
53 | df = self.generate_input_data_frame(start_date)
54 | if file_type == 'csv':
55 | df.to_csv(f"{self.__input_path}/{start_date}.csv", index=False)
56 | elif file_type == 'parquet':
57 | df.to_parquet(f"{self.__input_path}/{start_date}.parquet", index=False)
58 | else:
59 | raise Exception(f'Unsupported input data type {file_type}, only csv and parquet file are allowed')
60 |
61 | def testAnomalyOnlyModeCsvFile(self):
62 | df = self.generate_input_data_frame()
63 | df.to_csv(self.__input_csv_file, index=False)
64 | invoker.invoke(self.__input_csv_file, self.__detect_mode, self.__timestamp_column, self.__value_column,
65 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path)
66 | result = pd.read_csv(f"{self.__output_path}/output.csv")
67 | self.assertEqual(result.shape[0], 200)
68 | self.assertTrue('value' in result.columns)
69 | self.assertTrue('isAnomaly' in result.columns)
70 | self.assertTrue('score' in result.columns)
71 | self.assertTrue('expectedValue' not in result.columns)
72 | self.assertTrue('upperBoundary' not in result.columns)
73 | self.assertTrue('lowerBoundary' not in result.columns)
74 |
75 | def testAnomalyOnlyModeCsvFolder(self):
76 | self.generate_input_folder()
77 | invoker.invoke(self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column,
78 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path)
79 | result = pd.read_csv(f"{self.__output_path}/output.csv")
80 | self.assertEqual(result.shape[0], 600)
81 | self.assertTrue('value' in result.columns)
82 | self.assertTrue('isAnomaly' in result.columns)
83 | self.assertTrue('score' in result.columns)
84 | self.assertTrue('expectedValue' not in result.columns)
85 | self.assertTrue('upperBoundary' not in result.columns)
86 | self.assertTrue('lowerBoundary' not in result.columns)
87 |
88 | def testAnomalyOnlyModeParquetFile(self):
89 | df = self.generate_input_data_frame()
90 | df.to_parquet(self.__input_parquet_file, index=False)
91 | invoker.invoke(self.__input_parquet_file, self.__detect_mode, self.__timestamp_column, self.__value_column,
92 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path)
93 | result = pd.read_csv(f"{self.__output_path}/output.csv")
94 | self.assertEqual(result.shape[0], 200)
95 | self.assertTrue('value' in result.columns)
96 | self.assertTrue('isAnomaly' in result.columns)
97 | self.assertTrue('score' in result.columns)
98 | self.assertTrue('expectedValue' not in result.columns)
99 | self.assertTrue('upperBoundary' not in result.columns)
100 | self.assertTrue('lowerBoundary' not in result.columns)
101 |
102 | def testAnomalyOnlyModeParquetFolder(self):
103 | self.generate_input_folder('parquet')
104 | invoker.invoke(self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column,
105 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path)
106 | result = pd.read_csv(f"{self.__output_path}/output.csv")
107 | self.assertEqual(result.shape[0], 600)
108 | self.assertTrue('value' in result.columns)
109 | self.assertTrue('isAnomaly' in result.columns)
110 | self.assertTrue('score' in result.columns)
111 | self.assertTrue('expectedValue' not in result.columns)
112 | self.assertTrue('upperBoundary' not in result.columns)
113 | self.assertTrue('lowerBoundary' not in result.columns)
114 |
115 | def testAnomalyAndMarginCsvFile(self):
116 | df = self.generate_input_data_frame()
117 | df.to_csv(self.__input_csv_file, index=False)
118 | invoker.invoke(self.__input_csv_file, "AnomalyAndMargin", self.__timestamp_column, self.__value_column,
119 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path)
120 | result = pd.read_csv(f"{self.__output_path}/output.csv")
121 | self.assertEqual(result.shape[0], 200)
122 | self.assertTrue('value' in result.columns)
123 | self.assertTrue('isAnomaly' in result.columns)
124 | self.assertTrue('score' in result.columns)
125 | self.assertTrue('expectedValue' in result.columns)
126 | self.assertTrue('upperBoundary' in result.columns)
127 | self.assertTrue('lowerBoundary' in result.columns)
128 |
129 | def testAnomalyAndMarginCsvFolder(self):
130 | self.generate_input_folder()
131 | invoker.invoke(self.__input_path, "AnomalyAndMargin", self.__timestamp_column, self.__value_column,
132 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path)
133 | result = pd.read_csv(f"{self.__output_path}/output.csv")
134 | self.assertEqual(result.shape[0], 600)
135 | self.assertTrue('value' in result.columns)
136 | self.assertTrue('isAnomaly' in result.columns)
137 | self.assertTrue('score' in result.columns)
138 | self.assertTrue('expectedValue' in result.columns)
139 | self.assertTrue('upperBoundary' in result.columns)
140 | self.assertTrue('lowerBoundary' in result.columns)
141 |
142 | def testAnomalyAndMarginParquetFile(self):
143 | df = self.generate_input_data_frame()
144 | df.to_parquet(self.__input_parquet_file, index=False)
145 | invoker.invoke(self.__input_parquet_file, "AnomalyAndMargin", self.__timestamp_column, self.__value_column,
146 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path)
147 | result = pd.read_csv(f"{self.__output_path}/output.csv")
148 | self.assertEqual(result.shape[0], 200)
149 | self.assertTrue('value' in result.columns)
150 | self.assertTrue('isAnomaly' in result.columns)
151 | self.assertTrue('score' in result.columns)
152 | self.assertTrue('expectedValue' in result.columns)
153 | self.assertTrue('upperBoundary' in result.columns)
154 | self.assertTrue('lowerBoundary' in result.columns)
155 |
156 | def testAnomalyAndMarginParquetFolder(self):
157 | self.generate_input_folder('parquet')
158 | invoker.invoke(self.__input_path, "AnomalyAndMargin", self.__timestamp_column, self.__value_column,
159 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path)
160 | result = pd.read_csv(f"{self.__output_path}/output.csv")
161 | self.assertEqual(result.shape[0], 600)
162 | self.assertTrue('value' in result.columns)
163 | self.assertTrue('isAnomaly' in result.columns)
164 | self.assertTrue('score' in result.columns)
165 | self.assertTrue('expectedValue' in result.columns)
166 | self.assertTrue('upperBoundary' in result.columns)
167 | self.assertTrue('lowerBoundary' in result.columns)
168 |
169 | def testBatchModeCsvFile(self):
170 | df = self.generate_input_data_frame()
171 | df.to_csv(self.__input_csv_file, index=False)
172 | invoker.invoke(self.__input_csv_file, "AnomalyAndMargin", self.__timestamp_column, self.__value_column,
173 | 66, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path)
174 | result = pd.read_csv(f"{self.__output_path}/output.csv")
175 | self.assertEqual(result.shape[0], 200)
176 | self.assertTrue('value' in result.columns)
177 | self.assertTrue('isAnomaly' in result.columns)
178 | self.assertTrue('score' in result.columns)
179 | self.assertTrue('expectedValue' in result.columns)
180 | self.assertTrue('upperBoundary' in result.columns)
181 | self.assertTrue('lowerBoundary' in result.columns)
182 |
183 | def testBatchModeCsvFolder(self):
184 | self.generate_input_folder()
185 | invoker.invoke(self.__input_path, "AnomalyAndMargin", self.__timestamp_column, self.__value_column,
186 | 66, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path)
187 | result = pd.read_csv(f"{self.__output_path}/output.csv")
188 | self.assertEqual(result.shape[0], 600)
189 | self.assertTrue('value' in result.columns)
190 | self.assertTrue('isAnomaly' in result.columns)
191 | self.assertTrue('score' in result.columns)
192 | self.assertTrue('expectedValue' in result.columns)
193 | self.assertTrue('upperBoundary' in result.columns)
194 | self.assertTrue('lowerBoundary' in result.columns)
195 |
196 | def testBatchModeParquetFile(self):
197 | df = self.generate_input_data_frame()
198 | df.to_parquet(self.__input_parquet_file, index=False)
199 | invoker.invoke(self.__input_parquet_file, "AnomalyAndMargin", self.__timestamp_column, self.__value_column,
200 | 66, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path)
201 | result = pd.read_csv(f"{self.__output_path}/output.csv")
202 | self.assertEqual(result.shape[0], 200)
203 | self.assertTrue('value' in result.columns)
204 | self.assertTrue('isAnomaly' in result.columns)
205 | self.assertTrue('score' in result.columns)
206 | self.assertTrue('expectedValue' in result.columns)
207 | self.assertTrue('upperBoundary' in result.columns)
208 | self.assertTrue('lowerBoundary' in result.columns)
209 |
210 | def testBatchModeParquetFolder(self):
211 | self.generate_input_folder('parquet')
212 | invoker.invoke(self.__input_path, "AnomalyAndMargin", self.__timestamp_column, self.__value_column,
213 | 66, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path)
214 | result = pd.read_csv(f"{self.__output_path}/output.csv")
215 | self.assertEqual(result.shape[0], 600)
216 | self.assertTrue('value' in result.columns)
217 | self.assertTrue('isAnomaly' in result.columns)
218 | self.assertTrue('score' in result.columns)
219 | self.assertTrue('expectedValue' in result.columns)
220 | self.assertTrue('upperBoundary' in result.columns)
221 | self.assertTrue('lowerBoundary' in result.columns)
222 |
223 | if __name__ == '__main__':
224 | unittest.main()
225 |
--------------------------------------------------------------------------------
/aml_component/validation.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/anomalydetector/a3260ea0ddfb868986b924a245e003a97143f9df/aml_component/validation.py
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from msanomalydetector import SpectralResidual
2 | from msanomalydetector import THRESHOLD, MAG_WINDOW, SCORE_WINDOW, DetectMode
3 | import os
4 | import pandas as pd
5 |
6 |
7 | def detect_anomaly(series, threshold, mag_window, score_window, sensitivity, detect_mode):
8 | detector = SpectralResidual(series=series, threshold=threshold, mag_window=mag_window, score_window=score_window,
9 | sensitivity=sensitivity, detect_mode=detect_mode)
10 | print(detector.detect())
11 |
12 |
13 | if __name__ == '__main__':
14 | sample_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "samples"))
15 | for sample_file in os.listdir(sample_dir):
16 | sample = pd.read_csv(os.path.join(sample_dir, sample_file))
17 | detect_anomaly(sample, THRESHOLD, MAG_WINDOW, SCORE_WINDOW, 99, DetectMode.anomaly_only)
18 |
--------------------------------------------------------------------------------
/msanomalydetector/__init__.py:
--------------------------------------------------------------------------------
1 | from msanomalydetector.spectral_residual import SpectralResidual
2 | from msanomalydetector.util import MAX_RATIO, THRESHOLD, MAG_WINDOW, SCORE_WINDOW, DetectMode
3 |
4 | __all__ = ['SpectralResidual', 'MAX_RATIO', 'THRESHOLD', 'MAG_WINDOW', 'SCORE_WINDOW', 'DetectMode']
5 |
--------------------------------------------------------------------------------
/msanomalydetector/_anomaly_kernel_cython.pyx:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | cimport numpy as np
3 | import array
4 | import bisect
5 |
6 |
7 | cpdef float sorted_median(float[:] data, int i, int j):
8 | cdef int n = j - i
9 | cdef int mid
10 | if n == 0:
11 | raise Exception("no median for empty data")
12 | if n % 2 == 1:
13 | return data[i + n // 2]
14 | else:
15 | mid = i + n // 2
16 | return (data[mid - 1] + data[mid])/2
17 |
18 | cpdef median_filter(np.ndarray data, int window, bint need_two_end=False):
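    # Sliding-window median filter: keep the current window sorted (bisect-based insert/remove)
    # and write the median of the window at each output position; when need_two_end is set,
    # the first and last half-windows are filled as well.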
19 | cdef int w_len = window // 2 * 2 + 1
20 | cdef int t_len = len(data)
21 | cdef float[:] val = array.array('f', [x for x in data])
22 | cdef float[:] ans = array.array('f', [x for x in data])
23 | cdef float[:] cur_windows = array.array('f', [0 for x in range(w_len)])
24 | cdef int delete_id
25 | cdef int add_id
26 | cdef int index
27 | if t_len < w_len:
28 | return ans
29 | for i in range(0, w_len):
30 | index = i
31 | add_id = bisect.bisect_right(cur_windows[:i], val[i])
32 | while index > add_id:
33 | cur_windows[index] = cur_windows[index - 1]
34 | index -= 1
35 | cur_windows[add_id] = data[i]
36 | if i >= w_len // 2 and need_two_end:
37 | ans[i - w_len // 2] = sorted_median(cur_windows, 0, i + 1)
38 | ans[window // 2] = sorted_median(cur_windows, 0, w_len)
39 | for i in range(window // 2 + 1, t_len - window // 2):
40 | delete_id = bisect.bisect_right(cur_windows, val[i - window // 2 - 1]) - 1
41 | index = delete_id
42 | while index < w_len - 1:
43 | cur_windows[index] = cur_windows[index + 1]
44 | index += 1
45 |
46 | add_id = bisect.bisect_right(cur_windows[:w_len - 1], val[i + window // 2])
47 | index = w_len - 1
48 | while index > add_id:
49 | cur_windows[index] = cur_windows[index - 1]
50 | index -= 1
51 | cur_windows[add_id] = data[i + window // 2]
52 |
53 | ans[i] = sorted_median(cur_windows, 0, w_len)
54 |
55 | if need_two_end:
56 | for i in range(t_len - window // 2, t_len):
57 | delete_id = bisect.bisect_right(cur_windows[: w_len], data[i - window // 2 - 1]) - 1
58 | index = delete_id
59 | while index < w_len - 1:
60 | cur_windows[index] = cur_windows[index + 1]
61 | index += 1
62 | w_len -= 1
63 | ans[i] = sorted_median(cur_windows[: w_len], 0, w_len)
64 |
65 | return ans
66 |
--------------------------------------------------------------------------------
/msanomalydetector/boundary_utils.py:
--------------------------------------------------------------------------------
1 | import bisect
2 | import numpy as np
3 | from msanomalydetector._anomaly_kernel_cython import median_filter
4 |
5 |
6 | # pseudo - code to generate the factors.
7 | # factors = [1]
8 | # for i in range(50):
9 | # if i < 40:
10 | # factors.append(factors[-1] / (1.15 + 0.001 * i))
11 | # else:
12 | # factors.append(factors[-1] / (1.25 + 0.01 * i))
13 | # for i in range(50):
14 | # factors.insert(0, factors[0] * (1.25 + 0.001 * i))
15 |
16 | factors = [
17 | 184331.62871148242, 141902.71648305038, 109324.12672037778, 84289.9974713784, 65038.57829581667, 50222.84038287002,
18 | 38812.08684920403, 30017.081863266845, 23233.035497884553, 17996.15452973242, 13950.50738738947, 10822.736530170265,
19 | 8402.745753237783, 6528.939979205737, 5076.93622022219, 3950.92312857758, 3077.042935029268, 2398.318733460069,
20 | 1870.7634426365591, 1460.393007522685, 1140.9320371270976, 892.0500681212648, 698.0047481387048, 546.5972968979678,
21 | 428.36778753759233, 335.97473532360186, 263.71643275007995, 207.16137686573444, 162.8627176617409, 128.13746472206208,
22 | 100.8956415134347, 79.50799173635517, 62.70346351447568, 49.48971074544253, 39.09139869308257, 30.90229145698227,
23 | 24.448015393182175, 19.35709849024717, 15.338429865489042, 12.163703303322, 9.653732780414286, 7.667778221139226,
24 | 6.095213212352326, 4.8490160798347866, 3.8606815922251485, 3.076240312529999, 2.4531421949999994, 1.9578149999999996,
25 | 1.5637499999999998, 1.25, 1.0, 0.8695652173913044, 0.7554867223208555, 0.655804446459076, 0.5687809596349316,
26 | 0.4928777813127657, 0.4267340097946024, 0.36914706729636887, 0.3190553736355825, 0.27552277516026125, 0.23772456873189068,
27 | 0.20493497304473338, 0.17651591132190647, 0.1519069804835684, 0.13061649224726435, 0.11221348131208278, 0.09632058481723846,
28 | 0.08260770567516164, 0.0707863801843716, 0.06060477755511267, 0.051843265658779024, 0.0443104834690419, 0.03783986632710667,
29 | 0.03228657536442549, 0.027524787181948417, 0.02344530424356765, 0.019953450420057577, 0.01696721974494692, 0.014415649740821513,
30 | 0.012237393667929978, 0.010379468759906684, 0.008796159966022614, 0.0074480609365136455, 0.006301235986898177,
31 | 0.00532648857725966, 0.004498723460523362, 0.0037963911059268884, 0.0032010043051660104, 0.002696718032995797,
32 | 0.0022699646742388863, 0.0019091376570554135, 0.0011570531254881296, 0.000697019955113331, 0.00041737721863073713,
33 | 0.000248438820613534, 0.00014700521929794912, 8.647365841055832e-05, 5.056939088336744e-05, 2.9400808653120604e-05,
34 | 1.6994687082728674e-05, 9.767061541798089e-06
35 | ]
36 |
37 |
38 | def calculate_boundary_unit_last(data):
39 | if len(data) == 0:
40 | return 0
41 |
42 | calculation_size = len(data) - 1
43 | window = int(min(calculation_size // 3, 512))
44 | trends = np.abs(np.asarray(median_filter(data[:calculation_size], window, need_two_end=True), dtype=float))
45 |
46 | unit = max(np.mean(trends), 1.0)
47 |
48 | if not np.isfinite(unit):
49 | raise Exception('Not finite unit value')
50 |
51 | return unit
52 |
53 |
54 | def calculate_boundary_unit_entire(data, is_anomaly):
55 | if len(data) == 0:
56 | return []
57 |
58 | window = int(min(len(data)//3, 512))
59 | trend_fraction = 0.5
60 | trends = np.abs(np.asarray(median_filter(data, window, need_two_end=True), dtype=float))
61 | valid_trend = [t for a, t in zip(is_anomaly, trends) if not a]
62 |
63 | if len(valid_trend) > 0:
64 | average_part = np.mean(valid_trend)
65 | units = trend_fraction * trends + average_part * (1 - trend_fraction)
66 | else:
67 | units = trends
68 |
69 | if not np.all(np.isfinite(units)):
70 | raise Exception('Not finite unit values')
71 |
72 | units = np.clip(units, 1.0, max(np.max(units), 1.0))
73 |
74 | return units
75 |
76 |
77 | def calculate_margin(unit, sensitivity):
78 |
79 | def calculate_margin_core(unit, sensitivity):
80 | lb = int(sensitivity)
81 | # if lb == sensitivity:
82 | # return unit * factors[lb]
83 |
84 | return (factors[lb + 1] + (factors[lb] - factors[lb + 1]) * (1 - sensitivity + lb)) * unit
85 |
86 | if 0 > sensitivity or sensitivity > 100:
87 |         raise Exception('sensitivity should be a number in [0, 100]')
88 |
89 | if unit <= 0:
90 | raise Exception('unit should be a positive number')
91 |
92 | if sensitivity == 100:
93 | return 0.0
94 |
95 | return calculate_margin_core(unit, sensitivity)
96 |
97 |
98 | def calculate_anomaly_score(value, expected_value, unit, is_anomaly):
99 | if not is_anomaly:
100 | return 0.0
101 |
102 | distance = np.abs(expected_value - value)
103 | margins = [calculate_margin(unit, i) for i in range(101)][::-1]
104 | lb = bisect.bisect_left(margins, distance)
105 |
106 | if lb == 0:
107 | return 0
108 | elif lb >= 100:
109 | return 1.0
110 | else:
111 | a, b = margins[lb-1], margins[lb]
112 | score = lb - 1 + (distance - a) / (b - a)
113 |
114 | return score / 100.0
115 |
116 |
117 | def calculate_anomaly_scores(values, expected_values, units, is_anomaly):
118 | scores = [calculate_anomaly_score(value, exp, unit, anomaly)
119 | for value, exp, unit, anomaly in zip(values, expected_values, units, is_anomaly)]
120 | return scores
121 |
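A minimal usage sketch for the boundary_utils helpers above; the expected values mirror the assertions in tests/test_boundary_utils.py, and the import path is the one used by those tests:

    from msanomalydetector import boundary_utils

    # the margin shrinks rapidly as sensitivity grows (see the factors table above)
    boundary_utils.calculate_margin(10, 0)      # 1843316.2871148242 == 10 * factors[0]
    boundary_utils.calculate_margin(10, 5)      # 502228.4038287002  == 10 * factors[5]
    boundary_utils.calculate_margin(10, 100)    # 0.0

    # a point whose distance from its expected value equals one unit scores 0.5
    boundary_utils.calculate_anomaly_score(10, 15, 5, True)    # 0.5
    boundary_utils.calculate_anomaly_score(10, 15, 5, False)   # 0.0, non-anomalies score 0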
--------------------------------------------------------------------------------
/msanomalydetector/spectral_residual.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) Microsoft Corporation. All rights reserved.
3 |
4 | Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual,
5 | royalty-free right to use, copy, and modify the software code provided by us
6 | ("Software Code"). You may not sublicense the Software Code or any use of it
7 | (except to your affiliates and to vendors to perform work on your behalf)
8 | through distribution, network access, service agreement, lease, rental, or
9 | otherwise. This license does not purport to express any claim of ownership over
10 | data you may have shared with Microsoft in the creation of the Software Code.
11 | Unless applicable law gives you more rights, Microsoft reserves all other
12 | rights not expressly granted herein, whether by implication, estoppel or
13 | otherwise.
14 |
15 | THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
21 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
22 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 | ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
24 | POSSIBILITY OF SUCH DAMAGE.
25 | """
26 |
27 | import pandas as pd
28 | import numpy as np
29 |
30 | from msanomalydetector.util import *
31 | import msanomalydetector.boundary_utils as boundary_helper
32 | from msanomalydetector._anomaly_kernel_cython import median_filter
33 |
34 |
35 | class SpectralResidual:
36 | def __init__(self, series, threshold, mag_window, score_window, sensitivity, detect_mode, batch_size):
37 | self.__series__ = series
38 | self.__values__ = self.__series__['value'].tolist()
39 | self.__threshold__ = threshold
40 | self.__mag_window = mag_window
41 | self.__score_window = score_window
42 | self.__sensitivity = sensitivity
43 | self.__detect_mode = detect_mode
44 | self.__anomaly_frame = None
45 | self.__batch_size = batch_size
46 | if self.__batch_size <= 0:
47 | self.__batch_size = len(series)
48 |
49 | self.__batch_size = max(12, self.__batch_size)
50 | self.__batch_size = min(len(series), self.__batch_size)
51 |
52 | def detect(self):
53 | if self.__anomaly_frame is None:
54 | self.__anomaly_frame = self.__detect()
55 |
56 | return self.__anomaly_frame
57 |
58 | def __detect(self):
59 | anomaly_frames = []
60 | for i in range(0, len(self.__series__), self.__batch_size):
61 | start = i
62 | end = i + self.__batch_size
63 | end = min(end, len(self.__series__))
64 | if end - start >= 12:
65 | anomaly_frames.append(self.__detect_core(self.__series__[start:end]))
66 | else:
67 | ext_start = max(0, end - self.__batch_size)
68 | ext_frame = self.__detect_core(self.__series__[ext_start:end])
69 | anomaly_frames.append(ext_frame[start-ext_start:])
70 |
71 | return pd.concat(anomaly_frames, axis=0, ignore_index=True)
72 |
73 | def __detect_core(self, series):
74 | values = series['value'].values
75 | extended_series = SpectralResidual.extend_series(values)
76 | mags = self.spectral_residual_transform(extended_series)
77 | anomaly_scores = self.generate_spectral_score(mags)
78 | anomaly_frame = pd.DataFrame({Timestamp: series['timestamp'].values,
79 | Value: values,
80 | Mag: mags[:len(values)],
81 | AnomalyScore: anomaly_scores[:len(values)]})
82 | anomaly_frame[IsAnomaly] = np.where(anomaly_frame[AnomalyScore] > self.__threshold__, True, False)
83 |
84 | if self.__detect_mode == DetectMode.anomaly_and_margin:
85 | anomaly_index = anomaly_frame[anomaly_frame[IsAnomaly]].index.tolist()
86 | anomaly_frame[ExpectedValue] = self.calculate_expected_value(values, anomaly_index)
87 | boundary_units = boundary_helper.calculate_boundary_unit_entire(values,
88 | anomaly_frame[IsAnomaly].values)
89 | anomaly_frame[AnomalyScore] = boundary_helper.calculate_anomaly_scores(
90 | values=values,
91 | expected_values=anomaly_frame[ExpectedValue].values,
92 | units=boundary_units,
93 | is_anomaly=anomaly_frame[IsAnomaly].values
94 | )
95 |
96 | margins = [boundary_helper.calculate_margin(u, self.__sensitivity) for u in boundary_units]
97 | anomaly_frame['unit'] = boundary_units
98 |
99 | anomaly_frame[LowerBoundary] = anomaly_frame[ExpectedValue].values - margins
100 | anomaly_frame[UpperBoundary] = anomaly_frame[ExpectedValue].values + margins
101 | isLowerAnomaly = np.logical_and(anomaly_frame[IsAnomaly].values,
102 | anomaly_frame[LowerBoundary].values > values)
103 | isUpperAnomaly = np.logical_and(anomaly_frame[IsAnomaly].values,
104 | values > anomaly_frame[UpperBoundary].values)
105 | anomaly_frame[IsAnomaly] = np.logical_or(isLowerAnomaly, isUpperAnomaly)
106 |
107 | return anomaly_frame
108 |
109 | def generate_spectral_score(self, mags):
110 | ave_mag = average_filter(mags, n=self.__score_window)
111 | safeDivisors = np.clip(ave_mag, EPS, ave_mag.max())
112 |
113 | raw_scores = np.abs(mags - ave_mag) / safeDivisors
114 | scores = np.clip(raw_scores / 10.0, 0, 1.0)
115 |
116 | return scores
117 |
118 | def spectral_residual_transform(self, values):
119 | """
120 |         This method transforms a time series into a spectral residual series
121 | :param values: list.
122 | a list of float values.
123 | :return: mag: list.
124 | a list of float values as the spectral residual values
125 | """
126 |
127 | trans = np.fft.fft(values)
128 | mag = np.sqrt(trans.real ** 2 + trans.imag ** 2)
129 | eps_index = np.where(mag <= EPS)[0]
130 | mag[eps_index] = EPS
131 |
132 | mag_log = np.log(mag)
133 | mag_log[eps_index] = 0
134 |
135 | spectral = np.exp(mag_log - average_filter(mag_log, n=self.__mag_window))
136 |
137 | trans.real = trans.real * spectral / mag
138 | trans.imag = trans.imag * spectral / mag
139 | trans.real[eps_index] = 0
140 | trans.imag[eps_index] = 0
141 |
142 | wave_r = np.fft.ifft(trans)
143 | mag = np.sqrt(wave_r.real ** 2 + wave_r.imag ** 2)
144 | return mag
145 |
146 | @staticmethod
147 | def predict_next(values):
148 | """
149 |         Predicts the next value by summing the slopes between the last value and each previous value.
150 | Mathematically, g = 1/m * sum_{i=1}^{m} g(x_n, x_{n-i}), x_{n+1} = x_{n-m+1} + g * m,
151 | where g(x_i,x_j) = (x_i - x_j) / (i - j)
152 | :param values: list.
153 | a list of float numbers.
154 | :return : float.
155 | the predicted next value.
156 | """
157 |
158 | if len(values) <= 1:
159 | raise ValueError(f'data should contain at least 2 numbers')
160 |
161 | v_last = values[-1]
162 | n = len(values)
163 |
164 | slopes = [(v_last - v) / (n - 1 - i) for i, v in enumerate(values[:-1])]
165 |
166 | return values[1] + sum(slopes)
167 |
168 | @staticmethod
169 | def extend_series(values, extend_num=5, look_ahead=5):
170 | """
171 | extend the array data by the predicted next value
172 | :param values: list.
173 | a list of float numbers.
174 | :param extend_num: int, default 5.
175 | number of values added to the back of data.
176 | :param look_ahead: int, default 5.
177 | number of previous values used in prediction.
178 | :return: list.
179 | The result array.
180 | """
181 |
182 | if look_ahead < 1:
183 | raise ValueError('look_ahead must be at least 1')
184 |
185 | extension = [SpectralResidual.predict_next(values[-look_ahead - 2:-1])] * extend_num
186 | return np.concatenate((values, extension), axis=0)
187 |
188 | @staticmethod
189 | def calculate_expected_value(values, anomaly_index):
190 | values = deanomaly_entire(values, anomaly_index)
191 | length = len(values)
192 | fft_coef = np.fft.fft(values)
193 | fft_coef.real = [v if length * 3 / 8 >= i or i >= length * 5 / 8 else 0 for i, v in enumerate(fft_coef.real)]
194 | fft_coef.imag = [v if length * 3 / 8 >= i or i >= length * 5 / 8 else 0 for i, v in enumerate(fft_coef.imag)]
195 | exps = np.fft.ifft(fft_coef)
196 | return exps.real
197 |
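A minimal end-to-end sketch for the SpectralResidual class above. The synthetic sine series and the injected spike are illustrative only; the frame must expose 'timestamp' and 'value' columns, which are the names the detector reads, and the defaults come from msanomalydetector.util:

    import numpy as np
    import pandas as pd
    from msanomalydetector.spectral_residual import SpectralResidual
    from msanomalydetector.util import DetectMode, THRESHOLD, MAG_WINDOW, SCORE_WINDOW

    values = np.sin(np.linspace(0, 20, 200))
    values[120] += 5.0                                   # inject a single spike
    frame = pd.DataFrame({'timestamp': pd.date_range('2020-01-01', periods=200, freq='H'),
                          'value': values})

    detector = SpectralResidual(frame, threshold=THRESHOLD, mag_window=MAG_WINDOW,
                                score_window=SCORE_WINDOW, sensitivity=99,
                                detect_mode=DetectMode.anomaly_only, batch_size=0)
    result = detector.detect()                           # timestamp, value, mag, score, isAnomaly
    print(result[result['isAnomaly']])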
--------------------------------------------------------------------------------
/msanomalydetector/util.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) Microsoft Corporation. All rights reserved.
3 |
4 | Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual,
5 | royalty-free right to use, copy, and modify the software code provided by us
6 | ("Software Code"). You may not sublicense the Software Code or any use of it
7 | (except to your affiliates and to vendors to perform work on your behalf)
8 | through distribution, network access, service agreement, lease, rental, or
9 | otherwise. This license does not purport to express any claim of ownership over
10 | data you may have shared with Microsoft in the creation of the Software Code.
11 | Unless applicable law gives you more rights, Microsoft reserves all other
12 | rights not expressly granted herein, whether by implication, estoppel or
13 | otherwise.
14 |
15 | THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
21 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
22 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 | ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
24 | POSSIBILITY OF SUCH DAMAGE.
25 | """
26 | from enum import Enum
27 | import numpy as np
28 |
29 | IsAnomaly = "isAnomaly"
30 | AnomalyId = "id"
31 | AnomalyScore = "score"
32 | Value = "value"
33 | Timestamp = "timestamp"
34 | Mag = "mag"
35 | ExpectedValue = "expectedValue"
36 | UpperBoundary = "upperBoundary"
37 | LowerBoundary = "lowerBoundary"
38 |
39 | MAX_RATIO = 0.25
40 | EPS = 1e-8
41 | THRESHOLD = 0.3
42 | MAG_WINDOW = 3
43 | SCORE_WINDOW = 40
44 |
45 |
46 | class DetectMode(Enum):
47 | anomaly_only = 'AnomalyOnly'
48 | anomaly_and_margin = 'AnomalyAndMargin'
49 |
50 |
51 | def average_filter(values, n=3):
52 | """
53 |     Calculate the sliding window average for the given time series.
54 | Mathematically, res[i] = sum_{j=i-t+1}^{i} values[j] / t, where t = min(n, i+1)
55 | :param values: list.
56 | a list of float numbers
57 | :param n: int, default 3.
58 | window size.
59 | :return res: list.
60 |         a list of values after the average_filter process.
61 | """
62 |
63 | if n >= len(values):
64 | n = len(values)
65 |
66 | res = np.cumsum(values, dtype=float)
67 | res[n:] = res[n:] - res[:-n]
68 | res[n:] = res[n:] / n
69 |
70 | for i in range(1, n):
71 | res[i] /= (i + 1)
72 |
73 | return res
74 |
75 |
76 | def leastsq(x, y):
77 | n = len(x)
78 | sum_x = np.sum(x)
79 | sum_y = np.sum(y)
80 | sum_xx = np.sum(np.multiply(x, x))
81 | sum_xy = np.sum(np.multiply(x, y))
82 | a = (n * sum_xy - sum_x * sum_y) / (n * sum_xx - sum_x * sum_x)
83 | b = (sum_xx * sum_y - sum_x * sum_xy) / (n * sum_xx - sum_x * sum_x)
84 | return a, b
85 |
86 |
87 | def deanomaly_entire(values, entire_anomalies):
88 | deanomaly_data = np.copy(values)
89 | min_points_to_fit = 4
90 | for idx in entire_anomalies:
91 | step = 1
92 | start = max(idx - step, 0)
93 | end = min(len(values) - 1, idx + step)
94 | fit_values = [(i, values[i]) for i in range(start, end+1) if i not in entire_anomalies]
95 | while len(fit_values) < min_points_to_fit and (start > 0 or end < len(values)-1):
96 | step = step + 2
97 | start = max(idx - step, 0)
98 | end = min(len(values) - 1, idx + step)
99 | fit_values = [(i, values[i]) for i in range(start, end+1) if i not in entire_anomalies]
100 |
101 | if len(fit_values) > 1:
102 | x, y = tuple(zip(*fit_values))
103 | a, b = leastsq(x, y)
104 | deanomaly_data[idx] = a * idx + b
105 |
106 | return deanomaly_data
107 |
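A small worked example for average_filter above: each position holds the mean of the current value and up to n-1 preceding values, so the head of the series is averaged over a shorter window.

    from msanomalydetector.util import average_filter

    average_filter([1., 2., 3., 4., 5.], n=3)
    # -> [1.0, 1.5, 2.0, 3.0, 4.0]
    # index 1 is mean(1, 2) = 1.5, index 3 is mean(2, 3, 4) = 3.0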
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Cython>=0.29.2
2 | numpy==1.18.1
3 | pandas==0.25.3
4 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages, Extension
2 | from Cython.Build import cythonize
3 | from Cython.Distutils import build_ext
4 | import numpy as np
5 |
6 | __version__ = "can't find version.py"
7 | exec(compile(open('version.py').read(),
8 | 'version.py', 'exec'))
9 |
10 | extensions = [
11 | Extension("msanomalydetector._anomaly_kernel_cython", ["msanomalydetector/_anomaly_kernel_cython.pyx"],
12 | define_macros=[('CYTHON_TRACE', '1')])
13 | ]
14 |
15 | cmdclass = {'build_ext': build_ext}
16 |
17 | install_requires = [
18 | 'Cython>=0.29.2',
19 | 'numpy==1.18.1',
20 | 'pandas==0.25.3'
21 | ]
22 |
23 | setup(
24 | name="msanomalydetector",
25 | description='Microsoft Anomaly Detector Package Based On Saliency Detection',
26 | packages=find_packages(),
27 | include_dirs=[np.get_include()],
28 | cmdclass=cmdclass,
29 | ext_modules=cythonize(extensions),
30 | version=__version__,
31 | setup_requires=['Cython>=0.29.2', 'numpy==1.18.1'],
32 | install_requires=install_requires,
33 | requires=['Cython', 'numpy', 'pandas'],
34 | python_requires='>=3.6.0',
35 | package_data={'': ['*.txt']}
36 | )
37 |
--------------------------------------------------------------------------------
/srcnn/competition_metric.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is referenced from https://github.com/iopsai/iops/blob/master/evaluation/evaluation.py
3 | """
4 |
5 | import numpy as np
6 | from sklearn.metrics import f1_score, precision_score, recall_score
7 |
8 |
9 | def get_range_proba(predict, label, delay=7):
10 | predict = np.array(predict)
11 | label = np.array(label)
12 |
13 | splits = np.where(label[1:] != label[:-1])[0] + 1
14 | is_anomaly = label[0] == 1
15 | new_predict = np.array(predict)
16 | pos = 0
17 |
18 | for sp in splits:
19 | if is_anomaly:
20 | if 1 in predict[pos:min(pos + delay + 1, sp)]:
21 | new_predict[pos: sp] = 1
22 | else:
23 | new_predict[pos: sp] = 0
24 | is_anomaly = not is_anomaly
25 | pos = sp
26 | sp = len(label)
27 |
28 | if is_anomaly:
29 | if 1 in predict[pos: min(pos + delay + 1, sp)]:
30 | new_predict[pos: sp] = 1
31 | else:
32 | new_predict[pos: sp] = 0
33 |
34 | return new_predict
35 |
36 |
37 | def reconstruct_label(timestamp, label):
38 | timestamp = np.asarray(timestamp, np.int64)
39 | index = np.argsort(timestamp)
40 |
41 | timestamp_sorted = np.asarray(timestamp[index])
42 | interval = np.min(np.diff(timestamp_sorted))
43 |
44 | label = np.asarray(label, np.int64)
45 | label = np.asarray(label[index])
46 |
47 | idx = (timestamp_sorted - timestamp_sorted[0]) // interval
48 |
49 | new_label = np.zeros(shape=((timestamp_sorted[-1] - timestamp_sorted[0]) // interval + 1,), dtype=np.int)
50 | new_label[idx] = label
51 |
52 | return new_label
53 |
54 |
55 | def reconstruct_series(timestamp, label, predict, delay=7):
56 | label = reconstruct_label(timestamp, label)
57 | predict = reconstruct_label(timestamp, predict)
58 | predict = get_range_proba(predict, label, delay)
59 | return label.tolist(), predict.tolist()
60 |
61 |
62 | def calc(pred, true):
63 | TP = 0
64 | FP = 0
65 | TN = 0
66 | FN = 0
67 | for pre, gt in zip(pred, true):
68 | if gt == 1:
69 | if pre == 1:
70 | TP += 1
71 | else:
72 | FN += 1
73 | if gt == 0:
74 | if pre == 1:
75 | FP += 1
76 | else:
77 | TN += 1
78 | return TP, FP, TN, FN
79 |
80 |
81 | def evaluate_for_all_series(lst_timestamp_label_predict, delay=7, prt=True):
82 | labels, predicts = [], []
83 | for timestamp, label, predict, _ in lst_timestamp_label_predict:
84 | if timestamp == []:
85 | continue
86 | lbl, pdt = reconstruct_series(timestamp, label, predict, delay)
87 | labels += lbl
88 | predicts += pdt
89 |
90 | f1 = f1_score(labels, predicts)
91 | pre = precision_score(labels, predicts)
92 | rec = recall_score(labels, predicts)
93 | TP, FP, TN, FN = calc(predicts, labels)
94 | if prt:
95 | print('precision', pre)
96 | print('recall', rec)
97 | print('f1', f1)
98 | print('-------------------------------')
99 | return f1, pre, rec, TP, FP, TN, FN
100 |
101 |
102 | def bi_get_range_proba(predict, label, left, right):
103 | i = 1
104 | rs = predict[:]
105 | while i < len(label):
106 | if label[i] == 1 and label[i - 1] == 0:
107 | start = max(0, i - left)
108 | end = min(i + right + 1, len(label))
109 | if 1 in predict[start: end]:
110 | j = i
111 | while j < len(label) and label[j] == 1:
112 | rs[j] = 1
113 | j += 1
114 | i = j
115 | rs[start: end] = label[start: end]
116 | else:
117 | j = i
118 | while j < len(label) and label[j] == 1:
119 | rs[j] = 0
120 | j += 1
121 | i = j
122 | i += 1
123 | return rs
124 |
125 |
126 | def bi_reconstruct_series(timestamp, label, predict, left, right):
127 | label = reconstruct_label(timestamp, label).tolist()
128 | predict = reconstruct_label(timestamp, predict).tolist()
129 | predict = bi_get_range_proba(predict, label, left, right)
130 | return label, predict
131 |
132 |
133 | def bi_evaluate_for_all_series(lst_timestamp_label_predict, left, right, prt=True):
134 | import json
135 | labels, predicts = [], []
136 | save = []
137 | for timestamp, label, predict in lst_timestamp_label_predict:
138 | if timestamp == []:
139 | continue
140 | try:
141 | lbl, pdt = bi_reconstruct_series(timestamp, label, predict, left, right)
142 | except:
143 | continue
144 | ifi = f1_score(lbl, pdt)
145 | save.append(ifi)
146 | labels += lbl
147 | predicts += pdt
148 | with open('eachscore.json', 'w+') as fout:
149 | json.dump(save, fout)
150 | f1 = f1_score(labels, predicts)
151 | pre = precision_score(labels, predicts)
152 | rec = recall_score(labels, predicts)
153 | if prt:
154 | print('precision', pre)
155 | print('recall', rec)
156 | print('f1', f1)
157 | print('-------------------------------')
158 | return f1, pre, rec
159 |
160 |
161 | def get_variance(f_score, all_fscore):
162 | va = 0.0
163 | for i in range(len(all_fscore)):
164 | va += 1.0 * (all_fscore[i] - f_score) * (all_fscore[i] - f_score)
165 |
166 | return va / len(all_fscore)
167 |
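A short sketch of the delay-adjusted evaluation implemented by get_range_proba above: a labelled anomaly segment counts as fully detected if any of its first delay+1 points is predicted as anomalous, and as fully missed otherwise. The toy label/predict arrays are illustrative only:

    from srcnn.competition_metric import get_range_proba

    label   = [0, 1, 1, 1, 0]
    predict = [0, 0, 1, 0, 0]
    get_range_proba(predict, label, delay=1)
    # -> array([0, 1, 1, 1, 0]); the hit at index 2 falls within the delay window
    #    of the segment starting at index 1, so the whole segment is credited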
--------------------------------------------------------------------------------
/srcnn/evalue.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) Microsoft Corporation. All rights reserved.
3 |
4 | Microsoft Corporation ("Microsoft") grants you a nonexclusive, perpetual,
5 | royalty-free right to use, copy, and modify the software code provided by us
6 | ("Software Code"). You may not sublicense the Software Code or any use of it
7 | (except to your affiliates and to vendors to perform work on your behalf)
8 | through distribution, network access, service agreement, lease, rental, or
9 | otherwise. This license does not purport to express any claim of ownership over
10 | data you may have shared with Microsoft in the creation of the Software Code.
11 | Unless applicable law gives you more rights, Microsoft reserves all other
12 | rights not expressly granted herein, whether by implication, estoppel or
13 | otherwise.
14 |
15 | THE SOFTWARE CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
21 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
22 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 | ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
24 | POSSIBILITY OF SUCH DAMAGE.
25 | """
26 |
27 | import os
28 | from srcnn.competition_metric import get_variance, evaluate_for_all_series
29 | import time
30 | import json
31 | import argparse
32 | from msanomalydetector.spectral_residual import SpectralResidual
33 | from srcnn.utils import *
34 |
35 |
36 | def auto():
37 | path_auto = os.getcwd() + '/auto.json'
38 | with open(path_auto, 'r+') as f:
39 | store = json.load(f)
40 | window = store['window']
41 | epoch = store['epoch']
42 | return window, epoch
43 |
44 |
45 | def getfid(path):
46 | return path.split('/')[-1]
47 |
48 |
49 | def get_path(data_source):
50 | if data_source == 'kpi':
51 | dir_ = root + '/Test/'
52 | trainfiles = [dir_ + _ for _ in os.listdir(dir_)]
53 | files = trainfiles
54 | else:
55 | dir_ = root + '/' + data_source + '/'
56 | files = [dir_ + _ for _ in os.listdir(dir_)]
57 | return files
58 |
59 |
60 | def get_score(data_source, files, thres, option):
61 | total_time = 0
62 | results = []
63 | savedscore = []
64 | for f in files:
65 | print('reading', f)
66 | if data_source == 'kpi' or data_source == 'test_kpi':
67 | in_timestamp, in_value, in_label = read_csv_kpi(f)
68 | else:
69 | tmp_data = read_pkl(f)
70 | in_timestamp, in_value, in_label = tmp_data['timestamp'], tmp_data['value'], tmp_data['label']
71 | length = len(in_timestamp)
72 | if model == 'sr_cnn' and len(in_value) < window:
73 | print("length is shorter than win_size", len(in_value), window)
74 | continue
75 | time_start = time.time()
76 | timestamp, label, pre, scores = models[model](in_timestamp, in_value, in_label, window, net, option, thres)
77 | time_end = time.time()
78 | total_time += time_end - time_start
79 | results.append([timestamp, label, pre, f])
80 | savedscore.append([label, scores, f, timestamp])
81 | return total_time, results, savedscore
82 |
83 |
84 | if __name__ == '__main__':
85 | parser = argparse.ArgumentParser(description='SRCNN')
86 | parser.add_argument('--data', type=str, required=True, help='location of the data file')
87 | parser.add_argument('--window', type=int, default=128, help='window size')
88 | parser.add_argument('--epoch', type=int, default=10)
89 | parser.add_argument('--model_path', type=str, default='snapshot', help='model path')
90 | parser.add_argument('--delay', type=int, default=3, help='delay')
91 |     parser.add_argument('--thres', type=float, default=0.95, help='initial threshold of SR')
92 |     parser.add_argument('--auto', type=bool, default=False, help='automatically fill parameters from auto.json')
93 | parser.add_argument('--model', type=str, default='sr_cnn', help='model')
94 | parser.add_argument('--missing_option', type=str, default='anomaly',
95 | help='missing data option, anomaly means treat missing data as anomaly')
96 |
97 | args = parser.parse_args()
98 | if args.auto:
99 | window, epoch = auto()
100 | else:
101 | window = args.window
102 | epoch = args.epoch
103 | data_source = args.data
104 | delay = args.delay
105 | model = args.model
106 | root = os.getcwd()
107 |     print(data_source, window, epoch)
108 | models = {
109 | 'sr_cnn': sr_cnn_eval,
110 | }
111 |
112 | model_path = root + '/' + args.model_path + '/srcnn_retry' + str(epoch) + '_' + str(window) + '.bin'
113 | srcnn_model = Anomaly(window)
114 | net = load_model(srcnn_model, model_path).cuda()
115 | files = get_path(data_source)
116 | total_time, results, savedscore = get_score(data_source, files, args.thres, args.missing_option)
117 | print('\n***********************************************')
118 | print('data source:', data_source, ' model:', model)
119 | print('-------------------------------')
120 | total_fscore, pre, rec, TP, FP, TN, FN = evaluate_for_all_series(results, delay)
121 | with open(data_source + '_saved_scores.json', 'w') as f:
122 | json.dump(savedscore, f)
123 | print('time used for making predictions:', total_time, 'seconds')
124 |
125 |
126 | best = 0.
127 | bestthre = 0.
128 | print('delay :', delay)
129 | if data_source == 'yahoo':
130 | sru = {}
131 | rf = open(data_source + 'sr3.json', 'r')
132 | srres = json.load(rf)
133 | for (srtime, srl, srpre, srf) in srres:
134 | sru[getfid(srf)] = [srtime, srl, srpre]
135 | for i in range(98):
136 | newresults = []
137 | threshold = 0.01 + i * 0.01
138 | for f, (srtt, srlt, srpret, srft), (flabel, cnnscores, cnnf, cnnt) in zip(files, srres, savedscore):
139 | fid = getfid(cnnf)
140 | srtime = sru[fid][0]
141 | srl = sru[fid][1]
142 | srpre = sru[fid][2]
143 | srtime = [(srtime[0] - 3600 * (64 - j)) for j in range(64)] + srtime
144 | srl = [0] * 64 + srl
145 | srpre = [0] * 64 + srpre
146 | print(len(srl), len(flabel), '!!')
147 | assert (len(srl) == len(flabel))
148 | pre = [1 if item > threshold else 0 for item in cnnscores]
149 | newresults.append([srtime, srpre, pre, f])
150 | total_fscore, pre, rec, TP, FP, TN, FN = evaluate_for_all_series(newresults, delay, prt=False)
151 | if total_fscore > best:
152 | best = total_fscore
153 | bestthre = threshold
154 | results = []
155 | threshold = bestthre
156 | print('guided threshold :', threshold)
157 | for f, (flabel, cnnscores, _, ftimestamp) in zip(files, savedscore):
158 | pre = [1 if item > threshold else 0 for item in cnnscores]
159 | results.append([ftimestamp, flabel, pre, f])
160 | print('score\n')
161 | total_fscore, pre, rec, TP, FP, TN, FN = evaluate_for_all_series(results, delay)
162 | print(total_fscore)
163 | best = 0.
164 | for i in range(98):
165 | newresults = []
166 | threshold = 0.01 + i * 0.01
167 | for f, (flabel, cnnscores, _, ftimestamp) in zip(files, savedscore):
168 | pre = [1 if item > threshold else 0 for item in cnnscores]
169 | newresults.append([ftimestamp, flabel, pre, f])
170 | total_fscore, pre, rec, TP, FP, TN, FN = evaluate_for_all_series(newresults, delay, prt=False)
171 | if total_fscore > best:
172 | best = total_fscore
173 | bestthre = threshold
174 | print('tem best', best, threshold)
175 | threshold = bestthre
176 | print('best overall threshold :', threshold, 'best score :', best)
177 |
--------------------------------------------------------------------------------
/srcnn/generate_data.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) Microsoft Corporation. All rights reserved.
3 |
4 | Microsoft Corporation ("Microsoft") grants you a nonexclusive, perpetual,
5 | royalty-free right to use, copy, and modify the software code provided by us
6 | ("Software Code"). You may not sublicense the Software Code or any use of it
7 | (except to your affiliates and to vendors to perform work on your behalf)
8 | through distribution, network access, service agreement, lease, rental, or
9 | otherwise. This license does not purport to express any claim of ownership over
10 | data you may have shared with Microsoft in the creation of the Software Code.
11 | Unless applicable law gives you more rights, Microsoft reserves all other
12 | rights not expressly granted herein, whether by implication, estoppel or
13 | otherwise.
14 |
15 | THE SOFTWARE CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
21 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
22 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 | ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
24 | POSSIBILITY OF SUCH DAMAGE.
25 | """
26 |
27 | import argparse
28 | from srcnn.utils import *
29 | import os
30 | import time
31 | from msanomalydetector.util import average_filter
32 |
33 |
34 | class gen():
35 | def __init__(self, win_siz, step, nums):
36 | self.control = 0
37 | self.win_siz = win_siz
38 | self.step = step
39 | self.number = nums
40 |
41 | def generate_train_data(self, value, back_k=0):
42 | def normalize(a):
43 | amin = np.min(a)
44 | amax = np.max(a)
45 | a = (a - amin) / (amax - amin + 1e-5)
46 | return 3 * a
47 |
48 | if back_k <= 5:
49 | back = back_k
50 | else:
51 | back = 5
52 | length = len(value)
53 | tmp = []
54 | for pt in range(self.win_siz, length - back, self.step):
55 | head = max(0, pt - self.win_siz)
56 | tail = min(length - back, pt)
57 | data = np.array(value[head:tail])
58 | data = data.astype(np.float64)
59 | data = normalize(data)
60 | num = np.random.randint(1, self.number)
61 | ids = np.random.choice(self.win_siz, num, replace=False)
62 | lbs = np.zeros(self.win_siz, dtype=np.int64)
63 | if (self.win_siz - 6) not in ids:
64 | self.control += np.random.random()
65 | else:
66 | self.control = 0
67 | if self.control > 100:
68 | ids[0] = self.win_siz - 6
69 | self.control = 0
70 | mean = np.mean(data)
71 | dataavg = average_filter(data)
72 | var = np.var(data)
73 | for id in ids:
74 | data[id] += (dataavg[id] + mean) * np.random.randn() * min((1 + var), 10)
75 | lbs[id] = 1
76 | tmp.append([data.tolist(), lbs.tolist()])
77 | return tmp
78 |
79 |
80 | def auto(dic):
81 | path_auto = os.getcwd() + '/auto.json'
82 | auto = {}
83 | for item, value in dic:
84 |         if value is not None:
85 | auto[item] = value
86 | with open(path_auto, 'w+') as f:
87 | json.dump(auto, f)
88 |
89 |
90 | def get_path(data):
91 | dir_ = os.getcwd() + '/' + data + '/'
92 | fadir = [_ for _ in os.listdir(dir_)]
93 | print(fadir, 'fadir')
94 | files = []
95 | for eachdir in fadir:
96 | files += [dir_ + eachdir + '/' + _ for _ in os.listdir(dir_ + eachdir)]
97 | print(files, 'files')
98 | return files
99 |
100 |
101 | if __name__ == '__main__':
102 | parser = argparse.ArgumentParser(description='SRCNN')
103 | parser.add_argument('--data', type=str, required=True, help='location of the data file')
104 | parser.add_argument('--window', type=int, default=128, help='window size')
105 | parser.add_argument('--step', type=int, default=64, help='step')
106 | parser.add_argument('--seed', type=int, default=54321, help='random seed')
107 | parser.add_argument('--num', type=int, default=10, help='upper limit value for the number of anomaly points')
108 | args = parser.parse_args()
109 | np.random.seed(args.seed)
110 | auto(vars(args).items())
111 | files = get_path(args.data)
112 |
113 | train_data_path = os.getcwd() + '/' + args.data + '_' + str(args.window) + '_train.json'
114 | total_time = 0
115 | results = []
116 | print("generating train data")
117 | generator = gen(args.window, args.step, args.num)
118 | for f in files:
119 | print('reading', f)
120 | in_timestamp, in_value = read_csv(f)
121 | in_label = []
122 | if len(in_value) < args.window:
123 | print("value's length < window size", len(in_value), args.window)
124 | continue
125 | time_start = time.time()
126 | train_data = generator.generate_train_data(in_value)
127 | time_end = time.time()
128 | total_time += time_end - time_start
129 | results += train_data
130 | print('file num:', len(files))
131 | print('total fake data size:', len(results))
132 | with open(train_data_path, 'w+') as f:
133 | print(train_data_path)
134 | json.dump(results, f)
135 |
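A minimal sketch of how the gen class above is driven (mirroring the __main__ block): each returned sample pairs a normalized window with binary labels marking the synthetically injected anomaly points. The sine input and the seed are illustrative only:

    import numpy as np
    from srcnn.generate_data import gen

    np.random.seed(0)
    values = np.sin(np.linspace(0, 50, 1000)).tolist()
    generator = gen(win_siz=128, step=64, nums=10)
    samples = generator.generate_train_data(values)
    # each sample is [128 normalized values, 128 binary labels]
    print(len(samples), len(samples[0][0]), sum(samples[0][1]))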
--------------------------------------------------------------------------------
/srcnn/net.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) Microsoft Corporation. All rights reserved.
3 |
4 | Microsoft Corporation ("Microsoft") grants you a nonexclusive, perpetual,
5 | royalty-free right to use, copy, and modify the software code provided by us
6 | ("Software Code"). You may not sublicense the Software Code or any use of it
7 | (except to your affiliates and to vendors to perform work on your behalf)
8 | through distribution, network access, service agreement, lease, rental, or
9 | otherwise. This license does not purport to express any claim of ownership over
10 | data you may have shared with Microsoft in the creation of the Software Code.
11 | Unless applicable law gives you more rights, Microsoft reserves all other
12 | rights not expressly granted herein, whether by implication, estoppel or
13 | otherwise.
14 |
15 | THE SOFTWARE CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
21 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
22 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 | ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
24 | POSSIBILITY OF SUCH DAMAGE.
25 | """
26 | import torch
27 | import torch.utils.data
28 | from torch import nn, optim
29 | from torch.nn import functional as F
30 | from torchvision import datasets, transforms
31 | from torchvision.utils import save_image
32 |
33 | configs = [()]
34 |
35 |
36 | def make_layers(Bn=True, input=256):
37 | global configs
38 | layers = []
39 | layer = nn.Conv2d(input, input, kernel_size=1, stride=1, padding=0)
40 | layers.append(layer)
41 | if Bn:
42 | layers.append(nn.BatchNorm2d(input))
43 |
44 | for k, s, c in configs:
45 | if c == -1:
46 | layer = nn.Conv2d(kernel_size=k, stride=s, padding=0)
47 | else:
48 | now = []
49 | now.append(nn.Conv1d(input, c, kernel_size=k, stride=s, padding=0))
50 | input = c
51 |             if Bn:
52 |                 now.append(nn.BatchNorm2d(input))
53 |             now.append(nn.ReLU(inplace=True))
54 |             layer = nn.Sequential(*now)
55 | layers.append(layer)
56 | return nn.Sequential(*layers), input
57 |
58 |
59 | class trynet(nn.Module):
60 | def __init__(self):
61 | super(trynet, self).__init__()
62 | self.layer1 = nn.Conv1d(1, 128, kernel_size=128, stride=0, padding=0)
63 | self.layer2 = nn.BatchNorm1d(128)
64 |
65 | self.feature = make_layers()
66 |
67 |
68 | class Anomaly(nn.Module):
69 | def __init__(self, window=1024):
70 | self.window = window
71 | super(Anomaly, self).__init__()
72 | self.layer1 = nn.Conv1d(window, window, kernel_size=1, stride=1, padding=0)
73 | self.layer2 = nn.Conv1d(window, 2 * window, kernel_size=1, stride=1, padding=0)
74 | self.fc1 = nn.Linear(2 * window, 4 * window)
75 | self.fc2 = nn.Linear(4 * window, window)
76 | self.relu = nn.ReLU(inplace=True)
77 |
78 | def forward(self, x):
79 | x = x.view(x.size(0), self.window, 1)
80 | x = self.layer1(x)
81 | x = self.relu(x)
82 | x = self.layer2(x)
83 | x = x.view(x.size(0), -1)
84 | x = self.relu(x)
85 | x = self.fc1(x)
86 | x = self.relu(x)
87 | x = self.fc2(x)
88 | return torch.sigmoid(x)
89 |
90 |
91 | def save_model(model, model_path):
92 | try:
93 | torch.save(model.state_dict(), model_path)
94 | except:
95 | torch.save(model, model_path)
96 |
97 |
98 | def load_model(model, path):
99 | print("loading %s" % path)
100 | with open(path, 'rb') as f:
101 | pretrained = torch.load(f, map_location=lambda storage, loc: storage)
102 | model_dict = model.state_dict()
103 | pretrained = {k: v for k, v in pretrained.items() if k in model_dict}
104 | model_dict.update(pretrained)
105 | model.load_state_dict(model_dict)
106 | return model
107 |
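A shape-check sketch for the Anomaly network above (CPU is sufficient here; the batch size of 4 is arbitrary): the model maps a batch of saliency-map windows to per-point probabilities of the same length.

    import torch
    from srcnn.net import Anomaly

    net = Anomaly(window=128)
    x = torch.rand(4, 128)          # a batch of 4 windows of length 128
    out = net(x)                    # sigmoid output, one score per point
    print(out.shape)                # torch.Size([4, 128]), values in [0, 1]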
--------------------------------------------------------------------------------
/srcnn/train.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) Microsoft Corporation. All rights reserved.
3 |
4 | Microsoft Corporation ("Microsoft") grants you a nonexclusive, perpetual,
5 | royalty-free right to use, copy, and modify the software code provided by us
6 | ("Software Code"). You may not sublicense the Software Code or any use of it
7 | (except to your affiliates and to vendors to perform work on your behalf)
8 | through distribution, network access, service agreement, lease, rental, or
9 | otherwise. This license does not purport to express any claim of ownership over
10 | data you may have shared with Microsoft in the creation of the Software Code.
11 | Unless applicable law gives you more rights, Microsoft reserves all other
12 | rights not expressly granted herein, whether by implication, estoppel or
13 | otherwise.
14 |
15 | THE SOFTWARE CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
21 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
22 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 | ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
24 | POSSIBILITY OF SUCH DAMAGE.
25 | """
26 | import argparse
27 | from srcnn.utils import *
28 | import numpy as np
29 | import os
30 | import time
31 |
32 |
33 | def auto(epoch):
34 | path_auto = os.getcwd() + '/auto.json'
35 | with open(path_auto, 'r+') as f:
36 | store = json.load(f)
37 | data = store['data']
38 | window = store['window']
39 | store['epoch'] = epoch
40 | with open(path_auto, 'w+') as f:
41 | json.dump(store, f)
42 | return data, window
43 |
44 |
45 | if __name__ == '__main__':
46 | parser = argparse.ArgumentParser(description='SRCNN')
47 | parser.add_argument('--data', type=str, required=True, help='location of the data file')
48 | parser.add_argument('--window', type=int, default=128, help='window size')
49 |     parser.add_argument('--lr', type=float, default=1e-6, help='learning rate')
50 | parser.add_argument('--step', type=int, default=64, help='step')
51 |
52 | parser.add_argument('--seed', type=int, default=54321, help='random seed')
53 |     parser.add_argument('--load', type=str, default='', help='path of an existing model to load')
54 | parser.add_argument('--save', type=str, default='snapshot', help='path to save the model')
55 | parser.add_argument('--epoch', type=int, default=10)
56 |     parser.add_argument('--batch_size', type=int, default=256, help='batch size')
57 | parser.add_argument('--num_workers', type=int, default=8, help='number of workers of pytorch')
58 | parser.add_argument('--model', type=str, default='sr_cnn', help='model')
59 |     parser.add_argument('--auto', type=bool, default=False, help='automatically fill parameters from auto.json')
60 |
61 | args = parser.parse_args()
62 | if args.auto:
63 | data, window = auto(args.epoch)
64 | else:
65 | data, window = args.data, args.window
66 | torch.cuda.manual_seed(args.seed)
67 | np.random.seed(args.seed)
68 | models = {
69 | 'sr_cnn': sr_cnn,
70 | }
71 | model = args.model
72 | root_path = os.getcwd()
73 | train_data_path = root_path + '/' + data + '_' + str(window) + '_train.json'
74 | model_path = root_path + '/' + args.save + '/'
75 | if args.load:
76 | load_path = root_path + '/' + args.load
77 | else:
78 | load_path = None
79 |
80 | total_time = 0
81 | time_start = time.time()
82 | models[model](train_data_path, model_path, window, args.lr, args.epoch, args.batch_size, args.num_workers,
83 | load_path=load_path)
84 | time_end = time.time()
85 | total_time += time_end - time_start
86 | print('time used for training:', total_time, 'seconds')
87 |
--------------------------------------------------------------------------------
/srcnn/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) Microsoft Corporation. All rights reserved.
3 |
4 | Microsoft Corporation ("Microsoft") grants you a nonexclusive, perpetual,
5 | royalty-free right to use, copy, and modify the software code provided by us
6 | ("Software Code"). You may not sublicense the Software Code or any use of it
7 | (except to your affiliates and to vendors to perform work on your behalf)
8 | through distribution, network access, service agreement, lease, rental, or
9 | otherwise. This license does not purport to express any claim of ownership over
10 | data you may have shared with Microsoft in the creation of the Software Code.
11 | Unless applicable law gives you more rights, Microsoft reserves all other
12 | rights not expressly granted herein, whether by implication, estoppel or
13 | otherwise.
14 |
15 | THE SOFTWARE CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
21 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
22 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 | ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
24 | POSSIBILITY OF SUCH DAMAGE.
25 | """
26 |
27 | import pickle
28 | import csv
29 | import numpy as np
30 | import torch.nn as nn
31 | import torch.utils.data as data
32 | from torch.autograd import Variable
33 | from tqdm import tqdm
34 | from torch.utils.data import Dataset
35 | from srcnn.net import *
36 | import json
37 | from msanomalydetector.util import average_filter
38 | from msanomalydetector.spectral_residual import SpectralResidual
39 |
40 |
41 | def read_pkl(path):
42 | with open(path, 'rb') as f:
43 | return pickle.load(f)
44 |
45 |
46 | def read_csv_kpi(path):
47 | tm = []
48 | vl = []
49 | lb = []
50 | with open(path) as f:
51 | input = csv.reader(f, delimiter=',')
52 | cnt = 0
53 | for row in input:
54 | if cnt == 0:
55 | cnt += 1
56 | continue
57 | tm.append(int(row[0]))
58 | vl.append(float(row[1]))
59 | lb.append(int(row[2]))
60 | cnt += 1
61 | f.close()
62 | return tm, vl, lb
63 |
64 |
65 | def read_csv(path):
66 | tm = []
67 | vl = []
68 | with open(path, 'r+') as f:
69 | input = csv.reader(f, delimiter=',')
70 | cnt = 0
71 | for row in input:
72 | if cnt == 0:
73 | cnt += 1
74 | continue
75 | tm.append(cnt)
76 | vl.append(float(row[1]))
77 | f.close()
78 | return tm, vl
79 |
80 |
81 | def sr_cnn(data_path, model_path, win_size, lr, epochs, batch, num_worker, load_path=None):
82 | def adjust_lr(optimizer, epoch):
83 | base_lr = lr
84 | cur_lr = base_lr * (0.5 ** ((epoch + 10) // 10))
85 | for param in optimizer.param_groups:
86 | param['lr'] = cur_lr
87 |
88 | def Var(x):
89 | return Variable(x.cuda())
90 |
91 | def loss_function(x, lb):
92 | l2_reg = 0.
93 | l2_weight = 0.
94 | for W in net.parameters():
95 | l2_reg = l2_reg + W.norm(2)
96 | kpiweight = torch.ones(lb.shape)
97 | kpiweight[lb == 1] = win_size // 100
98 | kpiweight = kpiweight.cuda()
99 | BCE = F.binary_cross_entropy(x, lb, weight=kpiweight, reduction='sum')
100 | return l2_reg * l2_weight + BCE
101 |
102 | def calc(pred, true):
103 | TP = 0
104 | FP = 0
105 | TN = 0
106 | FN = 0
107 | for pre, gt in zip(pred, true):
108 | if gt == 1:
109 | if pre == 1:
110 | TP += 1
111 | else:
112 | FN += 1
113 | if gt == 0:
114 | if pre == 1:
115 | FP += 1
116 | else:
117 | TN += 1
118 | print('TP=%d FP=%d TN=%d FN=%d' % (TP, FP, TN, FN))
119 | return TP, FP, TN, FN
120 |
121 | def train(epoch, net, gen_set):
122 | train_loader = data.DataLoader(dataset=gen_set, shuffle=True, num_workers=num_worker, batch_size=batch,
123 | pin_memory=True)
124 | net.train()
125 | train_loss = 0
126 | totTP, totFP, totTN, totFN = 0, 0, 0, 0
127 | threshold = 0.5
128 | for batch_idx, (inputs, lb) in enumerate(tqdm(train_loader, desc="Iteration")):
129 | optimizer.zero_grad()
130 | inputs = inputs.float()
131 | lb = lb.float()
132 | valueseq = Var(inputs)
133 | lb = Var(lb)
134 | output = net(valueseq)
135 | if epoch > 110:
136 | aa = output.detach().cpu().numpy().reshape(-1)
137 | res = np.zeros(aa.shape, np.int64)
138 | res[aa > threshold] = 1
139 | bb = lb.detach().cpu().numpy().reshape(-1)
140 | TP, FP, TN, FN = calc(res, bb)
141 | totTP += TP
142 | totFP += FP
143 | totTN += TN
144 | totFN += FN
145 | if batch_idx % 100 == 0:
146 | print('TP=%d FP=%d TN=%d FN=%d' % (TP, FP, TN, FN))
147 | loss1 = loss_function(output, lb)
148 | loss1.backward()
149 | train_loss += loss1.item()
150 | optimizer.step()
151 | torch.nn.utils.clip_grad_norm(net.parameters(), 5.0)
152 | if batch_idx % 100 == 0:
153 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
154 | epoch, batch_idx * len(inputs), len(train_loader.dataset),
155 | 100. * batch_idx / len(train_loader),
156 | loss1.item() / len(inputs)))
157 |
158 | model = Anomaly(win_size)
159 | net = model.cuda()
160 | gpu_num = torch.cuda.device_count()
161 | net = torch.nn.DataParallel(net, list(range(gpu_num)))
162 | print(net)
163 | base_lr = lr
164 | bp_parameters = filter(lambda p: p.requires_grad, net.parameters())
165 | optimizer = optim.SGD(bp_parameters, lr=base_lr, momentum=0.9, weight_decay=0.0)
166 |
167 | if load_path != None:
168 | net = load_model(model, load_path)
169 | print("model loaded")
170 |
171 | gen_data = gen_set(win_size, data_path)
172 | for epoch in range(1, epochs + 1):
173 | print('epoch :', epoch)
174 | train(epoch, net, gen_data)
175 | adjust_lr(optimizer, epoch)
176 | if epoch % 5 == 0:
177 | save_model(model, model_path + 'srcnn_retry' + str(epoch) + '_' + str(win_size) + '.bin')
178 | return
179 |
180 |
181 | def fft(values):
182 | wave = np.array(values)
183 | trans = np.fft.fft(wave)
184 | realnum = np.real(trans)
185 | comnum = np.imag(trans)
186 | mag = np.sqrt(realnum ** 2 + comnum ** 2)
187 | mag += 1e-5
188 | spectral = np.exp(np.log(mag) - average_filter(np.log(mag)))
189 | trans.real = trans.real * spectral / mag
190 | trans.imag = trans.imag * spectral / mag
191 | wave = np.fft.ifft(trans)
192 | mag = np.sqrt(wave.real ** 2 + wave.imag ** 2)
193 | return mag
194 |
195 |
196 | def spectral_residual(values):
197 | """
198 |     This method transforms a time series into a spectral residual series
199 | :param values: list.
200 | a list of float values.
201 | :return: mag: list.
202 | a list of float values as the spectral residual values
203 | """
204 | EPS = 1e-8
205 | trans = np.fft.fft(values)
206 | mag = np.sqrt(trans.real ** 2 + trans.imag ** 2)
207 |
208 | maglog = [np.log(item) if abs(item) > EPS else 0 for item in mag]
209 |
210 | spectral = np.exp(maglog - average_filter(maglog, n=3))
211 |
212 | trans.real = [ireal * ispectral / imag if abs(imag) > EPS else 0
213 | for ireal, ispectral, imag in zip(trans.real, spectral, mag)]
214 | trans.imag = [iimag * ispectral / imag if abs(imag) > EPS else 0
215 | for iimag, ispectral, imag in zip(trans.imag, spectral, mag)]
216 |
217 | wave_r = np.fft.ifft(trans)
218 | mag = np.sqrt(wave_r.real ** 2 + wave_r.imag ** 2)
219 |
220 | return mag
221 |
222 |
223 | class gen_set(Dataset):
224 | def __init__(self, width, data_path):
225 | self.genlen = 0
226 | self.len = self.genlen
227 | self.width = width
228 | with open(data_path, 'r+') as fin:
229 | self.kpinegraw = json.load(fin)
230 | self.negrawlen = len(self.kpinegraw)
231 | print('length :', len(self.kpinegraw))
232 | self.len += self.negrawlen
233 | self.kpineglen = 0
234 | self.control = 0.
235 |
236 | def __len__(self):
237 | return self.len
238 |
239 | def __getitem__(self, index):
240 | idx = index % self.negrawlen
241 | datas = self.kpinegraw[idx]
242 | datas = np.array(datas)
243 | data = datas[0, :].astype(np.float64)
244 | lbs = datas[1, :].astype(np.float64)
245 | wave = spectral_residual(data)
246 | waveavg = average_filter(wave)
247 | for i in range(self.width):
248 | if wave[i] < 0.001 and waveavg[i] < 0.001:
249 | lbs[i] = 0
250 | continue
251 | ratio = wave[i] / waveavg[i]
252 | if ratio < 1.0 and lbs[i] == 1:
253 | lbs[i] = 0
254 | if ratio > 5.0:
255 | lbs[i] = 1
256 | srscore = abs(wave - waveavg) / (waveavg + 0.01)
257 | sortid = np.argsort(srscore)
258 | for idx in sortid[-2:]:
259 | if srscore[idx] > 5:
260 | lbs[idx] = 1
261 | resdata = torch.from_numpy(100 * wave)
262 | reslb = torch.from_numpy(lbs)
263 | return resdata, reslb
264 |
265 |
266 | def sr_cnn_eval(timestamp, value, label, window, net, ms_optioin, threshold=0.95, back_k=0, backaddnum=5, step=1):
267 | def Var(x):
268 | return Variable(x.cuda())
269 |
270 | def modelwork(x, net):
271 | with torch.no_grad():
272 | x = torch.from_numpy(100 * x).float()
273 | x = torch.unsqueeze(x, 0)
274 | x = Var(x)
275 | output = net(x)
276 | aa = output.detach().cpu().numpy().reshape(-1)
277 | res = np.zeros(aa.shape, np.int64)
278 | res[aa > threshold] = 1
279 | return res, aa
280 |
281 | win_size = window
282 | length = len(timestamp)
283 | if back_k <= 5:
284 | back = back_k
285 | else:
286 | back = 5
287 | detres = [0] * (win_size - backaddnum)
288 | scores = [0] * (win_size - backaddnum)
289 |
290 | for pt in range(win_size - backaddnum + back + step, length - back, step):
291 | head = max(0, pt - (win_size - backaddnum))
292 | tail = min(length, pt)
293 | wave = np.array(SpectralResidual.extend_series(value[head:tail + back]))
294 | mag = spectral_residual(wave)
295 | modeloutput, rawout = modelwork(mag, net)
296 | for ipt in range(pt - step - back, pt - back):
297 | detres.append(modeloutput[ipt - head])
298 | scores.append(rawout[ipt - head].item())
299 | detres += [0] * (length - len(detres))
300 | scores += [0] * (length - len(scores))
301 |
302 | if ms_optioin == 'anomaly':
303 | last = -1
304 | interval = min([timestamp[i] - timestamp[i - 1] for i in range(1, len(timestamp))])
305 | for i in range(1, len(timestamp)):
306 | if timestamp[i] - timestamp[i - 1] > interval:
307 | if last >= 0 and i - last < 1000:
308 | detres[i] = 1
309 | scores[i] = 1
310 | if detres[i] == 1:
311 | last = i
312 |
313 | return timestamp[:].tolist(), label[:], detres[:], scores[:]
314 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/anomalydetector/a3260ea0ddfb868986b924a245e003a97143f9df/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_boundary_utils.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import numpy as np
3 | from msanomalydetector import boundary_utils
4 |
5 |
6 | class TestBoundaryUnit(unittest.TestCase):
7 | def test_calculate_boundary_unit(self):
8 | data = [139809.0, 139706.0, 140562.0, 140534.0, 140568.0, 139934.0, 139392.0, 141714.0, 144167.0, 147127.0,
9 | 147450.0, 147991.0, 151621.0, 154912.0, 158443.0, 160899.0, 164170.0, 164339.0, 165780.0, 167373.0,
10 | 167654.0, 168863.0, 169472.0, 169830.0, 169632.0, 169028.0, 165843.0, 162517.0, 159335.0, 156503.0,
11 | 151731.0, 151612.0, 151911.0, 157120.0, 157027.0, 159949.0, 160263.0, 160073.0, 160001.0, 159721.0,
12 | 160138.0, 160292.0, 160280.0, 159822.0, 159482.0, 159384.0, 159038.0, 158901.0, 158899.0, 156036.0]
13 |
14 | is_anomaly = [False, False, False, False, False, False, False, False, False, False, False, False, False, False,
15 | False, False, False, False, False, False, False, False, False, False, False, False, False, False,
16 | False, False, True, True, True, False, False, False, False, False, False, False, False, False, False,
17 | False, False, False, False, False, False, False]
18 |
19 | expected_output = \
20 | [148560.58510638, 148567.58510638, 148574.58510638, 148576.08510638, 148577.58510638, 148864.08510638,
21 | 149150.58510638, 149763.83510638, 150377.08510638, 151857.08510638, 152018.58510638, 152289.08510638,
22 | 154104.08510638, 155749.58510638, 157515.08510638, 158743.08510638, 160378.58510638, 160463.08510638,
23 | 161183.58510638, 161183.58510638, 161183.58510638, 161183.58510638, 161183.58510638, 161183.58510638,
24 | 161183.58510638, 161183.58510638, 161183.58510638, 159552.08510638, 158425.08510638, 158330.08510638,
25 | 158294.08510638, 158268.08510638, 158268.08510638, 158268.08510638, 158268.08510638, 158204.58510638,
26 | 158154.08510638, 158154.08510638, 158154.08510638, 158154.08510638, 158154.08510638, 158154.08510638,
27 | 158179.33510638, 158204.58510638, 158179.33510638, 158154.08510638, 158094.33510638, 158034.58510638,
28 | 158010.08510638, 157985.58510638]
29 |
30 | actual_output = boundary_utils.calculate_boundary_unit_entire(np.asarray(data, dtype=float), is_anomaly)
31 | for e, v in zip(expected_output, actual_output):
32 | self.assertAlmostEqual(e, v)
33 |
34 | expected_last_unit = 156748.27551020408
35 | actual_last_unit = boundary_utils.calculate_boundary_unit_last(np.asarray(data, dtype=float))
36 | self.assertAlmostEqual(expected_last_unit, actual_last_unit)
37 |
38 | def test_calculate_boundary_unit_negative(self):
39 | data = [-21901.0, -31123.0, -33203.0, -33236.0, -54681.0, -112808.0, -5368.0, -40021.0, -35.0, -72593.0,
40 | -30880.0, -34597.0, -6210.0, -5508.0, -28892.0, -41091.0, -34916.0, -31941.0, -31084.0, -7379.0,
41 | -4883.0, -32563.0, -29919.0, -33599.0, -33019.0, -35218.0, -9520.0, -4454.0, -39660.0, -29634.0,
42 | -35751.0, -39912.0, -46940.0, -28969.0, -20196.0, -57031.0, -45264.0, -44059.0, -29180.0, -34642.0,
43 | -11041.0, -10455.0, -40181.0, -43345.0, -37045.0, -33232.0, -37800.0, -9240.0, -12108.0, -34654.0]
44 |
45 | is_anomaly = [False, False, False, False, False, True, False, False, False, True, False, False, False, False, False,
46 | False, False, False, False, False, False, False, False, False, False, False, False, False, False, False,
47 | False, False, False, False, False, False, False, False, False, False, False, False, False, False, False,
48 | False, False, False, False, False]
49 |
50 | expected_output = [
51 | 33250.48958333333, 33258.73958333333, 33250.48958333333, 33258.73958333333, 33250.48958333333,
52 | 32730.489583333332, 32210.489583333332, 32730.489583333332, 33250.48958333333, 33250.48958333333,
53 | 33250.48958333333, 32619.489583333332, 32190.989583333332, 32190.989583333332, 32088.989583333332,
54 | 32190.989583333332, 32190.989583333332, 32619.489583333332, 32190.989583333332, 32190.989583333332,
55 | 32190.989583333332, 32190.989583333332, 32619.489583333332, 32930.48958333333, 32930.48958333333,
56 | 32619.489583333332, 32190.989583333332, 32930.48958333333, 33158.48958333333, 33448.48958333333,
57 | 33448.48958333333, 33969.98958333333, 33969.98958333333, 33969.98958333333, 33969.98958333333,
58 | 34524.48958333333, 35171.48958333333, 34524.48958333333, 35171.48958333333, 35171.48958333333,
59 | 33969.98958333333, 33969.98958333333, 33972.98958333333, 33975.98958333333, 33972.98958333333,
60 | 33969.98958333333, 33617.48958333333, 33969.98958333333, 33620.48958333333, 33975.98958333333]
61 |
62 | actual_output = boundary_utils.calculate_boundary_unit_entire(np.asarray(data), is_anomaly)
63 | for e, v in zip(expected_output, actual_output):
64 | self.assertAlmostEqual(e, v)
65 |
66 | expected_last_unit = 33197.17346938775
67 | actual_last_unit = boundary_utils.calculate_boundary_unit_last(np.asarray(data))
68 | self.assertAlmostEqual(expected_last_unit, actual_last_unit)
69 |
70 | def test_calculate_margin(self):
71 | self.assertAlmostEqual(boundary_utils.calculate_margin(10, 0), 1843316.2871148242)
72 | self.assertAlmostEqual(boundary_utils.calculate_margin(10, 5), 502228.4038287002)
73 | self.assertAlmostEqual(boundary_utils.calculate_margin(10, 25), 3359.7473532360186)
74 | self.assertAlmostEqual(boundary_utils.calculate_margin(10, 95), 0.0014700521929794912)
75 | self.assertAlmostEqual(boundary_utils.calculate_margin(10, 99), 0.00016994687082728675)
76 | self.assertAlmostEqual(boundary_utils.calculate_margin(10, 100), 0.0)
77 | self.assertAlmostEqual(boundary_utils.calculate_margin(345969.3476, 79.7333448252325), 3762.3800000299298)
78 |
79 | def test_calculate_anomaly_score(self):
80 | self.assertAlmostEqual(boundary_utils.calculate_anomaly_score(10, 15, 5, False), 0)
81 | self.assertAlmostEqual(boundary_utils.calculate_anomaly_score(10, 15, 5, True), 0.5)
82 | self.assertAlmostEqual(boundary_utils.calculate_anomaly_score(10+1e-5, 10, 1, True), 0.005884191895350754)
83 | self.assertAlmostEqual(boundary_utils.calculate_anomaly_score(10+1e-7, 10, 1, True), 5.884191859812512e-05)
84 |
85 |
86 | if __name__ == '__main__':
87 | unittest.main()
88 |
--------------------------------------------------------------------------------
/tests/test_spectral_residual.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import pandas as pd
3 | import numpy as np
4 | from msanomalydetector import SpectralResidual, DetectMode
5 |
6 |
7 | class FunctionalityTest(unittest.TestCase):
8 | def test_anomaly_only_mode(self):
9 | frame = pd.DataFrame({'timestamp': pd.date_range('2020-01-01', periods=100, freq='1D'),
10 | 'value': np.linspace(1, 100, 100)})
11 | model = SpectralResidual(frame, threshold=0.3, mag_window=3, score_window=21, sensitivity=99,
12 | detect_mode=DetectMode.anomaly_only, batch_size=0)
13 | result = model.detect()
14 | self.assertEqual(result.shape[0], frame.shape[0])
15 | self.assertTrue('value' in result.columns)
16 | self.assertTrue('isAnomaly' in result.columns)
17 | self.assertTrue('score' in result.columns)
18 | self.assertTrue('expectedValue' not in result.columns)
19 | self.assertTrue('upperBoundary' not in result.columns)
20 | self.assertTrue('lowerBoundary' not in result.columns)
21 |
22 | def test_anomaly_and_margin_mode(self):
23 | frame = pd.DataFrame({'timestamp': pd.date_range('2020-01-01', periods=100, freq='1D'),
24 | 'value': np.linspace(1, 100, 100)})
25 | model = SpectralResidual(frame, threshold=0.3, mag_window=3, score_window=21, sensitivity=99,
26 | detect_mode=DetectMode.anomaly_and_margin, batch_size=0)
27 | result = model.detect()
28 | self.assertEqual(result.shape[0], frame.shape[0])
29 | self.assertTrue('value' in result.columns)
30 | self.assertTrue('isAnomaly' in result.columns)
31 | self.assertTrue('score' in result.columns)
32 | self.assertTrue('expectedValue' in result.columns)
33 | self.assertTrue('upperBoundary' in result.columns)
34 | self.assertTrue('lowerBoundary' in result.columns)
35 |
36 | def test_batch_mode(self):
37 | frame = pd.DataFrame({'timestamp': pd.date_range('2020-01-01', periods=100, freq='1D'),
38 | 'value': np.linspace(1, 100, 100)})
39 | model = SpectralResidual(frame, threshold=0.3, mag_window=3, score_window=21, sensitivity=99,
40 | detect_mode=DetectMode.anomaly_and_margin, batch_size=33)
41 | result = model.detect()
42 | self.assertEqual(result.shape[0], frame.shape[0])
43 | self.assertTrue('value' in result.columns)
44 | self.assertTrue('isAnomaly' in result.columns)
45 | self.assertTrue('score' in result.columns)
46 | self.assertTrue('expectedValue' in result.columns)
47 | self.assertTrue('upperBoundary' in result.columns)
48 | self.assertTrue('lowerBoundary' in result.columns)
49 |
50 |
51 | if __name__ == '__main__':
52 | unittest.main()
53 |
--------------------------------------------------------------------------------
/version.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Version string and parsed version tuple, kept in one place.
4 |
5 | """
6 | __version__ = '1.1'
7 | VERSION = tuple(int(x) for x in __version__.split('.'))
8 |
--------------------------------------------------------------------------------
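
Note (not part of the repository): a minimal usage sketch of the SpectralResidual API as exercised in tests/test_spectral_residual.py above. The constructor arguments, DetectMode values, and result columns follow the tests; the CSV path samples/sample.csv and its 'timestamp'/'value' column layout are assumptions.

# Minimal sketch, assuming samples/sample.csv contains 'timestamp' and 'value' columns.
import pandas as pd
from msanomalydetector import SpectralResidual, DetectMode

frame = pd.read_csv('samples/sample.csv', parse_dates=['timestamp'])

# Same parameters as the tests above; anomaly_and_margin mode also produces
# expectedValue / upperBoundary / lowerBoundary columns in the result frame.
model = SpectralResidual(frame, threshold=0.3, mag_window=3, score_window=21,
                         sensitivity=99, detect_mode=DetectMode.anomaly_and_margin,
                         batch_size=0)
result = model.detect()

# Show the points flagged as anomalous (astype(bool) tolerates a 0/1-encoded column).
print(result[result['isAnomaly'].astype(bool)].head())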