├── .gitattributes
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── SECURITY.md
├── aml_component
│   ├── README.md
│   ├── ad-pipeline.png
│   ├── ad_component.yaml
│   ├── conda.yaml
│   ├── constants.py
│   ├── error_messages.py
│   ├── invoker.py
│   ├── sr_detector.py
│   ├── tests
│   │   ├── __init__.py
│   │   ├── test_error_input.py
│   │   └── test_functionality.py
│   └── validation.py
├── main.py
├── msanomalydetector
│   ├── __init__.py
│   ├── _anomaly_kernel_cython.c
│   ├── _anomaly_kernel_cython.pyx
│   ├── boundary_utils.py
│   ├── spectral_residual.py
│   └── util.py
├── requirements.txt
├── samples
│   └── sample.csv
├── setup.py
├── srcnn
│   ├── competition_metric.py
│   ├── evalue.py
│   ├── generate_data.py
│   ├── net.py
│   ├── train.py
│   └── utils.py
├── tests
│   ├── __init__.py
│   ├── test_boundary_utils.py
│   └── test_spectral_residual.py
└── version.py

/.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text=auto 3 | 4 | # Use text conventions for commonly used text extensions. 5 | *.csv text 6 | *.ini text 7 | *.json text 8 | *.txt text 9 | *.xml text 10 | 11 | # Denote all files that are truly binary and should not be modified. 12 | *.dll binary 13 | *.exe binary 14 | *.gz binary 15 | *.ico binary 16 | *.jpg binary 17 | *.lib binary 18 | *.pdb binary 19 | *.pdf binary 20 | *.png binary 21 | *.wim binary 22 | *.zip binary 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Mac DS_Store files 2 | .DS_Store 3 | 4 | # Compiled class file 5 | *.class 6 | 7 | # Log file 8 | *.log 9 | 10 | # BlueJ files 11 | *.ctxt 12 | 13 | # Mobile Tools for Java (J2ME) 14 | .mtj.tmp/ 15 | 16 | # Package Files # 17 | *.jar 18 | *.war 19 | *.nar 20 | *.ear 21 | *.zip 22 | *.tar.gz 23 | *.rar 24 | 25 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 26 | hs_err_pid* 27 | 28 | # Byte-compiled / optimized / DLL files 29 | __pycache__/ 30 | *.py[cod] 31 | *$py.class 32 | 33 | # C extensions 34 | *.so 35 | 36 | # Distribution / packaging 37 | .Python 38 | build/ 39 | develop-eggs/ 40 | dist/ 41 | downloads/ 42 | eggs/ 43 | .eggs/ 44 | lib/ 45 | lib64/ 46 | parts/ 47 | sdist/ 48 | var/ 49 | .idea/ 50 | wheels/ 51 | *.egg-info/ 52 | .installed.cfg 53 | *.egg 54 | MANIFEST 55 | 56 | # PyInstaller 57 | # Usually these files are written by a python script from a template 58 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
59 | *.manifest 60 | *.spec 61 | 62 | # Installer logs 63 | pip-log.txt 64 | pip-delete-this-directory.txt 65 | 66 | # Unit test / coverage reports 67 | htmlcov/ 68 | .tox/ 69 | .coverage 70 | .coverage.* 71 | .cache 72 | nosetests.xml 73 | coverage.xml 74 | *.cover 75 | .hypothesis/ 76 | .pytest_cache/ 77 | 78 | # Translations 79 | *.mo 80 | *.pot 81 | 82 | # Django stuff: 83 | *.log 84 | local_settings.py 85 | db.sqlite3 86 | 87 | # Flask stuff: 88 | instance/ 89 | .webassets-cache 90 | 91 | # Scrapy stuff: 92 | .scrapy 93 | 94 | # Sphinx documentation 95 | docs/_build/ 96 | 97 | # PyBuilder 98 | target/ 99 | 100 | # Jupyter Notebook 101 | .ipynb_checkpoints 102 | 103 | # pyenv 104 | .python-version 105 | 106 | # celery beat schedule file 107 | celerybeat-schedule 108 | 109 | # SageMath parsed files 110 | *.sage.py 111 | 112 | # Environments 113 | .env 114 | .venv 115 | env/ 116 | venv/ 117 | ENV/ 118 | env.bak/ 119 | venv.bak/ 120 | 121 | # Spyder project settings 122 | .spyderproject 123 | .spyproject 124 | 125 | # Rope project settings 126 | .ropeproject 127 | 128 | # mkdocs documentation 129 | /site 130 | 131 | # mypy 132 | .mypy_cache/ 133 | 134 | # Mac DS_Store files 135 | .DS_Store 136 | # VS code 137 | .vscode 138 | 139 | # Ev2 Generator binaries 140 | bin 141 | packages 142 | debug/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include version.py 2 | include setup.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributing 3 | 4 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 5 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 6 | the rights to use your contribution. For details, visit https://cla.microsoft.com. 
7 | 8 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 9 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 10 | provided by the bot. You will only need to do this once across all repos using our CLA. 11 | 12 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 13 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 14 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 15 | 16 | Users can run SR by referring to the sample here: 17 | 18 | https://github.com/microsoft/anomalydetector/blob/master/main.py 19 | This sample runs SR only; for SR-CNN, please refer to the section below. Both SR and SR-CNN use the same evaluation code in evalue.py. 20 | 21 | 
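For reference, here is a minimal sketch of what main.py does, assuming the sample CSV provides `timestamp` and `value` columns (`batch_size=0` processes the whole series in one batch):

```python
# Minimal sketch mirroring main.py: load a sample series and run SR detection.
import pandas as pd
from msanomalydetector import SpectralResidual, THRESHOLD, MAG_WINDOW, SCORE_WINDOW, DetectMode

series = pd.read_csv('samples/sample.csv')  # needs 'timestamp' and 'value' columns
detector = SpectralResidual(series=series, threshold=THRESHOLD, mag_window=MAG_WINDOW,
                            score_window=SCORE_WINDOW, sensitivity=99,
                            detect_mode=DetectMode.anomaly_only, batch_size=0)
print(detector.detect())  # DataFrame with per-point mag, score and isAnomaly flags
```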
22 | The SR-CNN project consists of three major parts.<br> 1. `generate_data.py` is used to preprocess the data: the original continuous time series are split according to the window size, and artificial outliers are injected in proportion.<br>
23 | ` 24 | python generate_data.py --data <dataset> 25 | `<br>
26 | where `<dataset>` is the file name under the data folder. If you want to change the default config, you can use the command-line args:<br>
27 | ` 28 | python generate_data.py --data <dataset> --window 256 --step 128 29 | `<br>
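Conceptually, the preprocessing slides a fixed-size window over each series and injects synthetic outliers into a small fraction of points, keeping labels for training. The sketch below is purely illustrative; the helper name and the injection rule are simplifications, not the exact logic of generate_data.py:

```python
# Illustrative sliding-window split with synthetic outlier injection.
import numpy as np

def split_and_inject(values, window=256, step=128, ratio=0.05, seed=0):
    rng = np.random.default_rng(seed)
    samples = []
    for start in range(0, len(values) - window + 1, step):
        seg = values[start:start + window].copy()
        labels = np.zeros(window, dtype=int)
        idx = rng.choice(window, max(1, int(window * ratio)), replace=False)
        # Push the chosen points far outside the local distribution.
        seg[idx] += (np.abs(seg.mean()) + 3 * seg.std()) * rng.choice([-1, 1], idx.size)
        labels[idx] = 1
        samples.append((seg, labels))
    return samples
```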
30 | 2. `train.py` is the network training module of SR-CNN. The SR transform is applied to each time series before training (see the sketch after the command below).<br>
31 | ` 32 | python train.py --data <dataset> 33 | `<br>
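The SR transform takes the log amplitude spectrum of the series, subtracts its local average to obtain the spectral residual, and inverts back to the time domain to produce a saliency map. Below is a compact sketch of that idea; the library's own implementation is `spectral_residual_transform` in msanomalydetector/spectral_residual.py:

```python
# Compact sketch of the spectral residual (SR) transform from the paper.
import numpy as np

def spectral_residual(values, mag_window=3):
    eps = 1e-8
    trans = np.fft.fft(values)
    mag = np.sqrt(trans.real ** 2 + trans.imag ** 2)
    log_mag = np.log(mag + eps)
    # Local average of the log spectrum; the residual is what remains.
    avg_log_mag = np.convolve(log_mag, np.ones(mag_window) / mag_window, mode='same')
    residual = np.exp(log_mag - avg_log_mag)
    # Rescale the spectrum by the residual and invert to get the saliency map.
    trans.real = trans.real * residual / (mag + eps)
    trans.imag = trans.imag * residual / (mag + eps)
    saliency = np.fft.ifft(trans)
    return np.sqrt(saliency.real ** 2 + saliency.imag ** 2)
```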
34 | 3. `evalue.py` is the evaluation module. As mentioned in our paper:<br>
35 | ` 36 | We evaluate our model from three aspects: accuracy, efficiency and generality. We use precision, recall and F1-score to indicate the accuracy of our model. In real applications, the human operators do not care about the point-wise metrics. It is acceptable for an algorithm to trigger an alert for any point in a contiguous anomaly segment if the delay is not too long. Thus, we adopt the evaluation strategy following [23]. We mark the whole segment of continuous anomalies as a positive sample, which means that no matter how many anomalies have been detected in this segment, only one effective detection will be counted. If any point in an anomaly segment can be detected by the algorithm, and the delay of this point is no more than k from the start point of the anomaly segment, we say this segment is detected correctly. Thus, all points in this segment are treated as correct, and the points outside the anomaly segments are treated as normal. 37 | `<br>
38 | We set different delays to verify whether a whole segment of anomalies can be detected in time. For example, when delay = 7, if the anomaly detector can issue an alarm within the first 7 points of an anomaly segment, the entire segment is considered successfully detected; otherwise it is considered missed.<br>
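Here is a minimal sketch of this delay-aware, segment-level adjustment; the function name is illustrative, and the repository's actual evaluation utilities live in srcnn/competition_metric.py:

```python
# Adjust point-wise predictions so that a true anomaly segment counts as
# detected only if some prediction fires within `delay` points of its start;
# the whole segment is then scored uniformly.
def adjust_predictions(labels, predictions, delay=7):
    adjusted = list(predictions)
    i = 0
    while i < len(labels):
        if labels[i] == 1:                       # start of a true anomaly segment
            j = i
            while j < len(labels) and labels[j] == 1:
                j += 1                           # j is one past the segment end
            hit = any(predictions[i:min(i + delay, j)])
            for k in range(i, j):
                adjusted[k] = 1 if hit else 0
            i = j
        else:
            i += 1                               # points outside segments stay as-is
    return adjusted
```

Precision, recall and F1 are then computed on the adjusted predictions.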
39 | Run the code:
40 | ` 41 | python evalue.py --data <dataset> 42 | `<br>
43 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /aml_component/README.md: -------------------------------------------------------------------------------- 1 | # Spectral Residual Anomaly Detection Component 2 | 3 | This folder specifies the Spectral Residual Anomaly Detection component that can be used in Azure Machine Learning designer. The details of the Spectral Residual algorithm can be found at https://arxiv.org/pdf/1906.03821.pdf. 
4 | 5 | ## Component Specification 6 | 7 | This section describes the specification of the [Spectral Residual Anomaly Detection Component](./ad_component.yaml). 8 | 9 | ### Input Specification 10 | 11 | * `Input`. The AnyDirectory type means you need to register your dataset as a **File dataset** in the workspace. The dataset should contain at least 12 rows. Each row should contain a timestamp column and one or more columns that are to be detected. 12 | * `Detect Mode`. The following two detect modes are supported. 13 | 1. `AnomalyOnly`. In this mode, the module outputs the columns `isAnomaly`, `mag` and `score`. 14 | 2. `AnomalyAndMargin`. In this mode, the module outputs the columns `isAnomaly`, `mag`, `score`, `expectedValue`, `lowerBoundary` and `upperBoundary`. 15 | * `Timestamp Column`. The column that contains the timestamps. The timestamps should be in ascending order, and duplicate timestamps are not allowed. 16 | * `Value Column`. One or more columns that are to be detected. The data in these columns should be numeric; absolute values greater than 1e100 are not allowed. 17 | * `Batch Size`. The number of rows to be detected in each batch. The batch size should be at least 12. Set this parameter to 0 or a negative number to detect all rows in one batch. 18 | * `Threshold`. In AnomalyOnly mode, a point is detected as an anomaly if its `score` is greater than the threshold. In AnomalyAndMargin mode, this parameter and `sensitivity` work together to filter anomalies. 19 | * `Sensitivity`. This parameter is used in AnomalyAndMargin mode to determine the range of the boundaries. 20 | * `Append result column to output`. If this parameter is set, the input dataset will be output together with the results. Otherwise, only the results will be output. 21 | 22 | ### Output Specification 23 | The output dataset will contain a subset of the following columns according to the `Detect Mode` parameter. If multiple value columns are selected, each value column name is appended to the result column names as a suffix, as illustrated in the sketch below. 24 | * `isAnomaly`. The anomaly result. 25 | * `mag`. The magnitude after the spectral residual transformation. 26 | * `score`. A value that indicates the significance of the anomaly. 27 | In AnomalyAndMargin mode, the following columns are output in addition to the above three columns. 28 | * `expectedValue`. The expected value of each point. 29 | * `lowerBoundary`. The lowest value at each point that the algorithm tolerates as not anomalous. 30 | * `upperBoundary`. The highest value at each point that the algorithm tolerates as not anomalous.
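As an illustration of the suffixing, the hypothetical local call into the detection core (sr_detector.py) below uses two made-up value columns `v1` and `v2`; running it requires the azureml packages from conda.yaml, because plots are logged to the run context:

```python
# Hypothetical local run of sr_detector.detect with two value columns; the
# result columns come back as isAnomaly_v1, mag_v1, score_v1, isAnomaly_v2, ...
import numpy as np
import pandas as pd
import sr_detector

timestamps = pd.date_range('2020-01-01', periods=24, freq='1D')
values = pd.DataFrame({'v1': np.sin(np.linspace(0, 6, 24)),
                       'v2': np.cos(np.linspace(0, 6, 24))})
result = sr_detector.detect(timestamps, values, detect_mode='AnomalyOnly',
                            batch_size=0, threshold=0.3, sensitivity=99)
print(result.columns)
```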
31 | 32 | ## How to create a new component in Azure Machine Learning 33 | 34 | Follow [this tutorial](https://github.com/Azure/AzureMachineLearningGallery/blob/main/tutorial/tutorial1-use-existing-components.md) to create a new component in your Azure Machine Learning workspace. 35 | 36 | After creating the component successfully, you can use it in Azure Machine Learning designer. 37 | 38 | ## How to build a pipeline in AML designer 39 | 40 | 1. Prepare input dataset for the component. 41 | Register this [sample AnomalyDetector-Manufacture dataset](https://github.com/microsoft/Cognitive-Samples-IntelligentKiosk/blob/master/Kiosk/Assets/AnomalyDetector/AnomalyDetector-Manufacture.csv) as a **Tabular dataset** in your Azure Machine Learning workspace. 42 | 43 | The dataset above is a sample dataset. You can use your own dataset; make sure that it is registered as a Tabular dataset, and you can also preprocess it using Designer built-in modules. Make sure that the input dataset of **Spectral Residual Anomaly Detection** has the following format, and that each time series contains more than 12 points: 44 | 45 | |Timestamp|Value| 46 | |---|---| 47 | |2018/7/1 0:00|22| 48 | |2018/7/1 2:00|22| 49 | |2018/7/1 4:00|22| 50 | |2018/7/1 6:00|22| 51 | |2018/7/1 8:00|52.93218322| 52 | |2018/7/1 10:00|52.81943684| 53 | |2018/7/1 12:00|52.33277765| 54 | |2018/7/1 14:00|52.82106858| 55 | |2018/7/1 16:00|52.93218322| 56 | |2018/7/1 18:00|22| 57 | |2018/7/1 20:00|22| 58 | |2018/7/1 22:00|22| 59 | |2018/7/2 0:00|22| 60 | |2018/7/2 2:00|22| 61 | |2018/7/2 4:00|22| 62 | |2018/7/2 6:00|22| 63 | 64 | 1. Open AML designer, create a new pipeline draft and drag the registered dataset onto the canvas. 65 | 66 | Add **Spectral Residual Anomaly Detection** to the canvas, connect it to the dataset, and configure the parameters. The pipeline graph looks like the following: 67 | 68 | ![](./ad-pipeline.png) 69 | 70 | 1. Submit the pipeline. 71 | 1. When the pipeline run completes, you can click the **Visualize** icon in the **Outputs+logs** tab in the right panel of the **Spectral Residual Anomaly Detection** module, or right-click the module and select **Visualize**. 72 | 73 | -------------------------------------------------------------------------------- /aml_component/ad-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/anomalydetector/a3260ea0ddfb868986b924a245e003a97143f9df/aml_component/ad-pipeline.png -------------------------------------------------------------------------------- /aml_component/ad_component.yaml: -------------------------------------------------------------------------------- 1 | $schema: http://azureml/sdk-2-0/CommandComponent.json 2 | name: microsoft.com.office.spectral.residual.anomaly.detection 3 | version: 1.1.1 4 | display_name: Spectral Residual Anomaly Detection 5 | is_deterministic: True 6 | type: CommandComponent 7 | description: This module implements the spectral residual anomaly detection algorithm for time-series. 8 | tags: 9 | time series: '' 10 | anomaly detection: '' 11 | inputs: 12 | dataset: 13 | type: DataFrameDirectory 14 | optional: False 15 | detect_mode: 16 | type: Enum 17 | optional: False 18 | default: AnomalyOnly 19 | description: Specify the detection mode. 20 | enum: 21 | - AnomalyOnly 22 | - AnomalyAndMargin 23 | timestamp_column: 24 | type: String 25 | optional: False 26 | description: Choose the column that contains timestamps. 27 | value_column: 28 | type: String 29 | optional: False 30 | description: Choose the column that contains values. 31 | batch_size: 32 | type: Integer 33 | optional: False 34 | default: 2000 35 | description: This parameter specifies the size of each batch on which the detection is performed; 0 indicates running all data in a single batch. 36 | min: 0 37 | threshold: 38 | type: Float 39 | optional: False 40 | default: 0.3 41 | description: This parameter specifies the threshold anomaly score above which a point is judged as an anomaly. 42 | min: 0.0 43 | max: 1.0 44 | sensitivity: 45 | type: Float 46 | optional: False 47 | default: 99 48 | description: This parameter is used in AnomalyAndMargin mode to control the width of margin. 
49 | min: 0.0 50 | max: 100.0 51 | append_result_columns_to_output: 52 | type: Boolean 53 | optional: False 54 | default: True 55 | description: Append result columns to the original columns as output 56 | compute_stats_in_visualization: 57 | type: Boolean 58 | optional: False 59 | default: True 60 | description: Compute stats in visualization 61 | outputs: 62 | output_port: 63 | type: DataFrameDirectory 64 | environment: 65 | conda: 66 | conda_dependencies: 67 | name: project_environment 68 | channels: 69 | - defaults 70 | dependencies: 71 | - python=3.6.8 72 | - cython=0.29.2 73 | - numpy=1.18.1 74 | - pip=20.0 75 | - pip: 76 | - azureml-sdk==0.1.0.* 77 | - azureml-designer-core==0.0.31 78 | - --index-url https://azuremlsdktestpypi.azureedge.net/dev/aml/office/134157926D8F 79 | - --extra-index-url https://pypi.org/simple 80 | - pandas==0.25.3 81 | - pyarrow==0.16.0 82 | - matplotlib==3.1.0 83 | - git+https://github.com/microsoft/anomalydetector.git@1.1 84 | docker: 85 | image: mcr.microsoft.com/azureml/base:intelmpi2018.3-ubuntu16.04 86 | os: Linux 87 | command: python invoker.py --input {inputs.dataset} --detect-mode {inputs.detect_mode} --timestamp-column {inputs.timestamp_column} --value-column {inputs.value_column} --batch-size {inputs.batch_size} --threshold {inputs.threshold} --sensitivity {inputs.sensitivity} --append-mode {inputs.append_result_columns_to_output} --compute_stats_in_visualization {inputs.compute_stats_in_visualization} --output {outputs.output_port} 88 | ... -------------------------------------------------------------------------------- /aml_component/conda.yaml: -------------------------------------------------------------------------------- 1 | name: project_environment 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.6.8 6 | - cython=0.29.2 7 | - numpy=1.18.1 8 | - pip: 9 | - azureml-sdk==0.1.0.* 10 | - --index-url https://azuremlsdktestpypi.azureedge.net/dev/aml/office/134157926D8F 11 | - --extra-index-url https://pypi.org/simple 12 | - pandas==0.25.3 13 | - pyarrow==0.16.0 14 | - matplotlib==3.1.0 15 | - git+https://github.com/microsoft/anomalydetector.git@1.1 16 | -------------------------------------------------------------------------------- /aml_component/constants.py: -------------------------------------------------------------------------------- 1 | VALUE_LOWER_BOUND = -1.0e100 2 | VALUE_UPPER_BOUND = 1.0e100 3 | MIN_POINTS = 12 4 | -------------------------------------------------------------------------------- /aml_component/error_messages.py: -------------------------------------------------------------------------------- 1 | InvalidTimestamps = '''The timestamp column specified is malformed.''' 2 | InvalidSeriesOrder = '''The timestamp column specified is not in ascending order.''' 3 | DuplicateSeriesTimestamp = '''The timestamp column specified has duplicated timestamps.''' 4 | InvalidValueFormat = '''The data in column "{0}" can not be parsed as float values.''' 5 | InvalidSeriesValue = '''The data in column "{0}" contains nan values.''' 6 | ValueOverflow = '''The magnitude of data in column "{0}" exceeds limitation.''' 7 | NotEnoughPoints = '''The dataset should contain at least {0} points to run this module.''' 8 | InvalidBatchSize = '''The "batchSize" parameter should be at least {0} or 0 ''' \ 9 | '''that indicates to run all data in a batch.''' 10 | ColumnNotFoundError = '''Column with name or index "{0}" not found.''' 11 | -------------------------------------------------------------------------------- 
/aml_component/invoker.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import pathlib 5 | import sr_detector 6 | import numpy as np 7 | import pandas as pd 8 | from error_messages import * 9 | from constants import * 10 | from azureml.studio.core.io.data_frame_directory import load_data_frame_from_directory, save_data_frame_to_directory 11 | 12 | PACKAGE_NAME = 'spectral_residual_anomaly_detection_module' 13 | VERSION = '1.0.0' 14 | 15 | 16 | def str2bool(v): 17 | if isinstance(v, bool): 18 | return v 19 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 20 | return True 21 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 22 | return False 23 | else: 24 | raise argparse.ArgumentTypeError('Boolean value expected.') 25 | 26 | 27 | def is_timestamp_ascending(timestamps): 28 | count = len(timestamps) 29 | 30 | for i in range(count - 1): 31 | if timestamps[i] > timestamps[i + 1]: 32 | return -1 33 | elif timestamps[i] == timestamps[i + 1]: 34 | return -2 35 | return 0 36 | 37 | 38 | def invoke(input_path, detect_mode, timestamp_column, value_column, batch_size, threshold, sensitivity, 39 | appendMode, compute_stats_in_visualization, output_path): 40 | df = load_data_frame_from_directory(input_path).data 41 | logging.info(f"Shape of loaded DataFrame: {df.shape}") 42 | 43 | if df.shape[0] < MIN_POINTS: 44 | raise Exception(NotEnoughPoints.format(MIN_POINTS)) 45 | 46 | if 0 < batch_size < MIN_POINTS: 47 | raise Exception(InvalidBatchSize.format(MIN_POINTS)) 48 | 49 | if timestamp_column not in list(df.columns): 50 | raise Exception(ColumnNotFoundError.format(timestamp_column)) 51 | 52 | if value_column not in list(df.columns): 53 | raise Exception(ColumnNotFoundError.format(value_column)) 54 | 55 | timestamp = pd.DataFrame(df, columns=[timestamp_column]) 56 | timestamps = pd.to_datetime(timestamp.iloc[:, 0].values) 57 | 58 | if np.any(np.isnat(timestamps)): 59 | raise Exception(InvalidTimestamps) 60 | 61 | res = is_timestamp_ascending(timestamps) 62 | 63 | if res == -1: 64 | raise Exception(InvalidSeriesOrder) 65 | elif res == -2: 66 | raise Exception(DuplicateSeriesTimestamp) 67 | 68 | data_columns = pd.DataFrame(df, columns=[value_column]) 69 | 70 | for col in data_columns: 71 | try: 72 | float_data = data_columns[col].apply(float) 73 | except Exception as e: 74 | raise Exception(InvalidValueFormat.format(col)) 75 | 76 | if not np.all(np.isfinite(float_data)): 77 | raise Exception(InvalidSeriesValue.format(col)) 78 | 79 | if np.any(np.less(float_data, VALUE_LOWER_BOUND)) or np.any(np.greater(float_data, VALUE_UPPER_BOUND)): 80 | raise Exception(ValueOverflow.format(col)) 81 | 82 | data_columns[col] = float_data 83 | 84 | result = sr_detector.detect(timestamps, data_columns, detect_mode=detect_mode, 85 | batch_size=batch_size, threshold=threshold, sensitivity=sensitivity) 86 | 87 | if appendMode is True: 88 | result = pd.merge(df, result, left_index=True, right_index=True) 89 | 90 | save_data_frame_to_directory(output_path, result, compute_stats_in_visualization=compute_stats_in_visualization) 91 | 92 | def main(): 93 | parser = argparse.ArgumentParser() 94 | 95 | parser.add_argument( 96 | '--input-path', 97 | help='Input Dataframe path' 98 | ) 99 | 100 | parser.add_argument( 101 | '--detect-mode', 102 | choices=['AnomalyOnly', 'AnomalyAndMargin'], 103 | help='Specify the detect mode.' 
104 | ) 105 | 106 | parser.add_argument( 107 | '--timestamp-column', 108 | help='This parameter specifies the column that contains timestamps.' 109 | ) 110 | 111 | parser.add_argument( 112 | '--value-column', 113 | help='This parameter specifies the column that contains values.' 114 | ) 115 | 116 | parser.add_argument( 117 | '--batch-size', type=int, 118 | help='This parameter specifies the size of each batch on which the detection is performed.' 119 | ) 120 | 121 | parser.add_argument( 122 | '--threshold', type=float, 123 | help='This parameter specifies the threshold anomaly score above which a point is judged as an anomaly.' 124 | ) 125 | 126 | parser.add_argument( 127 | '--sensitivity', type=float, 128 | help='This parameter is used in AnomalyAndMargin mode to control the width of margin.' 129 | ) 130 | 131 | parser.add_argument( 132 | '--append-mode', type=str2bool, default=False, 133 | help='If true, append the result columns to the input columns as output.' 134 | ) 135 | 136 | parser.add_argument( 137 | '--compute-stats-in-visualization', type=str2bool, default=True, 138 | help='Enable this parameter to get stats visualization.' 139 | ) 140 | 141 | parser.add_argument( 142 | '--output-path', 143 | help='Output Dataframe path' 144 | ) 145 | 146 | args, _ = parser.parse_known_args() 147 | 148 | logging.info(f"Hello world from {PACKAGE_NAME} {VERSION}") 149 | 150 | logging.debug("Received parameters:") 151 | logging.debug(f"input: {args.input_path}") 152 | logging.debug(f"detect mode: {args.detect_mode}") 153 | logging.debug(f"timestamp column: {args.timestamp_column}") 154 | logging.debug(f"value column: {args.value_column}") 155 | logging.debug(f"batch size: {args.batch_size}") 156 | logging.debug(f"threshold: {args.threshold}") 157 | logging.debug(f"sensitivity: {args.sensitivity}") 158 | logging.debug(f"appendMode: {args.append_mode}") 159 | logging.debug(f"output path: {args.output_path}") 160 | 161 | invoke(args.input_path, args.detect_mode, args.timestamp_column, args.value_column, 162 | args.batch_size, args.threshold, args.sensitivity, args.append_mode, 163 | args.compute_stats_in_visualization, args.output_path) 164 | 165 | 166 | if __name__ == '__main__': 167 | main() 168 | -------------------------------------------------------------------------------- /aml_component/sr_detector.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from msanomalydetector import SpectralResidual, DetectMode 3 | import matplotlib 4 | import matplotlib.pyplot as plt 5 | import logging 6 | from azureml.core.run import Run 7 | import os 8 | 9 | 10 | def log_plot_result(input_df, output_df, col_name, mode): 11 | fig = plt.figure(figsize=(20, 10)) 12 | ax1 = fig.add_subplot(211) 13 | if mode == 'AnomalyAndMargin': 14 | ax1.fill_between(output_df.index, output_df['lowerBoundary'], output_df['upperBoundary'], color='grey', alpha=0.2, zorder=1) 15 | ax1.plot(output_df.index, output_df['expectedValue'], alpha=0.5, label='expected value', zorder=8) 16 | ax1.plot(input_df.index, input_df['value'], label='value', zorder=5) 17 | ax1.legend() 18 | anomalies = input_df[output_df['isAnomaly']] 19 | ax1.scatter(anomalies.index, anomalies['value'], c='red', zorder=10) 20 | ax1.set_title(col_name) 21 | 22 | ax2 = fig.add_subplot(212) 23 | ax2.plot(output_df.index, output_df['mag']) 24 | ax2.set_title('mag') 25 | 26 | run = Run.get_context() 27 | run.log_image(col_name, plot=plt) 28 | 29 | 30 | def sr_detect(frame, detect_mode, batch_size, threshold, 
sensitivity): 31 | model = SpectralResidual(frame, threshold=threshold, mag_window=3, score_window=40, 32 | sensitivity=sensitivity, detect_mode=DetectMode(detect_mode), batch_size=batch_size) 33 | result = model.detect() 34 | 35 | if detect_mode == DetectMode.anomaly_and_margin.value: 36 | return result[['isAnomaly', 'mag', 'score', 'expectedValue', 'lowerBoundary', 'upperBoundary']] 37 | return result[['isAnomaly', 'mag', 'score']] 38 | 39 | 40 | def detect(timestamp, data_to_detect, detect_mode, batch_size, threshold=0.3, sensitivity=99): 41 | 42 | column_length = len(data_to_detect.columns) 43 | if column_length == 1: 44 | logging.debug('single column to detect') 45 | 46 | frame = pd.DataFrame(columns=['timestamp', 'value']) 47 | frame['timestamp'] = timestamp 48 | frame['value'] = data_to_detect.iloc[:, 0] 49 | output = sr_detect(frame, detect_mode, batch_size, threshold, sensitivity) 50 | log_plot_result(frame, output, data_to_detect.columns[0], detect_mode) 51 | else: 52 | logging.debug(f'detect {column_length} columns') 53 | output = pd.DataFrame() 54 | 55 | for col in data_to_detect.columns: 56 | frame = pd.DataFrame(columns=['timestamp', 'value']) 57 | frame['timestamp'] = timestamp 58 | frame['value'] = data_to_detect[col] 59 | result = sr_detect(frame, detect_mode, batch_size, threshold, sensitivity) 60 | log_plot_result(frame, result, col, detect_mode) 61 | result.columns = [f'{rc}_{col}' for rc in result.columns] 62 | output = pd.concat((output, result), axis=1) 63 | 64 | return output 65 | -------------------------------------------------------------------------------- /aml_component/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/anomalydetector/a3260ea0ddfb868986b924a245e003a97143f9df/aml_component/tests/__init__.py -------------------------------------------------------------------------------- /aml_component/tests/test_error_input.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../') 3 | 4 | import unittest 5 | import numpy as np 6 | import pandas as pd 7 | import shutil 8 | import os 9 | import invoker 10 | 11 | 12 | class TestErrorInput(unittest.TestCase): 13 | def setUp(self): 14 | self.__input_path = './error_test_input_file.csv' 15 | self.__detect_mode = 'AnomalyOnly' 16 | self.__timestamp_column = 'timestamp' 17 | self.__value_column = 'value' 18 | self.__batch_size = 2000 19 | self.__threshold = 0.3 20 | self.__sensitivity = 99 21 | self.__append_mode = True 22 | self.compute_stats_in_visualization = False 23 | self.__output_path = './error_test_output_directory' 24 | 25 | def tearDown(self): 26 | self.deleteDataFrameDirectory() 27 | 28 | def deleteDataFrameDirectory(self): 29 | if os.path.exists(self.__input_path): 30 | os.remove(self.__input_path) 31 | 32 | if os.path.exists(self.__output_path): 33 | shutil.rmtree(self.__output_path) 34 | 35 | def test_empty_input(self): 36 | df = pd.DataFrame() 37 | df.to_csv(self.__input_path) 38 | self.assertRaisesRegexp(Exception, "The dataset should contain at least 12 points to run this module.", 39 | invoker.invoke, 40 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column, 41 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, 42 | self.__output_path) 43 | 44 | def test_invalid_timestamp(self): 45 | df = pd.DataFrame() 46 | df['timestamp'] = 'invalid' 47 | df['value'] = np.ones(20) 48 | 
df.to_csv(self.__input_path) 49 | self.assertRaisesRegexp(Exception, "The timestamp column specified is malformed.", 50 | invoker.invoke, 51 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column, 52 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, 53 | self.__output_path) 54 | 55 | def test_invalid_series_order(self): 56 | df = pd.DataFrame() 57 | timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D')[::-1] 58 | df['timestamp'] = timestamps 59 | df['value'] = np.ones(20) 60 | df.to_csv(self.__input_path) 61 | self.assertRaisesRegexp(Exception, "The timestamp column specified is not in ascending order.", 62 | invoker.invoke, 63 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column, 64 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, 65 | self.__output_path) 66 | 67 | def test_duplicate_series(self): 68 | df = pd.DataFrame() 69 | df['value'] = np.ones(20) 70 | df['timestamp'] = '2020-01-01' 71 | df.to_csv(self.__input_path) 72 | self.assertRaisesRegexp(Exception, "The timestamp column specified has duplicated timestamps.", 73 | invoker.invoke, 74 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column, 75 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, 76 | self.__output_path) 77 | 78 | def test_invalid_value_format(self): 79 | df = pd.DataFrame() 80 | timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D') 81 | df['timestamp'] = timestamps 82 | df['value'] = 'invalid' 83 | df.to_csv(self.__input_path) 84 | self.assertRaisesRegexp(Exception, 'The data in column "value" can not be parsed as float values.', 85 | invoker.invoke, 86 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column, 87 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, 88 | self.__output_path) 89 | 90 | def test_invalid_series_value(self): 91 | df = pd.DataFrame() 92 | timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D') 93 | df['timestamp'] = timestamps 94 | df['value'] = np.nan 95 | df.to_csv(self.__input_path) 96 | self.assertRaisesRegexp(Exception, 'The data in column "value" contains nan values.', 97 | invoker.invoke, 98 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column, 99 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, 100 | self.__output_path) 101 | 102 | def test_value_overflow(self): 103 | df = pd.DataFrame() 104 | timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D') 105 | df['timestamp'] = timestamps 106 | df['value'] = 1e200 107 | df.to_csv(self.__input_path) 108 | self.assertRaisesRegexp(Exception, 'The magnitude of data in column "value" exceeds limitation.', 109 | invoker.invoke, 110 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column, 111 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, 112 | self.__output_path) 113 | 114 | def test_not_enough_points(self): 115 | df = pd.DataFrame() 116 | timestamps = pd.date_range(start='2020-01-01', periods=10, freq='1D') 117 | df['timestamp'] = timestamps 118 | df['value'] = np.sin(np.linspace(1, 10, 10)) 119 | df.to_csv(self.__input_path) 120 | self.assertRaisesRegexp(Exception, "The dataset should contain at least 12 points to run this module.", 121 | invoker.invoke, 122 | self.__input_path, self.__detect_mode, self.__timestamp_column, 
self.__value_column, 123 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, 124 | self.__output_path) 125 | 126 | def test_invalid_batch_size(self): 127 | df = pd.DataFrame() 128 | timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D') 129 | df['timestamp'] = timestamps 130 | df['value'] = np.sin(np.linspace(1, 10, 20)) 131 | df.to_csv(self.__input_path) 132 | self.assertRaisesRegexp(Exception, 'The "batchSize" parameter should be at least 12 or 0 that indicates to run all data in a batch', 133 | invoker.invoke, 134 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column, 135 | 5, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path) 136 | 137 | def test_timestamp_column_missing(self): 138 | df = pd.DataFrame() 139 | timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D') 140 | df['time'] = timestamps 141 | df['value'] = np.sin(np.linspace(1, 10, 20)) 142 | df.to_csv(self.__input_path) 143 | self.assertRaisesRegexp(Exception, 'Column with name or index "timestamp" not found.', 144 | invoker.invoke, 145 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column, 146 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, 147 | self.__output_path) 148 | 149 | def test_value_column_missing(self): 150 | df = pd.DataFrame() 151 | timestamps = pd.date_range(start='2020-01-01', periods=20, freq='1D') 152 | df['timestamp'] = timestamps 153 | df['missed'] = np.sin(np.linspace(1, 10, 20)) 154 | df.to_csv(self.__input_path) 155 | self.assertRaisesRegexp(Exception, 'Column with name or index "value" not found.', 156 | invoker.invoke, 157 | self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column, 158 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, 159 | self.__output_path) 160 | 161 | 162 | if __name__ == '__main__': 163 | unittest.main() 164 | -------------------------------------------------------------------------------- /aml_component/tests/test_functionality.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../') 3 | 4 | import unittest 5 | import numpy as np 6 | import pandas as pd 7 | import shutil 8 | import os 9 | import invoker 10 | 11 | 12 | class TestFunctionality(unittest.TestCase): 13 | def setUp(self): 14 | self.__input_path = './functional_test_input_folder' 15 | self.__input_csv_file = './functional_test_input_file.csv' 16 | self.__input_parquet_file = './functional_test_input_file.parquet' 17 | self.__detect_mode = 'AnomalyOnly' 18 | self.__timestamp_column = 'timestamp' 19 | self.__value_column = 'value' 20 | self.__batch_size = 2000 21 | self.__threshold = 0.3 22 | self.__sensitivity = 99 23 | self.__append_mode = True 24 | self.__output_path = './functional_test_output_directory' 25 | 26 | def tearDown(self): 27 | self.deleteDataFrameDirectory() 28 | 29 | def deleteDataFrameDirectory(self): 30 | if os.path.exists(self.__input_path): 31 | shutil.rmtree(self.__input_path) 32 | 33 | if os.path.exists(self.__input_csv_file): 34 | os.remove(self.__input_csv_file) 35 | 36 | if os.path.exists(self.__input_parquet_file): 37 | os.remove(self.__input_parquet_file) 38 | 39 | if os.path.exists(self.__output_path): 40 | shutil.rmtree(self.__output_path) 41 | 42 | def generate_input_data_frame(self, start_date: str = '2020-01-01'): 43 | df = pd.DataFrame() 44 | df['timestamp'] = pd.date_range(start=start_date, 
periods=200, freq='1D') 45 | df['value'] = np.sin(np.linspace(1, 20, 200)) 46 | return df 47 | 48 | def generate_input_folder(self, file_type: str = 'csv'): 49 | if not os.path.isdir(self.__input_path): 50 | os.mkdir(self.__input_path) 51 | start_dates = ['2018-01-01', '2019-01-01', '2020-01-01'] 52 | for start_date in start_dates: 53 | df = self.generate_input_data_frame(start_date) 54 | if file_type == 'csv': 55 | df.to_csv(f"{self.__input_path}/{start_date}.csv", index=False) 56 | elif file_type == 'parquet': 57 | df.to_parquet(f"{self.__input_path}/{start_date}.parquet", index=False) 58 | else: 59 | raise Exception(f'Unsupported input data type {file_type}, only csv and parquet file are allowed') 60 | 61 | def testAnomalyOnlyModeCsvFile(self): 62 | df = self.generate_input_data_frame() 63 | df.to_csv(self.__input_csv_file, index=False) 64 | invoker.invoke(self.__input_csv_file, self.__detect_mode, self.__timestamp_column, self.__value_column, 65 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path) 66 | result = pd.read_csv(f"{self.__output_path}/output.csv") 67 | self.assertEqual(result.shape[0], 200) 68 | self.assertTrue('value' in result.columns) 69 | self.assertTrue('isAnomaly' in result.columns) 70 | self.assertTrue('score' in result.columns) 71 | self.assertTrue('expectedValue' not in result.columns) 72 | self.assertTrue('upperBoundary' not in result.columns) 73 | self.assertTrue('lowerBoundary' not in result.columns) 74 | 75 | def testAnomalyOnlyModeCsvFolder(self): 76 | self.generate_input_folder() 77 | invoker.invoke(self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column, 78 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path) 79 | result = pd.read_csv(f"{self.__output_path}/output.csv") 80 | self.assertEqual(result.shape[0], 600) 81 | self.assertTrue('value' in result.columns) 82 | self.assertTrue('isAnomaly' in result.columns) 83 | self.assertTrue('score' in result.columns) 84 | self.assertTrue('expectedValue' not in result.columns) 85 | self.assertTrue('upperBoundary' not in result.columns) 86 | self.assertTrue('lowerBoundary' not in result.columns) 87 | 88 | def testAnomalyOnlyModeParquetFile(self): 89 | df = self.generate_input_data_frame() 90 | df.to_parquet(self.__input_parquet_file, index=False) 91 | invoker.invoke(self.__input_parquet_file, self.__detect_mode, self.__timestamp_column, self.__value_column, 92 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path) 93 | result = pd.read_csv(f"{self.__output_path}/output.csv") 94 | self.assertEqual(result.shape[0], 200) 95 | self.assertTrue('value' in result.columns) 96 | self.assertTrue('isAnomaly' in result.columns) 97 | self.assertTrue('score' in result.columns) 98 | self.assertTrue('expectedValue' not in result.columns) 99 | self.assertTrue('upperBoundary' not in result.columns) 100 | self.assertTrue('lowerBoundary' not in result.columns) 101 | 102 | def testAnomalyOnlyModeParquetFolder(self): 103 | self.generate_input_folder('parquet') 104 | invoker.invoke(self.__input_path, self.__detect_mode, self.__timestamp_column, self.__value_column, 105 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path) 106 | result = pd.read_csv(f"{self.__output_path}/output.csv") 107 | self.assertEqual(result.shape[0], 600) 108 | self.assertTrue('value' in result.columns) 109 | self.assertTrue('isAnomaly' in result.columns) 110 | 
self.assertTrue('score' in result.columns) 111 | self.assertTrue('expectedValue' not in result.columns) 112 | self.assertTrue('upperBoundary' not in result.columns) 113 | self.assertTrue('lowerBoundary' not in result.columns) 114 | 115 | def testAnomalyAndMarginCsvFile(self): 116 | df = self.generate_input_data_frame() 117 | df.to_csv(self.__input_csv_file, index=False) 118 | invoker.invoke(self.__input_csv_file, "AnomalyAndMargin", self.__timestamp_column, self.__value_column, 119 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path) 120 | result = pd.read_csv(f"{self.__output_path}/output.csv") 121 | self.assertEqual(result.shape[0], 200) 122 | self.assertTrue('value' in result.columns) 123 | self.assertTrue('isAnomaly' in result.columns) 124 | self.assertTrue('score' in result.columns) 125 | self.assertTrue('expectedValue' in result.columns) 126 | self.assertTrue('upperBoundary' in result.columns) 127 | self.assertTrue('lowerBoundary' in result.columns) 128 | 129 | def testAnomalyAndMarginCsvFolder(self): 130 | self.generate_input_folder() 131 | invoker.invoke(self.__input_path, "AnomalyAndMargin", self.__timestamp_column, self.__value_column, 132 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path) 133 | result = pd.read_csv(f"{self.__output_path}/output.csv") 134 | self.assertEqual(result.shape[0], 600) 135 | self.assertTrue('value' in result.columns) 136 | self.assertTrue('isAnomaly' in result.columns) 137 | self.assertTrue('score' in result.columns) 138 | self.assertTrue('expectedValue' in result.columns) 139 | self.assertTrue('upperBoundary' in result.columns) 140 | self.assertTrue('lowerBoundary' in result.columns) 141 | 142 | def testAnomalyAndMarginParquetFile(self): 143 | df = self.generate_input_data_frame() 144 | df.to_parquet(self.__input_parquet_file, index=False) 145 | invoker.invoke(self.__input_parquet_file, "AnomalyAndMargin", self.__timestamp_column, self.__value_column, 146 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path) 147 | result = pd.read_csv(f"{self.__output_path}/output.csv") 148 | self.assertEqual(result.shape[0], 200) 149 | self.assertTrue('value' in result.columns) 150 | self.assertTrue('isAnomaly' in result.columns) 151 | self.assertTrue('score' in result.columns) 152 | self.assertTrue('expectedValue' in result.columns) 153 | self.assertTrue('upperBoundary' in result.columns) 154 | self.assertTrue('lowerBoundary' in result.columns) 155 | 156 | def testAnomalyAndMarginParquetFolder(self): 157 | self.generate_input_folder('parquet') 158 | invoker.invoke(self.__input_path, "AnomalyAndMargin", self.__timestamp_column, self.__value_column, 159 | self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path) 160 | result = pd.read_csv(f"{self.__output_path}/output.csv") 161 | self.assertEqual(result.shape[0], 600) 162 | self.assertTrue('value' in result.columns) 163 | self.assertTrue('isAnomaly' in result.columns) 164 | self.assertTrue('score' in result.columns) 165 | self.assertTrue('expectedValue' in result.columns) 166 | self.assertTrue('upperBoundary' in result.columns) 167 | self.assertTrue('lowerBoundary' in result.columns) 168 | 169 | def testBatchModeCsvFile(self): 170 | df = self.generate_input_data_frame() 171 | df.to_csv(self.__input_csv_file, index=False) 172 | invoker.invoke(self.__input_csv_file, "AnomalyAndMargin", self.__timestamp_column, self.__value_column, 173 | 66, 
self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path) 174 | result = pd.read_csv(f"{self.__output_path}/output.csv") 175 | self.assertEqual(result.shape[0], 200) 176 | self.assertTrue('value' in result.columns) 177 | self.assertTrue('isAnomaly' in result.columns) 178 | self.assertTrue('score' in result.columns) 179 | self.assertTrue('expectedValue' in result.columns) 180 | self.assertTrue('upperBoundary' in result.columns) 181 | self.assertTrue('lowerBoundary' in result.columns) 182 | 183 | def testBatchModeCsvFolder(self): 184 | self.generate_input_folder() 185 | invoker.invoke(self.__input_path, "AnomalyAndMargin", self.__timestamp_column, self.__value_column, 186 | 66, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path) 187 | result = pd.read_csv(f"{self.__output_path}/output.csv") 188 | self.assertEqual(result.shape[0], 600) 189 | self.assertTrue('value' in result.columns) 190 | self.assertTrue('isAnomaly' in result.columns) 191 | self.assertTrue('score' in result.columns) 192 | self.assertTrue('expectedValue' in result.columns) 193 | self.assertTrue('upperBoundary' in result.columns) 194 | self.assertTrue('lowerBoundary' in result.columns) 195 | 196 | def testBatchModeParquetFile(self): 197 | df = self.generate_input_data_frame() 198 | df.to_parquet(self.__input_parquet_file, index=False) 199 | invoker.invoke(self.__input_parquet_file, "AnomalyAndMargin", self.__timestamp_column, self.__value_column, 200 | 66, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path) 201 | result = pd.read_csv(f"{self.__output_path}/output.csv") 202 | self.assertEqual(result.shape[0], 200) 203 | self.assertTrue('value' in result.columns) 204 | self.assertTrue('isAnomaly' in result.columns) 205 | self.assertTrue('score' in result.columns) 206 | self.assertTrue('expectedValue' in result.columns) 207 | self.assertTrue('upperBoundary' in result.columns) 208 | self.assertTrue('lowerBoundary' in result.columns) 209 | 210 | def testBatchModeParquetFolder(self): 211 | self.generate_input_folder('parquet') 212 | invoker.invoke(self.__input_path, "AnomalyAndMargin", self.__timestamp_column, self.__value_column, 213 | 66, self.__threshold, self.__sensitivity, self.__append_mode, self.__output_path) 214 | result = pd.read_csv(f"{self.__output_path}/output.csv") 215 | self.assertEqual(result.shape[0], 600) 216 | self.assertTrue('value' in result.columns) 217 | self.assertTrue('isAnomaly' in result.columns) 218 | self.assertTrue('score' in result.columns) 219 | self.assertTrue('expectedValue' in result.columns) 220 | self.assertTrue('upperBoundary' in result.columns) 221 | self.assertTrue('lowerBoundary' in result.columns) 222 | 223 | if __name__ == '__main__': 224 | unittest.main() 225 | -------------------------------------------------------------------------------- /aml_component/validation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/anomalydetector/a3260ea0ddfb868986b924a245e003a97143f9df/aml_component/validation.py -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from msanomalydetector import SpectralResidual 2 | from msanomalydetector import THRESHOLD, MAG_WINDOW, SCORE_WINDOW, DetectMode 3 | import os 4 | import pandas as pd 5 | 6 | 7 | def detect_anomaly(series, threshold, mag_window, score_window, sensitivity, detect_mode): 8 | 
detector = SpectralResidual(series=series, threshold=threshold, mag_window=mag_window, score_window=score_window, 9 | sensitivity=sensitivity, detect_mode=detect_mode) 10 | print(detector.detect()) 11 | 12 | 13 | if __name__ == '__main__': 14 | sample_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "samples")) 15 | for sample_file in os.listdir(sample_dir): 16 | sample = pd.read_csv(os.path.join(sample_dir, sample_file)) 17 | detect_anomaly(sample, THRESHOLD, MAG_WINDOW, SCORE_WINDOW, 99, DetectMode.anomaly_only) 18 | -------------------------------------------------------------------------------- /msanomalydetector/__init__.py: -------------------------------------------------------------------------------- 1 | from msanomalydetector.spectral_residual import SpectralResidual 2 | from msanomalydetector.util import MAX_RATIO, THRESHOLD, MAG_WINDOW, SCORE_WINDOW, DetectMode 3 | 4 | __all__ = ['SpectralResidual', 'MAX_RATIO', 'THRESHOLD', 'MAG_WINDOW', 'SCORE_WINDOW', 'DetectMode'] 5 | -------------------------------------------------------------------------------- /msanomalydetector/_anomaly_kernel_cython.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | cimport numpy as np 3 | import array 4 | import bisect 5 | 6 | 7 | cpdef float sorted_median(float[:] data, int i, int j): 8 | cdef int n = j - i 9 | cdef int mid 10 | if n == 0: 11 | raise Exception("no median for empty data") 12 | if n % 2 == 1: 13 | return data[i + n // 2] 14 | else: 15 | mid = i + n // 2 16 | return (data[mid - 1] + data[mid])/2 17 | 18 | cpdef median_filter(np.ndarray data, int window, bint need_two_end=False): 19 | cdef int w_len = window // 2 * 2 + 1 20 | cdef int t_len = len(data) 21 | cdef float[:] val = array.array('f', [x for x in data]) 22 | cdef float[:] ans = array.array('f', [x for x in data]) 23 | cdef float[:] cur_windows = array.array('f', [0 for x in range(w_len)]) 24 | cdef int delete_id 25 | cdef int add_id 26 | cdef int index 27 | if t_len < w_len: 28 | return ans 29 | for i in range(0, w_len): 30 | index = i 31 | add_id = bisect.bisect_right(cur_windows[:i], val[i]) 32 | while index > add_id: 33 | cur_windows[index] = cur_windows[index - 1] 34 | index -= 1 35 | cur_windows[add_id] = data[i] 36 | if i >= w_len // 2 and need_two_end: 37 | ans[i - w_len // 2] = sorted_median(cur_windows, 0, i + 1) 38 | ans[window // 2] = sorted_median(cur_windows, 0, w_len) 39 | for i in range(window // 2 + 1, t_len - window // 2): 40 | delete_id = bisect.bisect_right(cur_windows, val[i - window // 2 - 1]) - 1 41 | index = delete_id 42 | while index < w_len - 1: 43 | cur_windows[index] = cur_windows[index + 1] 44 | index += 1 45 | 46 | add_id = bisect.bisect_right(cur_windows[:w_len - 1], val[i + window // 2]) 47 | index = w_len - 1 48 | while index > add_id: 49 | cur_windows[index] = cur_windows[index - 1] 50 | index -= 1 51 | cur_windows[add_id] = data[i + window // 2] 52 | 53 | ans[i] = sorted_median(cur_windows, 0, w_len) 54 | 55 | if need_two_end: 56 | for i in range(t_len - window // 2, t_len): 57 | delete_id = bisect.bisect_right(cur_windows[: w_len], data[i - window // 2 - 1]) - 1 58 | index = delete_id 59 | while index < w_len - 1: 60 | cur_windows[index] = cur_windows[index + 1] 61 | index += 1 62 | w_len -= 1 63 | ans[i] = sorted_median(cur_windows[: w_len], 0, w_len) 64 | 65 | return ans 66 | -------------------------------------------------------------------------------- /msanomalydetector/boundary_utils.py: 
-------------------------------------------------------------------------------- 1 | import bisect 2 | import numpy as np 3 | from msanomalydetector._anomaly_kernel_cython import median_filter 4 | 5 | 6 | # pseudo - code to generate the factors. 7 | # factors = [1] 8 | # for i in range(50): 9 | # if i < 40: 10 | # factors.append(factors[-1] / (1.15 + 0.001 * i)) 11 | # else: 12 | # factors.append(factors[-1] / (1.25 + 0.01 * i)) 13 | # for i in range(50): 14 | # factors.insert(0, factors[0] * (1.25 + 0.001 * i)) 15 | 16 | factors = [ 17 | 184331.62871148242, 141902.71648305038, 109324.12672037778, 84289.9974713784, 65038.57829581667, 50222.84038287002, 18 | 38812.08684920403, 30017.081863266845, 23233.035497884553, 17996.15452973242, 13950.50738738947, 10822.736530170265, 19 | 8402.745753237783, 6528.939979205737, 5076.93622022219, 3950.92312857758, 3077.042935029268, 2398.318733460069, 20 | 1870.7634426365591, 1460.393007522685, 1140.9320371270976, 892.0500681212648, 698.0047481387048, 546.5972968979678, 21 | 428.36778753759233, 335.97473532360186, 263.71643275007995, 207.16137686573444, 162.8627176617409, 128.13746472206208, 22 | 100.8956415134347, 79.50799173635517, 62.70346351447568, 49.48971074544253, 39.09139869308257, 30.90229145698227, 23 | 24.448015393182175, 19.35709849024717, 15.338429865489042, 12.163703303322, 9.653732780414286, 7.667778221139226, 24 | 6.095213212352326, 4.8490160798347866, 3.8606815922251485, 3.076240312529999, 2.4531421949999994, 1.9578149999999996, 25 | 1.5637499999999998, 1.25, 1.0, 0.8695652173913044, 0.7554867223208555, 0.655804446459076, 0.5687809596349316, 26 | 0.4928777813127657, 0.4267340097946024, 0.36914706729636887, 0.3190553736355825, 0.27552277516026125, 0.23772456873189068, 27 | 0.20493497304473338, 0.17651591132190647, 0.1519069804835684, 0.13061649224726435, 0.11221348131208278, 0.09632058481723846, 28 | 0.08260770567516164, 0.0707863801843716, 0.06060477755511267, 0.051843265658779024, 0.0443104834690419, 0.03783986632710667, 29 | 0.03228657536442549, 0.027524787181948417, 0.02344530424356765, 0.019953450420057577, 0.01696721974494692, 0.014415649740821513, 30 | 0.012237393667929978, 0.010379468759906684, 0.008796159966022614, 0.0074480609365136455, 0.006301235986898177, 31 | 0.00532648857725966, 0.004498723460523362, 0.0037963911059268884, 0.0032010043051660104, 0.002696718032995797, 32 | 0.0022699646742388863, 0.0019091376570554135, 0.0011570531254881296, 0.000697019955113331, 0.00041737721863073713, 33 | 0.000248438820613534, 0.00014700521929794912, 8.647365841055832e-05, 5.056939088336744e-05, 2.9400808653120604e-05, 34 | 1.6994687082728674e-05, 9.767061541798089e-06 35 | ] 36 | 37 | 38 | def calculate_boundary_unit_last(data): 39 | if len(data) == 0: 40 | return 0 41 | 42 | calculation_size = len(data) - 1 43 | window = int(min(calculation_size // 3, 512)) 44 | trends = np.abs(np.asarray(median_filter(data[:calculation_size], window, need_two_end=True), dtype=float)) 45 | 46 | unit = max(np.mean(trends), 1.0) 47 | 48 | if not np.isfinite(unit): 49 | raise Exception('Not finite unit value') 50 | 51 | return unit 52 | 53 | 54 | def calculate_boundary_unit_entire(data, is_anomaly): 55 | if len(data) == 0: 56 | return [] 57 | 58 | window = int(min(len(data)//3, 512)) 59 | trend_fraction = 0.5 60 | trends = np.abs(np.asarray(median_filter(data, window, need_two_end=True), dtype=float)) 61 | valid_trend = [t for a, t in zip(is_anomaly, trends) if not a] 62 | 63 | if len(valid_trend) > 0: 64 | average_part = np.mean(valid_trend) 65 | units = 
trend_fraction * trends + average_part * (1 - trend_fraction) 66 | else: 67 | units = trends 68 | 69 | if not np.all(np.isfinite(units)): 70 | raise Exception('Not finite unit values') 71 | 72 | units = np.clip(units, 1.0, max(np.max(units), 1.0)) 73 | 74 | return units 75 | 76 | 77 | def calculate_margin(unit, sensitivity): 78 | 79 | def calculate_margin_core(unit, sensitivity): 80 | lb = int(sensitivity) 81 | # if lb == sensitivity: 82 | # return unit * factors[lb] 83 | 84 | return (factors[lb + 1] + (factors[lb] - factors[lb + 1]) * (1 - sensitivity + lb)) * unit 85 | 86 | if 0 > sensitivity or sensitivity > 100: 87 | raise Exception('sensitivity should be integer in [0, 100]') 88 | 89 | if unit <= 0: 90 | raise Exception('unit should be a positive number') 91 | 92 | if sensitivity == 100: 93 | return 0.0 94 | 95 | return calculate_margin_core(unit, sensitivity) 96 | 97 | 98 | def calculate_anomaly_score(value, expected_value, unit, is_anomaly): 99 | if not is_anomaly: 100 | return 0.0 101 | 102 | distance = np.abs(expected_value - value) 103 | margins = [calculate_margin(unit, i) for i in range(101)][::-1] 104 | lb = bisect.bisect_left(margins, distance) 105 | 106 | if lb == 0: 107 | return 0 108 | elif lb >= 100: 109 | return 1.0 110 | else: 111 | a, b = margins[lb-1], margins[lb] 112 | score = lb - 1 + (distance - a) / (b - a) 113 | 114 | return score / 100.0 115 | 116 | 117 | def calculate_anomaly_scores(values, expected_values, units, is_anomaly): 118 | scores = [calculate_anomaly_score(value, exp, unit, anomaly) 119 | for value, exp, unit, anomaly in zip(values, expected_values, units, is_anomaly)] 120 | return scores 121 | -------------------------------------------------------------------------------- /msanomalydetector/spectral_residual.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) Microsoft Corporation. All rights reserved.​ 3 | ​ 4 | Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual, 5 | royalty-free right to use, copy, and modify the software code provided by us 6 | ("Software Code"). You may not sublicense the Software Code or any use of it 7 | (except to your affiliates and to vendors to perform work on your behalf) 8 | through distribution, network access, service agreement, lease, rental, or 9 | otherwise. This license does not purport to express any claim of ownership over 10 | data you may have shared with Microsoft in the creation of the Software Code. 11 | Unless applicable law gives you more rights, Microsoft reserves all other 12 | rights not expressly granted herein, whether by implication, estoppel or 13 | otherwise. ​ 14 | ​ 15 | THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS 16 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 19 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 21 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 22 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 | ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE 24 | POSSIBILITY OF SUCH DAMAGE. 
25 | """ 26 | 27 | import pandas as pd 28 | import numpy as np 29 | 30 | from msanomalydetector.util import * 31 | import msanomalydetector.boundary_utils as boundary_helper 32 | from msanomalydetector._anomaly_kernel_cython import median_filter 33 | 34 | 35 | class SpectralResidual: 36 | def __init__(self, series, threshold, mag_window, score_window, sensitivity, detect_mode, batch_size): 37 | self.__series__ = series 38 | self.__values__ = self.__series__['value'].tolist() 39 | self.__threshold__ = threshold 40 | self.__mag_window = mag_window 41 | self.__score_window = score_window 42 | self.__sensitivity = sensitivity 43 | self.__detect_mode = detect_mode 44 | self.__anomaly_frame = None 45 | self.__batch_size = batch_size 46 | if self.__batch_size <= 0: 47 | self.__batch_size = len(series) 48 | 49 | self.__batch_size = max(12, self.__batch_size) 50 | self.__batch_size = min(len(series), self.__batch_size) 51 | 52 | def detect(self): 53 | if self.__anomaly_frame is None: 54 | self.__anomaly_frame = self.__detect() 55 | 56 | return self.__anomaly_frame 57 | 58 | def __detect(self): 59 | anomaly_frames = [] 60 | for i in range(0, len(self.__series__), self.__batch_size): 61 | start = i 62 | end = i + self.__batch_size 63 | end = min(end, len(self.__series__)) 64 | if end - start >= 12: 65 | anomaly_frames.append(self.__detect_core(self.__series__[start:end])) 66 | else: 67 | ext_start = max(0, end - self.__batch_size) 68 | ext_frame = self.__detect_core(self.__series__[ext_start:end]) 69 | anomaly_frames.append(ext_frame[start-ext_start:]) 70 | 71 | return pd.concat(anomaly_frames, axis=0, ignore_index=True) 72 | 73 | def __detect_core(self, series): 74 | values = series['value'].values 75 | extended_series = SpectralResidual.extend_series(values) 76 | mags = self.spectral_residual_transform(extended_series) 77 | anomaly_scores = self.generate_spectral_score(mags) 78 | anomaly_frame = pd.DataFrame({Timestamp: series['timestamp'].values, 79 | Value: values, 80 | Mag: mags[:len(values)], 81 | AnomalyScore: anomaly_scores[:len(values)]}) 82 | anomaly_frame[IsAnomaly] = np.where(anomaly_frame[AnomalyScore] > self.__threshold__, True, False) 83 | 84 | if self.__detect_mode == DetectMode.anomaly_and_margin: 85 | anomaly_index = anomaly_frame[anomaly_frame[IsAnomaly]].index.tolist() 86 | anomaly_frame[ExpectedValue] = self.calculate_expected_value(values, anomaly_index) 87 | boundary_units = boundary_helper.calculate_boundary_unit_entire(values, 88 | anomaly_frame[IsAnomaly].values) 89 | anomaly_frame[AnomalyScore] = boundary_helper.calculate_anomaly_scores( 90 | values=values, 91 | expected_values=anomaly_frame[ExpectedValue].values, 92 | units=boundary_units, 93 | is_anomaly=anomaly_frame[IsAnomaly].values 94 | ) 95 | 96 | margins = [boundary_helper.calculate_margin(u, self.__sensitivity) for u in boundary_units] 97 | anomaly_frame['unit'] = boundary_units 98 | 99 | anomaly_frame[LowerBoundary] = anomaly_frame[ExpectedValue].values - margins 100 | anomaly_frame[UpperBoundary] = anomaly_frame[ExpectedValue].values + margins 101 | isLowerAnomaly = np.logical_and(anomaly_frame[IsAnomaly].values, 102 | anomaly_frame[LowerBoundary].values > values) 103 | isUpperAnomaly = np.logical_and(anomaly_frame[IsAnomaly].values, 104 | values > anomaly_frame[UpperBoundary].values) 105 | anomaly_frame[IsAnomaly] = np.logical_or(isLowerAnomaly, isUpperAnomaly) 106 | 107 | return anomaly_frame 108 | 109 | def generate_spectral_score(self, mags): 110 | ave_mag = average_filter(mags, n=self.__score_window) 111 | 
safeDivisors = np.clip(ave_mag, EPS, ave_mag.max())
112 | 
113 |         raw_scores = np.abs(mags - ave_mag) / safeDivisors
114 |         scores = np.clip(raw_scores / 10.0, 0, 1.0)
115 | 
116 |         return scores
117 | 
118 |     def spectral_residual_transform(self, values):
119 |         """
120 |         This method transforms a time series into a spectral residual series
121 |         :param values: list.
122 |             a list of float values.
123 |         :return: mag: list.
124 |             a list of float values as the spectral residual values
125 |         """
126 | 
127 |         trans = np.fft.fft(values)
128 |         mag = np.sqrt(trans.real ** 2 + trans.imag ** 2)
129 |         eps_index = np.where(mag <= EPS)[0]
130 |         mag[eps_index] = EPS
131 | 
132 |         mag_log = np.log(mag)
133 |         mag_log[eps_index] = 0
134 | 
135 |         spectral = np.exp(mag_log - average_filter(mag_log, n=self.__mag_window))
136 | 
137 |         trans.real = trans.real * spectral / mag
138 |         trans.imag = trans.imag * spectral / mag
139 |         trans.real[eps_index] = 0
140 |         trans.imag[eps_index] = 0
141 | 
142 |         wave_r = np.fft.ifft(trans)
143 |         mag = np.sqrt(wave_r.real ** 2 + wave_r.imag ** 2)
144 |         return mag
145 | 
146 |     @staticmethod
147 |     def predict_next(values):
148 |         """
149 |         Predicts the next value by summing up the slopes between the last value and each previous value.
150 |         Mathematically, g = 1/m * sum_{i=1}^{m} g(x_n, x_{n-i}), x_{n+1} = x_{n-m+1} + g * m,
151 |         where g(x_i,x_j) = (x_i - x_j) / (i - j)
152 |         :param values: list.
153 |             a list of float numbers.
154 |         :return: float.
155 |             the predicted next value.
156 |         """
157 | 
158 |         if len(values) <= 1:
159 |             raise ValueError('data should contain at least 2 numbers')
160 | 
161 |         v_last = values[-1]
162 |         n = len(values)
163 | 
164 |         slopes = [(v_last - v) / (n - 1 - i) for i, v in enumerate(values[:-1])]
165 | 
166 |         return values[1] + sum(slopes)
167 | 
168 |     @staticmethod
169 |     def extend_series(values, extend_num=5, look_ahead=5):
170 |         """
171 |         Extend the array data by appending the predicted next value.
172 |         :param values: list.
173 |             a list of float numbers.
174 |         :param extend_num: int, default 5.
175 |             number of values added to the back of data.
176 |         :param look_ahead: int, default 5.
177 |             number of previous values used in prediction.
178 |         :return: list.
179 |             The result array.
180 |         """
181 | 
182 |         if look_ahead < 1:
183 |             raise ValueError('look_ahead must be at least 1')
184 | 
185 |         extension = [SpectralResidual.predict_next(values[-look_ahead - 2:-1])] * extend_num
186 |         return np.concatenate((values, extension), axis=0)
187 | 
188 |     @staticmethod
189 |     def calculate_expected_value(values, anomaly_index):
190 |         values = deanomaly_entire(values, anomaly_index)
191 |         length = len(values)
192 |         fft_coef = np.fft.fft(values)
193 |         fft_coef.real = [v if length * 3 / 8 >= i or i >= length * 5 / 8 else 0 for i, v in enumerate(fft_coef.real)]
194 |         fft_coef.imag = [v if length * 3 / 8 >= i or i >= length * 5 / 8 else 0 for i, v in enumerate(fft_coef.imag)]
195 |         exps = np.fft.ifft(fft_coef)
196 |         return exps.real
197 | 
--------------------------------------------------------------------------------
/msanomalydetector/util.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) Microsoft Corporation. All rights reserved.​
3 | ​
4 | Microsoft Corporation (“Microsoft”) grants you a nonexclusive, perpetual,
5 | royalty-free right to use, copy, and modify the software code provided by us
6 | ("Software Code").
You may not sublicense the Software Code or any use of it
 7 | (except to your affiliates and to vendors to perform work on your behalf)
 8 | through distribution, network access, service agreement, lease, rental, or
 9 | otherwise. This license does not purport to express any claim of ownership over
10 | data you may have shared with Microsoft in the creation of the Software Code.
11 | Unless applicable law gives you more rights, Microsoft reserves all other
12 | rights not expressly granted herein, whether by implication, estoppel or
13 | otherwise. ​
14 | ​
15 | THE SOFTWARE CODE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
21 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
22 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 | ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
24 | POSSIBILITY OF SUCH DAMAGE.
25 | """
26 | from enum import Enum
27 | import numpy as np
28 | 
29 | IsAnomaly = "isAnomaly"
30 | AnomalyId = "id"
31 | AnomalyScore = "score"
32 | Value = "value"
33 | Timestamp = "timestamp"
34 | Mag = "mag"
35 | ExpectedValue = "expectedValue"
36 | UpperBoundary = "upperBoundary"
37 | LowerBoundary = "lowerBoundary"
38 | 
39 | MAX_RATIO = 0.25
40 | EPS = 1e-8
41 | THRESHOLD = 0.3
42 | MAG_WINDOW = 3
43 | SCORE_WINDOW = 40
44 | 
45 | 
46 | class DetectMode(Enum):
47 |     anomaly_only = 'AnomalyOnly'
48 |     anomaly_and_margin = 'AnomalyAndMargin'
49 | 
50 | 
51 | def average_filter(values, n=3):
52 |     """
53 |     Calculate the sliding window average for the given time series.
54 |     Mathematically, res[i] = sum_{j=i-t+1}^{i} values[j] / t, where t = min(n, i+1)
55 |     :param values: list.
56 |         a list of float numbers
57 |     :param n: int, default 3.
58 |         window size.
59 |     :return res: list.
60 |         a list of values after the average_filter process.
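    Example (illustrative): average_filter([1, 2, 3, 4, 5], n=3) returns
        [1.0, 1.5, 2.0, 3.0, 4.0]; the first two entries average over the
        shorter prefixes of length 1 and 2.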
61 | """ 62 | 63 | if n >= len(values): 64 | n = len(values) 65 | 66 | res = np.cumsum(values, dtype=float) 67 | res[n:] = res[n:] - res[:-n] 68 | res[n:] = res[n:] / n 69 | 70 | for i in range(1, n): 71 | res[i] /= (i + 1) 72 | 73 | return res 74 | 75 | 76 | def leastsq(x, y): 77 | n = len(x) 78 | sum_x = np.sum(x) 79 | sum_y = np.sum(y) 80 | sum_xx = np.sum(np.multiply(x, x)) 81 | sum_xy = np.sum(np.multiply(x, y)) 82 | a = (n * sum_xy - sum_x * sum_y) / (n * sum_xx - sum_x * sum_x) 83 | b = (sum_xx * sum_y - sum_x * sum_xy) / (n * sum_xx - sum_x * sum_x) 84 | return a, b 85 | 86 | 87 | def deanomaly_entire(values, entire_anomalies): 88 | deanomaly_data = np.copy(values) 89 | min_points_to_fit = 4 90 | for idx in entire_anomalies: 91 | step = 1 92 | start = max(idx - step, 0) 93 | end = min(len(values) - 1, idx + step) 94 | fit_values = [(i, values[i]) for i in range(start, end+1) if i not in entire_anomalies] 95 | while len(fit_values) < min_points_to_fit and (start > 0 or end < len(values)-1): 96 | step = step + 2 97 | start = max(idx - step, 0) 98 | end = min(len(values) - 1, idx + step) 99 | fit_values = [(i, values[i]) for i in range(start, end+1) if i not in entire_anomalies] 100 | 101 | if len(fit_values) > 1: 102 | x, y = tuple(zip(*fit_values)) 103 | a, b = leastsq(x, y) 104 | deanomaly_data[idx] = a * idx + b 105 | 106 | return deanomaly_data 107 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython>=0.29.2 2 | numpy==1.18.1 3 | pandas==0.25.3 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages, Extension 2 | from Cython.Build import cythonize 3 | from Cython.Distutils import build_ext 4 | import numpy as np 5 | 6 | __version__ = "can't find version.py" 7 | exec(compile(open('version.py').read(), 8 | 'version.py', 'exec')) 9 | 10 | extensions = [ 11 | Extension("msanomalydetector._anomaly_kernel_cython", ["msanomalydetector/_anomaly_kernel_cython.pyx"], 12 | define_macros=[('CYTHON_TRACE', '1')]) 13 | ] 14 | 15 | cmdclass = {'build_ext': build_ext} 16 | 17 | install_requires = [ 18 | 'Cython>=0.29.2', 19 | 'numpy==1.18.1', 20 | 'pandas==0.25.3' 21 | ] 22 | 23 | setup( 24 | name="msanomalydetector", 25 | description='Microsoft Anomaly Detector Package Based On Saliency Detection', 26 | packages=find_packages(), 27 | include_dirs=[np.get_include()], 28 | cmdclass=cmdclass, 29 | ext_modules=cythonize(extensions), 30 | version=__version__, 31 | setup_requires=['Cython>=0.29.2', 'numpy==1.18.1'], 32 | install_requires=install_requires, 33 | requires=['Cython', 'numpy', 'pandas'], 34 | python_requires='>=3.6.0', 35 | package_data={'': ['*.txt']} 36 | ) 37 | -------------------------------------------------------------------------------- /srcnn/competition_metric.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is referenced from https://github.com/iopsai/iops/blob/master/evaluation/evaluation.py 3 | """ 4 | 5 | import numpy as np 6 | from sklearn.metrics import f1_score, precision_score, recall_score 7 | 8 | 9 | def get_range_proba(predict, label, delay=7): 10 | predict = np.array(predict) 11 | label = np.array(label) 12 | 13 | splits = np.where(label[1:] != label[:-1])[0] + 1 14 | is_anomaly = label[0] == 1 15 | 
new_predict = np.array(predict) 16 | pos = 0 17 | 18 | for sp in splits: 19 | if is_anomaly: 20 | if 1 in predict[pos:min(pos + delay + 1, sp)]: 21 | new_predict[pos: sp] = 1 22 | else: 23 | new_predict[pos: sp] = 0 24 | is_anomaly = not is_anomaly 25 | pos = sp 26 | sp = len(label) 27 | 28 | if is_anomaly: 29 | if 1 in predict[pos: min(pos + delay + 1, sp)]: 30 | new_predict[pos: sp] = 1 31 | else: 32 | new_predict[pos: sp] = 0 33 | 34 | return new_predict 35 | 36 | 37 | def reconstruct_label(timestamp, label): 38 | timestamp = np.asarray(timestamp, np.int64) 39 | index = np.argsort(timestamp) 40 | 41 | timestamp_sorted = np.asarray(timestamp[index]) 42 | interval = np.min(np.diff(timestamp_sorted)) 43 | 44 | label = np.asarray(label, np.int64) 45 | label = np.asarray(label[index]) 46 | 47 | idx = (timestamp_sorted - timestamp_sorted[0]) // interval 48 | 49 | new_label = np.zeros(shape=((timestamp_sorted[-1] - timestamp_sorted[0]) // interval + 1,), dtype=np.int) 50 | new_label[idx] = label 51 | 52 | return new_label 53 | 54 | 55 | def reconstruct_series(timestamp, label, predict, delay=7): 56 | label = reconstruct_label(timestamp, label) 57 | predict = reconstruct_label(timestamp, predict) 58 | predict = get_range_proba(predict, label, delay) 59 | return label.tolist(), predict.tolist() 60 | 61 | 62 | def calc(pred, true): 63 | TP = 0 64 | FP = 0 65 | TN = 0 66 | FN = 0 67 | for pre, gt in zip(pred, true): 68 | if gt == 1: 69 | if pre == 1: 70 | TP += 1 71 | else: 72 | FN += 1 73 | if gt == 0: 74 | if pre == 1: 75 | FP += 1 76 | else: 77 | TN += 1 78 | return TP, FP, TN, FN 79 | 80 | 81 | def evaluate_for_all_series(lst_timestamp_label_predict, delay=7, prt=True): 82 | labels, predicts = [], [] 83 | for timestamp, label, predict, _ in lst_timestamp_label_predict: 84 | if timestamp == []: 85 | continue 86 | lbl, pdt = reconstruct_series(timestamp, label, predict, delay) 87 | labels += lbl 88 | predicts += pdt 89 | 90 | f1 = f1_score(labels, predicts) 91 | pre = precision_score(labels, predicts) 92 | rec = recall_score(labels, predicts) 93 | TP, FP, TN, FN = calc(predicts, labels) 94 | if prt: 95 | print('precision', pre) 96 | print('recall', rec) 97 | print('f1', f1) 98 | print('-------------------------------') 99 | return f1, pre, rec, TP, FP, TN, FN 100 | 101 | 102 | def bi_get_range_proba(predict, label, left, right): 103 | i = 1 104 | rs = predict[:] 105 | while i < len(label): 106 | if label[i] == 1 and label[i - 1] == 0: 107 | start = max(0, i - left) 108 | end = min(i + right + 1, len(label)) 109 | if 1 in predict[start: end]: 110 | j = i 111 | while j < len(label) and label[j] == 1: 112 | rs[j] = 1 113 | j += 1 114 | i = j 115 | rs[start: end] = label[start: end] 116 | else: 117 | j = i 118 | while j < len(label) and label[j] == 1: 119 | rs[j] = 0 120 | j += 1 121 | i = j 122 | i += 1 123 | return rs 124 | 125 | 126 | def bi_reconstruct_series(timestamp, label, predict, left, right): 127 | label = reconstruct_label(timestamp, label).tolist() 128 | predict = reconstruct_label(timestamp, predict).tolist() 129 | predict = bi_get_range_proba(predict, label, left, right) 130 | return label, predict 131 | 132 | 133 | def bi_evaluate_for_all_series(lst_timestamp_label_predict, left, right, prt=True): 134 | import json 135 | labels, predicts = [], [] 136 | save = [] 137 | for timestamp, label, predict in lst_timestamp_label_predict: 138 | if timestamp == []: 139 | continue 140 | try: 141 | lbl, pdt = bi_reconstruct_series(timestamp, label, predict, left, right) 142 | except: 143 | 
continue 144 | ifi = f1_score(lbl, pdt) 145 | save.append(ifi) 146 | labels += lbl 147 | predicts += pdt 148 | with open('eachscore.json', 'w+') as fout: 149 | json.dump(save, fout) 150 | f1 = f1_score(labels, predicts) 151 | pre = precision_score(labels, predicts) 152 | rec = recall_score(labels, predicts) 153 | if prt: 154 | print('precision', pre) 155 | print('recall', rec) 156 | print('f1', f1) 157 | print('-------------------------------') 158 | return f1, pre, rec 159 | 160 | 161 | def get_variance(f_score, all_fscore): 162 | va = 0.0 163 | for i in range(len(all_fscore)): 164 | va += 1.0 * (all_fscore[i] - f_score) * (all_fscore[i] - f_score) 165 | 166 | return va / len(all_fscore) 167 | -------------------------------------------------------------------------------- /srcnn/evalue.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) Microsoft Corporation. All rights reserved.​ 3 | ​ 4 | Microsoft Corporation ("Microsoft") grants you a nonexclusive, perpetual, 5 | royalty-free right to use, copy, and modify the software code provided by us 6 | ("Software Code"). You may not sublicense the Software Code or any use of it 7 | (except to your affiliates and to vendors to perform work on your behalf) 8 | through distribution, network access, service agreement, lease, rental, or 9 | otherwise. This license does not purport to express any claim of ownership over 10 | data you may have shared with Microsoft in the creation of the Software Code. 11 | Unless applicable law gives you more rights, Microsoft reserves all other 12 | rights not expressly granted herein, whether by implication, estoppel or 13 | otherwise. ​ 14 | ​ 15 | THE SOFTWARE CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 16 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 19 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 21 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 22 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 | ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE 24 | POSSIBILITY OF SUCH DAMAGE. 
25 | """ 26 | 27 | import os 28 | from srcnn.competition_metric import get_variance, evaluate_for_all_series 29 | import time 30 | import json 31 | import argparse 32 | from msanomalydetector.spectral_residual import SpectralResidual 33 | from srcnn.utils import * 34 | 35 | 36 | def auto(): 37 | path_auto = os.getcwd() + '/auto.json' 38 | with open(path_auto, 'r+') as f: 39 | store = json.load(f) 40 | window = store['window'] 41 | epoch = store['epoch'] 42 | return window, epoch 43 | 44 | 45 | def getfid(path): 46 | return path.split('/')[-1] 47 | 48 | 49 | def get_path(data_source): 50 | if data_source == 'kpi': 51 | dir_ = root + '/Test/' 52 | trainfiles = [dir_ + _ for _ in os.listdir(dir_)] 53 | files = trainfiles 54 | else: 55 | dir_ = root + '/' + data_source + '/' 56 | files = [dir_ + _ for _ in os.listdir(dir_)] 57 | return files 58 | 59 | 60 | def get_score(data_source, files, thres, option): 61 | total_time = 0 62 | results = [] 63 | savedscore = [] 64 | for f in files: 65 | print('reading', f) 66 | if data_source == 'kpi' or data_source == 'test_kpi': 67 | in_timestamp, in_value, in_label = read_csv_kpi(f) 68 | else: 69 | tmp_data = read_pkl(f) 70 | in_timestamp, in_value, in_label = tmp_data['timestamp'], tmp_data['value'], tmp_data['label'] 71 | length = len(in_timestamp) 72 | if model == 'sr_cnn' and len(in_value) < window: 73 | print("length is shorter than win_size", len(in_value), window) 74 | continue 75 | time_start = time.time() 76 | timestamp, label, pre, scores = models[model](in_timestamp, in_value, in_label, window, net, option, thres) 77 | time_end = time.time() 78 | total_time += time_end - time_start 79 | results.append([timestamp, label, pre, f]) 80 | savedscore.append([label, scores, f, timestamp]) 81 | return total_time, results, savedscore 82 | 83 | 84 | if __name__ == '__main__': 85 | parser = argparse.ArgumentParser(description='SRCNN') 86 | parser.add_argument('--data', type=str, required=True, help='location of the data file') 87 | parser.add_argument('--window', type=int, default=128, help='window size') 88 | parser.add_argument('--epoch', type=int, default=10) 89 | parser.add_argument('--model_path', type=str, default='snapshot', help='model path') 90 | parser.add_argument('--delay', type=int, default=3, help='delay') 91 | parser.add_argument('--thres', type=int, default=0.95, help='initial threshold of SR') 92 | parser.add_argument('--auto', type=bool, default=False, help='Automatic filling parameters') 93 | parser.add_argument('--model', type=str, default='sr_cnn', help='model') 94 | parser.add_argument('--missing_option', type=str, default='anomaly', 95 | help='missing data option, anomaly means treat missing data as anomaly') 96 | 97 | args = parser.parse_args() 98 | if args.auto: 99 | window, epoch = auto() 100 | else: 101 | window = args.window 102 | epoch = args.epoch 103 | data_source = args.data 104 | delay = args.delay 105 | model = args.model 106 | root = os.getcwd() 107 | print(data, window, epoch) 108 | models = { 109 | 'sr_cnn': sr_cnn_eval, 110 | } 111 | 112 | model_path = root + '/' + args.model_path + '/srcnn_retry' + str(epoch) + '_' + str(window) + '.bin' 113 | srcnn_model = Anomaly(window) 114 | net = load_model(srcnn_model, model_path).cuda() 115 | files = get_path(data_source) 116 | total_time, results, savedscore = get_score(data_source, files, args.thres, args.missing_option) 117 | print('\n***********************************************') 118 | print('data source:', data_source, ' model:', model) 119 | 
print('-------------------------------') 120 | total_fscore, pre, rec, TP, FP, TN, FN = evaluate_for_all_series(results, delay) 121 | with open(data_source + '_saved_scores.json', 'w') as f: 122 | json.dump(savedscore, f) 123 | print('time used for making predictions:', total_time, 'seconds') 124 | 125 | 126 | best = 0. 127 | bestthre = 0. 128 | print('delay :', delay) 129 | if data_source == 'yahoo': 130 | sru = {} 131 | rf = open(data_source + 'sr3.json', 'r') 132 | srres = json.load(rf) 133 | for (srtime, srl, srpre, srf) in srres: 134 | sru[getfid(srf)] = [srtime, srl, srpre] 135 | for i in range(98): 136 | newresults = [] 137 | threshold = 0.01 + i * 0.01 138 | for f, (srtt, srlt, srpret, srft), (flabel, cnnscores, cnnf, cnnt) in zip(files, srres, savedscore): 139 | fid = getfid(cnnf) 140 | srtime = sru[fid][0] 141 | srl = sru[fid][1] 142 | srpre = sru[fid][2] 143 | srtime = [(srtime[0] - 3600 * (64 - j)) for j in range(64)] + srtime 144 | srl = [0] * 64 + srl 145 | srpre = [0] * 64 + srpre 146 | print(len(srl), len(flabel), '!!') 147 | assert (len(srl) == len(flabel)) 148 | pre = [1 if item > threshold else 0 for item in cnnscores] 149 | newresults.append([srtime, srpre, pre, f]) 150 | total_fscore, pre, rec, TP, FP, TN, FN = evaluate_for_all_series(newresults, delay, prt=False) 151 | if total_fscore > best: 152 | best = total_fscore 153 | bestthre = threshold 154 | results = [] 155 | threshold = bestthre 156 | print('guided threshold :', threshold) 157 | for f, (flabel, cnnscores, _, ftimestamp) in zip(files, savedscore): 158 | pre = [1 if item > threshold else 0 for item in cnnscores] 159 | results.append([ftimestamp, flabel, pre, f]) 160 | print('score\n') 161 | total_fscore, pre, rec, TP, FP, TN, FN = evaluate_for_all_series(results, delay) 162 | print(total_fscore) 163 | best = 0. 164 | for i in range(98): 165 | newresults = [] 166 | threshold = 0.01 + i * 0.01 167 | for f, (flabel, cnnscores, _, ftimestamp) in zip(files, savedscore): 168 | pre = [1 if item > threshold else 0 for item in cnnscores] 169 | newresults.append([ftimestamp, flabel, pre, f]) 170 | total_fscore, pre, rec, TP, FP, TN, FN = evaluate_for_all_series(newresults, delay, prt=False) 171 | if total_fscore > best: 172 | best = total_fscore 173 | bestthre = threshold 174 | print('tem best', best, threshold) 175 | threshold = bestthre 176 | print('best overall threshold :', threshold, 'best score :', best) 177 | -------------------------------------------------------------------------------- /srcnn/generate_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) Microsoft Corporation. All rights reserved.​ 3 | ​ 4 | Microsoft Corporation ("Microsoft") grants you a nonexclusive, perpetual, 5 | royalty-free right to use, copy, and modify the software code provided by us 6 | ("Software Code"). You may not sublicense the Software Code or any use of it 7 | (except to your affiliates and to vendors to perform work on your behalf) 8 | through distribution, network access, service agreement, lease, rental, or 9 | otherwise. This license does not purport to express any claim of ownership over 10 | data you may have shared with Microsoft in the creation of the Software Code. 11 | Unless applicable law gives you more rights, Microsoft reserves all other 12 | rights not expressly granted herein, whether by implication, estoppel or 13 | otherwise. 
​ 14 | ​ 15 | THE SOFTWARE CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 16 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 19 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 21 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 22 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 | ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE 24 | POSSIBILITY OF SUCH DAMAGE. 25 | """ 26 | 27 | import argparse 28 | from srcnn.utils import * 29 | import os 30 | import time 31 | from msanomalydetector.util import average_filter 32 | 33 | 34 | class gen(): 35 | def __init__(self, win_siz, step, nums): 36 | self.control = 0 37 | self.win_siz = win_siz 38 | self.step = step 39 | self.number = nums 40 | 41 | def generate_train_data(self, value, back_k=0): 42 | def normalize(a): 43 | amin = np.min(a) 44 | amax = np.max(a) 45 | a = (a - amin) / (amax - amin + 1e-5) 46 | return 3 * a 47 | 48 | if back_k <= 5: 49 | back = back_k 50 | else: 51 | back = 5 52 | length = len(value) 53 | tmp = [] 54 | for pt in range(self.win_siz, length - back, self.step): 55 | head = max(0, pt - self.win_siz) 56 | tail = min(length - back, pt) 57 | data = np.array(value[head:tail]) 58 | data = data.astype(np.float64) 59 | data = normalize(data) 60 | num = np.random.randint(1, self.number) 61 | ids = np.random.choice(self.win_siz, num, replace=False) 62 | lbs = np.zeros(self.win_siz, dtype=np.int64) 63 | if (self.win_siz - 6) not in ids: 64 | self.control += np.random.random() 65 | else: 66 | self.control = 0 67 | if self.control > 100: 68 | ids[0] = self.win_siz - 6 69 | self.control = 0 70 | mean = np.mean(data) 71 | dataavg = average_filter(data) 72 | var = np.var(data) 73 | for id in ids: 74 | data[id] += (dataavg[id] + mean) * np.random.randn() * min((1 + var), 10) 75 | lbs[id] = 1 76 | tmp.append([data.tolist(), lbs.tolist()]) 77 | return tmp 78 | 79 | 80 | def auto(dic): 81 | path_auto = os.getcwd() + '/auto.json' 82 | auto = {} 83 | for item, value in dic: 84 | if value != None: 85 | auto[item] = value 86 | with open(path_auto, 'w+') as f: 87 | json.dump(auto, f) 88 | 89 | 90 | def get_path(data): 91 | dir_ = os.getcwd() + '/' + data + '/' 92 | fadir = [_ for _ in os.listdir(dir_)] 93 | print(fadir, 'fadir') 94 | files = [] 95 | for eachdir in fadir: 96 | files += [dir_ + eachdir + '/' + _ for _ in os.listdir(dir_ + eachdir)] 97 | print(files, 'files') 98 | return files 99 | 100 | 101 | if __name__ == '__main__': 102 | parser = argparse.ArgumentParser(description='SRCNN') 103 | parser.add_argument('--data', type=str, required=True, help='location of the data file') 104 | parser.add_argument('--window', type=int, default=128, help='window size') 105 | parser.add_argument('--step', type=int, default=64, help='step') 106 | parser.add_argument('--seed', type=int, default=54321, help='random seed') 107 | parser.add_argument('--num', type=int, default=10, help='upper limit value for the number of anomaly points') 108 | args = parser.parse_args() 109 | np.random.seed(args.seed) 110 | auto(vars(args).items()) 111 | files = get_path(args.data) 112 | 113 | train_data_path = os.getcwd() + '/' + args.data + '_' + str(args.window) + 
'_train.json' 114 | total_time = 0 115 | results = [] 116 | print("generating train data") 117 | generator = gen(args.window, args.step, args.num) 118 | for f in files: 119 | print('reading', f) 120 | in_timestamp, in_value = read_csv(f) 121 | in_label = [] 122 | if len(in_value) < args.window: 123 | print("value's length < window size", len(in_value), args.window) 124 | continue 125 | time_start = time.time() 126 | train_data = generator.generate_train_data(in_value) 127 | time_end = time.time() 128 | total_time += time_end - time_start 129 | results += train_data 130 | print('file num:', len(files)) 131 | print('total fake data size:', len(results)) 132 | with open(train_data_path, 'w+') as f: 133 | print(train_data_path) 134 | json.dump(results, f) 135 | -------------------------------------------------------------------------------- /srcnn/net.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) Microsoft Corporation. All rights reserved.​ 3 | ​ 4 | Microsoft Corporation ("Microsoft") grants you a nonexclusive, perpetual, 5 | royalty-free right to use, copy, and modify the software code provided by us 6 | ("Software Code"). You may not sublicense the Software Code or any use of it 7 | (except to your affiliates and to vendors to perform work on your behalf) 8 | through distribution, network access, service agreement, lease, rental, or 9 | otherwise. This license does not purport to express any claim of ownership over 10 | data you may have shared with Microsoft in the creation of the Software Code. 11 | Unless applicable law gives you more rights, Microsoft reserves all other 12 | rights not expressly granted herein, whether by implication, estoppel or 13 | otherwise. ​ 14 | ​ 15 | THE SOFTWARE CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 16 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 19 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 21 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 22 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 | ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE 24 | POSSIBILITY OF SUCH DAMAGE. 
25 | """ 26 | import torch 27 | import torch.utils.data 28 | from torch import nn, optim 29 | from torch.nn import functional as F 30 | from torchvision import datasets, transforms 31 | from torchvision.utils import save_image 32 | 33 | configs = [()] 34 | 35 | 36 | def make_layers(Bn=True, input=256): 37 | global configs 38 | layers = [] 39 | layer = nn.Conv2d(input, input, kernel_size=1, stride=1, padding=0) 40 | layers.append(layer) 41 | if Bn: 42 | layers.append(nn.BatchNorm2d(input)) 43 | 44 | for k, s, c in configs: 45 | if c == -1: 46 | layer = nn.Conv2d(kernel_size=k, stride=s, padding=0) 47 | else: 48 | now = [] 49 | now.append(nn.Conv1d(input, c, kernel_size=k, stride=s, padding=0)) 50 | input = c 51 | if Bn: 52 | now.append(nn.BatchNorm2d(input)) 53 | now.append(nn.Relu(inplace=True)) 54 | layer = nn.Sequential(*now) 55 | layers.append(layer) 56 | return nn.Sequential(*layers), input 57 | 58 | 59 | class trynet(nn.Module): 60 | def __init__(self): 61 | super(trynet, self).__init__() 62 | self.layer1 = nn.Conv1d(1, 128, kernel_size=128, stride=0, padding=0) 63 | self.layer2 = nn.BatchNorm1d(128) 64 | 65 | self.feature = make_layers() 66 | 67 | 68 | class Anomaly(nn.Module): 69 | def __init__(self, window=1024): 70 | self.window = window 71 | super(Anomaly, self).__init__() 72 | self.layer1 = nn.Conv1d(window, window, kernel_size=1, stride=1, padding=0) 73 | self.layer2 = nn.Conv1d(window, 2 * window, kernel_size=1, stride=1, padding=0) 74 | self.fc1 = nn.Linear(2 * window, 4 * window) 75 | self.fc2 = nn.Linear(4 * window, window) 76 | self.relu = nn.ReLU(inplace=True) 77 | 78 | def forward(self, x): 79 | x = x.view(x.size(0), self.window, 1) 80 | x = self.layer1(x) 81 | x = self.relu(x) 82 | x = self.layer2(x) 83 | x = x.view(x.size(0), -1) 84 | x = self.relu(x) 85 | x = self.fc1(x) 86 | x = self.relu(x) 87 | x = self.fc2(x) 88 | return torch.sigmoid(x) 89 | 90 | 91 | def save_model(model, model_path): 92 | try: 93 | torch.save(model.state_dict(), model_path) 94 | except: 95 | torch.save(model, model_path) 96 | 97 | 98 | def load_model(model, path): 99 | print("loading %s" % path) 100 | with open(path, 'rb') as f: 101 | pretrained = torch.load(f, map_location=lambda storage, loc: storage) 102 | model_dict = model.state_dict() 103 | pretrained = {k: v for k, v in pretrained.items() if k in model_dict} 104 | model_dict.update(pretrained) 105 | model.load_state_dict(model_dict) 106 | return model 107 | -------------------------------------------------------------------------------- /srcnn/train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (C) Microsoft Corporation. All rights reserved.​ 3 | ​ 4 | Microsoft Corporation ("Microsoft") grants you a nonexclusive, perpetual, 5 | royalty-free right to use, copy, and modify the software code provided by us 6 | ("Software Code"). You may not sublicense the Software Code or any use of it 7 | (except to your affiliates and to vendors to perform work on your behalf) 8 | through distribution, network access, service agreement, lease, rental, or 9 | otherwise. This license does not purport to express any claim of ownership over 10 | data you may have shared with Microsoft in the creation of the Software Code. 11 | Unless applicable law gives you more rights, Microsoft reserves all other 12 | rights not expressly granted herein, whether by implication, estoppel or 13 | otherwise. 
​
14 | ​
15 | THE SOFTWARE CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
16 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
21 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
22 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 | ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE
24 | POSSIBILITY OF SUCH DAMAGE.
25 | """
26 | import argparse
27 | from srcnn.utils import *
28 | import numpy as np
29 | import os
30 | import time
31 | 
32 | 
33 | def auto(epoch):
34 |     path_auto = os.getcwd() + '/auto.json'
35 |     with open(path_auto, 'r+') as f:
36 |         store = json.load(f)
37 |     data = store['data']
38 |     window = store['window']
39 |     store['epoch'] = epoch
40 |     with open(path_auto, 'w+') as f:
41 |         json.dump(store, f)
42 |     return data, window
43 | 
44 | 
45 | if __name__ == '__main__':
46 |     parser = argparse.ArgumentParser(description='SRCNN')
47 |     parser.add_argument('--data', type=str, required=True, help='location of the data file')
48 |     parser.add_argument('--window', type=int, default=128, help='window size')
49 |     parser.add_argument('--lr', type=float, default=1e-6, help='learning rate')
50 |     parser.add_argument('--step', type=int, default=64, help='step')
51 | 
52 |     parser.add_argument('--seed', type=int, default=54321, help='random seed')
53 |     parser.add_argument('--load', type=str, default='', help='path of an existing model to load')
54 |     parser.add_argument('--save', type=str, default='snapshot', help='path to save the model')
55 |     parser.add_argument('--epoch', type=int, default=10)
56 |     parser.add_argument('--batch_size', type=int, default=256, help='batch size')
57 |     parser.add_argument('--num_workers', type=int, default=8, help='number of workers of pytorch')
58 |     parser.add_argument('--model', type=str, default='sr_cnn', help='model')
59 |     parser.add_argument('--auto', action='store_true', help='Automatic filling parameters')
60 | 
61 |     args = parser.parse_args()
62 |     if args.auto:
63 |         data, window = auto(args.epoch)
64 |     else:
65 |         data, window = args.data, args.window
66 |     torch.cuda.manual_seed(args.seed)
67 |     np.random.seed(args.seed)
68 |     models = {
69 |         'sr_cnn': sr_cnn,
70 |     }
71 |     model = args.model
72 |     root_path = os.getcwd()
73 |     train_data_path = root_path + '/' + data + '_' + str(window) + '_train.json'
74 |     model_path = root_path + '/' + args.save + '/'
75 |     if args.load:
76 |         load_path = root_path + '/' + args.load
77 |     else:
78 |         load_path = None
79 | 
80 |     total_time = 0
81 |     time_start = time.time()
82 |     models[model](train_data_path, model_path, window, args.lr, args.epoch, args.batch_size, args.num_workers,
83 |                   load_path=load_path)
84 |     time_end = time.time()
85 |     total_time += time_end - time_start
86 |     print('time used for training:', total_time, 'seconds')
87 | 
--------------------------------------------------------------------------------
/srcnn/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (C) Microsoft Corporation.
All rights reserved.​ 3 | ​ 4 | Microsoft Corporation ("Microsoft") grants you a nonexclusive, perpetual, 5 | royalty-free right to use, copy, and modify the software code provided by us 6 | ("Software Code"). You may not sublicense the Software Code or any use of it 7 | (except to your affiliates and to vendors to perform work on your behalf) 8 | through distribution, network access, service agreement, lease, rental, or 9 | otherwise. This license does not purport to express any claim of ownership over 10 | data you may have shared with Microsoft in the creation of the Software Code. 11 | Unless applicable law gives you more rights, Microsoft reserves all other 12 | rights not expressly granted herein, whether by implication, estoppel or 13 | otherwise. ​ 14 | ​ 15 | THE SOFTWARE CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 16 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 19 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 21 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 22 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 | ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE 24 | POSSIBILITY OF SUCH DAMAGE. 25 | """ 26 | 27 | import pickle 28 | import csv 29 | import numpy as np 30 | import torch.nn as nn 31 | import torch.utils.data as data 32 | from torch.autograd import Variable 33 | from tqdm import tqdm 34 | from torch.utils.data import Dataset 35 | from srcnn.net import * 36 | import json 37 | from msanomalydetector.util import average_filter 38 | from msanomalydetector.spectral_residual import SpectralResidual 39 | 40 | 41 | def read_pkl(path): 42 | with open(path, 'rb') as f: 43 | return pickle.load(f) 44 | 45 | 46 | def read_csv_kpi(path): 47 | tm = [] 48 | vl = [] 49 | lb = [] 50 | with open(path) as f: 51 | input = csv.reader(f, delimiter=',') 52 | cnt = 0 53 | for row in input: 54 | if cnt == 0: 55 | cnt += 1 56 | continue 57 | tm.append(int(row[0])) 58 | vl.append(float(row[1])) 59 | lb.append(int(row[2])) 60 | cnt += 1 61 | f.close() 62 | return tm, vl, lb 63 | 64 | 65 | def read_csv(path): 66 | tm = [] 67 | vl = [] 68 | with open(path, 'r+') as f: 69 | input = csv.reader(f, delimiter=',') 70 | cnt = 0 71 | for row in input: 72 | if cnt == 0: 73 | cnt += 1 74 | continue 75 | tm.append(cnt) 76 | vl.append(float(row[1])) 77 | f.close() 78 | return tm, vl 79 | 80 | 81 | def sr_cnn(data_path, model_path, win_size, lr, epochs, batch, num_worker, load_path=None): 82 | def adjust_lr(optimizer, epoch): 83 | base_lr = lr 84 | cur_lr = base_lr * (0.5 ** ((epoch + 10) // 10)) 85 | for param in optimizer.param_groups: 86 | param['lr'] = cur_lr 87 | 88 | def Var(x): 89 | return Variable(x.cuda()) 90 | 91 | def loss_function(x, lb): 92 | l2_reg = 0. 93 | l2_weight = 0. 
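        # Note: l2_weight is fixed at 0., so the L2 penalty accumulated below does
        # not actually contribute to the loss; the effective objective is a
        # weighted binary cross-entropy in which anomalous points (lb == 1) are
        # up-weighted by win_size // 100 via kpiweight.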
94 |         for W in net.parameters():
95 |             l2_reg = l2_reg + W.norm(2)
96 |         kpiweight = torch.ones(lb.shape)
97 |         kpiweight[lb == 1] = win_size // 100
98 |         kpiweight = kpiweight.cuda()
99 |         BCE = F.binary_cross_entropy(x, lb, weight=kpiweight, reduction='sum')
100 |         return l2_reg * l2_weight + BCE
101 | 
102 |     def calc(pred, true):
103 |         TP = 0
104 |         FP = 0
105 |         TN = 0
106 |         FN = 0
107 |         for pre, gt in zip(pred, true):
108 |             if gt == 1:
109 |                 if pre == 1:
110 |                     TP += 1
111 |                 else:
112 |                     FN += 1
113 |             if gt == 0:
114 |                 if pre == 1:
115 |                     FP += 1
116 |                 else:
117 |                     TN += 1
118 |         print('TP=%d FP=%d TN=%d FN=%d' % (TP, FP, TN, FN))
119 |         return TP, FP, TN, FN
120 | 
121 |     def train(epoch, net, gen_set):
122 |         train_loader = data.DataLoader(dataset=gen_set, shuffle=True, num_workers=num_worker, batch_size=batch,
123 |                                        pin_memory=True)
124 |         net.train()
125 |         train_loss = 0
126 |         totTP, totFP, totTN, totFN = 0, 0, 0, 0
127 |         threshold = 0.5
128 |         for batch_idx, (inputs, lb) in enumerate(tqdm(train_loader, desc="Iteration")):
129 |             optimizer.zero_grad()
130 |             inputs = inputs.float()
131 |             lb = lb.float()
132 |             valueseq = Var(inputs)
133 |             lb = Var(lb)
134 |             output = net(valueseq)
135 |             if epoch > 110:
136 |                 aa = output.detach().cpu().numpy().reshape(-1)
137 |                 res = np.zeros(aa.shape, np.int64)
138 |                 res[aa > threshold] = 1
139 |                 bb = lb.detach().cpu().numpy().reshape(-1)
140 |                 TP, FP, TN, FN = calc(res, bb)
141 |                 totTP += TP
142 |                 totFP += FP
143 |                 totTN += TN
144 |                 totFN += FN
145 |                 if batch_idx % 100 == 0:
146 |                     print('TP=%d FP=%d TN=%d FN=%d' % (TP, FP, TN, FN))
147 |             loss1 = loss_function(output, lb)
148 |             loss1.backward()
149 |             train_loss += loss1.item()
150 |             torch.nn.utils.clip_grad_norm_(net.parameters(), 5.0)  # clip before stepping so it takes effect
151 |             optimizer.step()
152 |             if batch_idx % 100 == 0:
153 |                 print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
154 |                     epoch, batch_idx * len(inputs), len(train_loader.dataset),
155 |                     100. * batch_idx / len(train_loader),
156 |                     loss1.item() / len(inputs)))
157 | 
158 |     model = Anomaly(win_size)
159 |     net = model.cuda()
160 |     gpu_num = torch.cuda.device_count()
161 |     net = torch.nn.DataParallel(net, list(range(gpu_num)))
162 |     print(net)
163 |     base_lr = lr
164 |     bp_parameters = filter(lambda p: p.requires_grad, net.parameters())
165 |     optimizer = optim.SGD(bp_parameters, lr=base_lr, momentum=0.9, weight_decay=0.0)
166 | 
167 |     if load_path is not None:
168 |         net = load_model(model, load_path)
169 |         print("model loaded")
170 | 
171 |     gen_data = gen_set(win_size, data_path)
172 |     for epoch in range(1, epochs + 1):
173 |         print('epoch :', epoch)
174 |         train(epoch, net, gen_data)
175 |         adjust_lr(optimizer, epoch)
176 |         if epoch % 5 == 0:
177 |             save_model(model, model_path + 'srcnn_retry' + str(epoch) + '_' + str(win_size) + '.bin')
178 |     return
179 | 
180 | 
181 | def fft(values):
182 |     wave = np.array(values)
183 |     trans = np.fft.fft(wave)
184 |     realnum = np.real(trans)
185 |     comnum = np.imag(trans)
186 |     mag = np.sqrt(realnum ** 2 + comnum ** 2)
187 |     mag += 1e-5
188 |     spectral = np.exp(np.log(mag) - average_filter(np.log(mag)))
189 |     trans.real = trans.real * spectral / mag
190 |     trans.imag = trans.imag * spectral / mag
191 |     wave = np.fft.ifft(trans)
192 |     mag = np.sqrt(wave.real ** 2 + wave.imag ** 2)
193 |     return mag
194 | 
195 | 
196 | def spectral_residual(values):
197 |     """
198 |     This method transforms a time series into a spectral residual series
199 |     :param values: list.
200 |         a list of float values.
201 |     :return: mag: list.
202 | a list of float values as the spectral residual values 203 | """ 204 | EPS = 1e-8 205 | trans = np.fft.fft(values) 206 | mag = np.sqrt(trans.real ** 2 + trans.imag ** 2) 207 | 208 | maglog = [np.log(item) if abs(item) > EPS else 0 for item in mag] 209 | 210 | spectral = np.exp(maglog - average_filter(maglog, n=3)) 211 | 212 | trans.real = [ireal * ispectral / imag if abs(imag) > EPS else 0 213 | for ireal, ispectral, imag in zip(trans.real, spectral, mag)] 214 | trans.imag = [iimag * ispectral / imag if abs(imag) > EPS else 0 215 | for iimag, ispectral, imag in zip(trans.imag, spectral, mag)] 216 | 217 | wave_r = np.fft.ifft(trans) 218 | mag = np.sqrt(wave_r.real ** 2 + wave_r.imag ** 2) 219 | 220 | return mag 221 | 222 | 223 | class gen_set(Dataset): 224 | def __init__(self, width, data_path): 225 | self.genlen = 0 226 | self.len = self.genlen 227 | self.width = width 228 | with open(data_path, 'r+') as fin: 229 | self.kpinegraw = json.load(fin) 230 | self.negrawlen = len(self.kpinegraw) 231 | print('length :', len(self.kpinegraw)) 232 | self.len += self.negrawlen 233 | self.kpineglen = 0 234 | self.control = 0. 235 | 236 | def __len__(self): 237 | return self.len 238 | 239 | def __getitem__(self, index): 240 | idx = index % self.negrawlen 241 | datas = self.kpinegraw[idx] 242 | datas = np.array(datas) 243 | data = datas[0, :].astype(np.float64) 244 | lbs = datas[1, :].astype(np.float64) 245 | wave = spectral_residual(data) 246 | waveavg = average_filter(wave) 247 | for i in range(self.width): 248 | if wave[i] < 0.001 and waveavg[i] < 0.001: 249 | lbs[i] = 0 250 | continue 251 | ratio = wave[i] / waveavg[i] 252 | if ratio < 1.0 and lbs[i] == 1: 253 | lbs[i] = 0 254 | if ratio > 5.0: 255 | lbs[i] = 1 256 | srscore = abs(wave - waveavg) / (waveavg + 0.01) 257 | sortid = np.argsort(srscore) 258 | for idx in sortid[-2:]: 259 | if srscore[idx] > 5: 260 | lbs[idx] = 1 261 | resdata = torch.from_numpy(100 * wave) 262 | reslb = torch.from_numpy(lbs) 263 | return resdata, reslb 264 | 265 | 266 | def sr_cnn_eval(timestamp, value, label, window, net, ms_optioin, threshold=0.95, back_k=0, backaddnum=5, step=1): 267 | def Var(x): 268 | return Variable(x.cuda()) 269 | 270 | def modelwork(x, net): 271 | with torch.no_grad(): 272 | x = torch.from_numpy(100 * x).float() 273 | x = torch.unsqueeze(x, 0) 274 | x = Var(x) 275 | output = net(x) 276 | aa = output.detach().cpu().numpy().reshape(-1) 277 | res = np.zeros(aa.shape, np.int64) 278 | res[aa > threshold] = 1 279 | return res, aa 280 | 281 | win_size = window 282 | length = len(timestamp) 283 | if back_k <= 5: 284 | back = back_k 285 | else: 286 | back = 5 287 | detres = [0] * (win_size - backaddnum) 288 | scores = [0] * (win_size - backaddnum) 289 | 290 | for pt in range(win_size - backaddnum + back + step, length - back, step): 291 | head = max(0, pt - (win_size - backaddnum)) 292 | tail = min(length, pt) 293 | wave = np.array(SpectralResidual.extend_series(value[head:tail + back])) 294 | mag = spectral_residual(wave) 295 | modeloutput, rawout = modelwork(mag, net) 296 | for ipt in range(pt - step - back, pt - back): 297 | detres.append(modeloutput[ipt - head]) 298 | scores.append(rawout[ipt - head].item()) 299 | detres += [0] * (length - len(detres)) 300 | scores += [0] * (length - len(scores)) 301 | 302 | if ms_optioin == 'anomaly': 303 | last = -1 304 | interval = min([timestamp[i] - timestamp[i - 1] for i in range(1, len(timestamp))]) 305 | for i in range(1, len(timestamp)): 306 | if timestamp[i] - timestamp[i - 1] > interval: 307 | if last 
>= 0 and i - last < 1000: 308 | detres[i] = 1 309 | scores[i] = 1 310 | if detres[i] == 1: 311 | last = i 312 | 313 | return timestamp[:].tolist(), label[:], detres[:], scores[:] 314 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/anomalydetector/a3260ea0ddfb868986b924a245e003a97143f9df/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_boundary_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from msanomalydetector import boundary_utils 4 | 5 | 6 | class TestBoundaryUnit(unittest.TestCase): 7 | def test_calculate_boundary_unit(self): 8 | data = [139809.0, 139706.0, 140562.0, 140534.0, 140568.0, 139934.0, 139392.0, 141714.0, 144167.0, 147127.0, 9 | 147450.0, 147991.0, 151621.0, 154912.0, 158443.0, 160899.0, 164170.0, 164339.0, 165780.0, 167373.0, 10 | 167654.0, 168863.0, 169472.0, 169830.0, 169632.0, 169028.0, 165843.0, 162517.0, 159335.0, 156503.0, 11 | 151731.0, 151612.0, 151911.0, 157120.0, 157027.0, 159949.0, 160263.0, 160073.0, 160001.0, 159721.0, 12 | 160138.0, 160292.0, 160280.0, 159822.0, 159482.0, 159384.0, 159038.0, 158901.0, 158899.0, 156036.0] 13 | 14 | is_anomaly = [False, False, False, False, False, False, False, False, False, False, False, False, False, False, 15 | False, False, False, False, False, False, False, False, False, False, False, False, False, False, 16 | False, False, True, True, True, False, False, False, False, False, False, False, False, False, False, 17 | False, False, False, False, False, False, False] 18 | 19 | expected_output = \ 20 | [148560.58510638, 148567.58510638, 148574.58510638, 148576.08510638, 148577.58510638, 148864.08510638, 21 | 149150.58510638, 149763.83510638, 150377.08510638, 151857.08510638, 152018.58510638, 152289.08510638, 22 | 154104.08510638, 155749.58510638, 157515.08510638, 158743.08510638, 160378.58510638, 160463.08510638, 23 | 161183.58510638, 161183.58510638, 161183.58510638, 161183.58510638, 161183.58510638, 161183.58510638, 24 | 161183.58510638, 161183.58510638, 161183.58510638, 159552.08510638, 158425.08510638, 158330.08510638, 25 | 158294.08510638, 158268.08510638, 158268.08510638, 158268.08510638, 158268.08510638, 158204.58510638, 26 | 158154.08510638, 158154.08510638, 158154.08510638, 158154.08510638, 158154.08510638, 158154.08510638, 27 | 158179.33510638, 158204.58510638, 158179.33510638, 158154.08510638, 158094.33510638, 158034.58510638, 28 | 158010.08510638, 157985.58510638] 29 | 30 | actual_output = boundary_utils.calculate_boundary_unit_entire(np.asarray(data, dtype=float), is_anomaly) 31 | for e, v in zip(expected_output, actual_output): 32 | self.assertAlmostEqual(e, v) 33 | 34 | expected_last_unit = 156748.27551020408 35 | actual_last_unit = boundary_utils.calculate_boundary_unit_last(np.asarray(data, dtype=float)) 36 | self.assertAlmostEqual(expected_last_unit, actual_last_unit) 37 | 38 | def test_calculate_boundary_unit_negative(self): 39 | data = [-21901.0, -31123.0, -33203.0, -33236.0, -54681.0, -112808.0, -5368.0, -40021.0, -35.0, -72593.0, 40 | -30880.0, -34597.0, -6210.0, -5508.0, -28892.0, -41091.0, -34916.0, -31941.0, -31084.0, -7379.0, 41 | -4883.0, -32563.0, -29919.0, -33599.0, -33019.0, -35218.0, -9520.0, -4454.0, -39660.0, -29634.0, 42 | -35751.0, -39912.0, -46940.0, -28969.0, 
-20196.0, -57031.0, -45264.0, -44059.0, -29180.0, -34642.0, 43 | -11041.0, -10455.0, -40181.0, -43345.0, -37045.0, -33232.0, -37800.0, -9240.0, -12108.0, -34654.0] 44 | 45 | is_anomaly = [False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, 46 | False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, 47 | False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, 48 | False, False, False, False, False] 49 | 50 | expected_output = [ 51 | 33250.48958333333, 33258.73958333333, 33250.48958333333, 33258.73958333333, 33250.48958333333, 52 | 32730.489583333332, 32210.489583333332, 32730.489583333332, 33250.48958333333, 33250.48958333333, 53 | 33250.48958333333, 32619.489583333332, 32190.989583333332, 32190.989583333332, 32088.989583333332, 54 | 32190.989583333332, 32190.989583333332, 32619.489583333332, 32190.989583333332, 32190.989583333332, 55 | 32190.989583333332, 32190.989583333332, 32619.489583333332, 32930.48958333333, 32930.48958333333, 56 | 32619.489583333332, 32190.989583333332, 32930.48958333333, 33158.48958333333, 33448.48958333333, 57 | 33448.48958333333, 33969.98958333333, 33969.98958333333, 33969.98958333333, 33969.98958333333, 58 | 34524.48958333333, 35171.48958333333, 34524.48958333333, 35171.48958333333, 35171.48958333333, 59 | 33969.98958333333, 33969.98958333333, 33972.98958333333, 33975.98958333333, 33972.98958333333, 60 | 33969.98958333333, 33617.48958333333, 33969.98958333333, 33620.48958333333, 33975.98958333333] 61 | 62 | actual_output = boundary_utils.calculate_boundary_unit_entire(np.asarray(data), is_anomaly) 63 | for e, v in zip(expected_output, actual_output): 64 | self.assertAlmostEqual(e, v) 65 | 66 | expected_last_unit = 33197.17346938775 67 | actual_last_unit = boundary_utils.calculate_boundary_unit_last(np.asarray(data)) 68 | self.assertAlmostEqual(expected_last_unit, actual_last_unit) 69 | 70 | def test_calculate_margin(self): 71 | self.assertAlmostEqual(boundary_utils.calculate_margin(10, 0), 1843316.2871148242) 72 | self.assertAlmostEqual(boundary_utils.calculate_margin(10, 5), 502228.4038287002) 73 | self.assertAlmostEqual(boundary_utils.calculate_margin(10, 25), 3359.7473532360186) 74 | self.assertAlmostEqual(boundary_utils.calculate_margin(10, 95), 0.0014700521929794912) 75 | self.assertAlmostEqual(boundary_utils.calculate_margin(10, 99), 0.00016994687082728675) 76 | self.assertAlmostEqual(boundary_utils.calculate_margin(10, 100), 0.0) 77 | self.assertAlmostEqual(boundary_utils.calculate_margin(345969.3476, 79.7333448252325), 3762.3800000299298) 78 | 79 | def test_calculate_anomaly_score(self): 80 | self.assertAlmostEqual(boundary_utils.calculate_anomaly_score(10, 15, 5, False), 0) 81 | self.assertAlmostEqual(boundary_utils.calculate_anomaly_score(10, 15, 5, True), 0.5) 82 | self.assertAlmostEqual(boundary_utils.calculate_anomaly_score(10+1e-5, 10, 1, True), 0.005884191895350754) 83 | self.assertAlmostEqual(boundary_utils.calculate_anomaly_score(10+1e-7, 10, 1, True), 5.884191859812512e-05) 84 | 85 | 86 | if __name__ == '__main__': 87 | unittest.main() 88 | -------------------------------------------------------------------------------- /tests/test_spectral_residual.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | import numpy as np 4 | from msanomalydetector import SpectralResidual, DetectMode 5 | 6 | 7 | class 
FunctionalityTest(unittest.TestCase):
 8 |     def test_anomaly_only_mode(self):
 9 |         frame = pd.DataFrame({'timestamp': pd.date_range('2020-01-01', periods=100, freq='1D'),
10 |                               'value': np.linspace(1, 100, 100)})
11 |         model = SpectralResidual(frame, threshold=0.3, mag_window=3, score_window=21, sensitivity=99,
12 |                                  detect_mode=DetectMode.anomaly_only, batch_size=0)
13 |         result = model.detect()
14 |         self.assertEqual(result.shape[0], frame.shape[0])
15 |         self.assertTrue('value' in result.columns)
16 |         self.assertTrue('isAnomaly' in result.columns)
17 |         self.assertTrue('score' in result.columns)
18 |         self.assertTrue('expectedValue' not in result.columns)
19 |         self.assertTrue('upperBoundary' not in result.columns)
20 |         self.assertTrue('lowerBoundary' not in result.columns)
21 | 
22 |     def test_anomaly_and_margin_mode(self):
23 |         frame = pd.DataFrame({'timestamp': pd.date_range('2020-01-01', periods=100, freq='1D'),
24 |                               'value': np.linspace(1, 100, 100)})
25 |         model = SpectralResidual(frame, threshold=0.3, mag_window=3, score_window=21, sensitivity=99,
26 |                                  detect_mode=DetectMode.anomaly_and_margin, batch_size=0)
27 |         result = model.detect()
28 |         self.assertEqual(result.shape[0], frame.shape[0])
29 |         self.assertTrue('value' in result.columns)
30 |         self.assertTrue('isAnomaly' in result.columns)
31 |         self.assertTrue('score' in result.columns)
32 |         self.assertTrue('expectedValue' in result.columns)
33 |         self.assertTrue('upperBoundary' in result.columns)
34 |         self.assertTrue('lowerBoundary' in result.columns)
35 | 
36 |     def test_batch_mode(self):
37 |         frame = pd.DataFrame({'timestamp': pd.date_range('2020-01-01', periods=100, freq='1D'),
38 |                               'value': np.linspace(1, 100, 100)})
39 |         model = SpectralResidual(frame, threshold=0.3, mag_window=3, score_window=21, sensitivity=99,
40 |                                  detect_mode=DetectMode.anomaly_and_margin, batch_size=33)
41 |         result = model.detect()
42 |         self.assertEqual(result.shape[0], frame.shape[0])
43 |         self.assertTrue('value' in result.columns)
44 |         self.assertTrue('isAnomaly' in result.columns)
45 |         self.assertTrue('score' in result.columns)
46 |         self.assertTrue('expectedValue' in result.columns)
47 |         self.assertTrue('upperBoundary' in result.columns)
48 |         self.assertTrue('lowerBoundary' in result.columns)
49 | 
50 | 
51 | if __name__ == '__main__':
52 |     unittest.main()
53 | 
--------------------------------------------------------------------------------
/version.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Version string and parsed tuple. Keeps it all in one place.
4 | 
5 | """
6 | __version__ = '1.1'
7 | VERSION = tuple(int(x) for x in __version__.split('.'))
8 | 
--------------------------------------------------------------------------------
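
A minimal end-to-end sketch of the API exercised in tests/test_spectral_residual.py,
assuming the Cython extension has been built first (for example with
`python setup.py build_ext --inplace`); the synthetic sine series and the injected
spike at index 70 are illustrative only:

    import numpy as np
    import pandas as pd
    from msanomalydetector import SpectralResidual, DetectMode, THRESHOLD, MAG_WINDOW, SCORE_WINDOW

    values = np.sin(np.linspace(0, 20, 200))
    values[70] += 10  # inject one obvious anomaly
    frame = pd.DataFrame({'timestamp': pd.date_range('2020-01-01', periods=200, freq='1H'),
                          'value': values})
    model = SpectralResidual(frame, threshold=THRESHOLD, mag_window=MAG_WINDOW,
                             score_window=SCORE_WINDOW, sensitivity=99,
                             detect_mode=DetectMode.anomaly_and_margin, batch_size=0)
    print(model.detect()[['timestamp', 'value', 'score', 'isAnomaly']].head())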