├── .copyright.tmpl ├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── badges │ └── README.md └── workflows │ ├── docs.yml │ ├── publish.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── AUTHORS.md ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── SECURITY.md ├── benchmark_anomaly.py ├── benchmark_forecast.py ├── conf ├── benchmark_anomaly.json └── benchmark_forecast.json ├── data ├── example.csv ├── iops_competition │ └── phase2.zip ├── multivariate │ ├── energy_power │ │ └── est_hourly.csv.gz │ ├── seattle_trail │ │ └── burke-gilman-trail-north-of-ne-70th-st-bike-and-ped-counter.csv │ └── solar_plant │ │ └── merged.zip ├── smap │ └── SMAP.tar.gz ├── synthetic_anomaly │ ├── horizontal.csv │ ├── horizontal_dip_anomaly.csv │ ├── horizontal_level_anomaly.csv │ ├── horizontal_shock_anomaly.csv │ ├── horizontal_spike_anomaly.csv │ ├── horizontal_trend_anomaly.csv │ ├── seasonal.csv │ ├── seasonal_dip_anomaly.csv │ ├── seasonal_level_anomaly.csv │ ├── seasonal_shock_anomaly.csv │ ├── seasonal_spike_anomaly.csv │ ├── seasonal_trend_anomaly.csv │ ├── upward_downward.csv │ ├── upward_downward_dip_anomaly.csv │ ├── upward_downward_level_anomaly.csv │ ├── upward_downward_shock_anomaly.csv │ ├── upward_downward_spike_anomaly.csv │ └── upward_downward_trend_anomaly.csv ├── test_transform.pkl └── walmart │ ├── walmart_mini.csv │ └── walmart_mini_error.csv ├── docker ├── Dockerfile ├── dashboard │ └── Dockerfile └── spark-on-k8s │ └── Dockerfile ├── docs ├── README.md ├── build_docs.sh ├── process_old_docs.py ├── requirements.txt └── source │ ├── _static │ └── figures │ ├── _templates │ ├── autosummary │ │ └── module.rst │ └── versions.html │ ├── architecture.rst │ ├── conf.py │ ├── index.rst │ ├── merlion.dashboard.rst │ ├── merlion.evaluate.rst │ ├── merlion.models.anomaly.change_point.rst │ ├── merlion.models.anomaly.forecast_based.rst │ ├── merlion.models.anomaly.rst │ ├── merlion.models.automl.rst │ ├── merlion.models.ensemble.rst │ ├── merlion.models.forecast.rst │ ├── merlion.models.rst │ ├── merlion.models.utils.rst │ ├── merlion.plot.rst │ ├── merlion.post_process.rst │ ├── merlion.rst │ ├── merlion.spark.rst │ ├── merlion.transform.rst │ ├── merlion.utils.rst │ ├── ts_datasets.anomaly.rst │ ├── ts_datasets.forecast.rst │ ├── ts_datasets.rst │ ├── tutorials │ └── tutorials.rst ├── examples ├── CustomDataset.ipynb ├── README.md ├── TimeSeries.ipynb ├── advanced │ ├── 1_AutoSARIMA_forecasting_tutorial.ipynb │ └── 2_ForecastInvertPOC.ipynb ├── anomaly │ ├── 0_AnomalyIntro.ipynb │ ├── 1_AnomalyFeatures.ipynb │ ├── 2_AnomalyMultivariate.ipynb │ └── 3_AnomalyNewModel.ipynb ├── forecast │ ├── 0_ForecastIntro.ipynb │ ├── 1_ForecastFeatures.ipynb │ ├── 2_ForecastMultivariate.ipynb │ ├── 3_ForecastExogenous.ipynb │ └── 4_ForecastNewModel.ipynb └── misc │ └── generate_synthetic_tsad_dataset.py ├── figures ├── anom_example.png ├── dashboard_anomaly.png ├── dashboard_file.png ├── dashboard_forecast.png └── forecast_example.png ├── k8s-spec ├── anomaly.yml └── forecast.yml ├── merlion ├── dashboard │ ├── __init__.py │ ├── __main__.py │ ├── assets │ │ ├── Acumin-BdPro.otf │ │ ├── base.css │ │ ├── fonts │ │ │ ├── SalesforceSans-Bold.woff │ │ │ ├── SalesforceSans-BoldItalic.woff │ │ │ ├── SalesforceSans-Italic.woff │ │ │ ├── SalesforceSans-Light.woff │ │ │ ├── SalesforceSans-LightItalic.woff │ │ │ ├── SalesforceSans-Regular.woff │ │ │ ├── SalesforceSans-Thin.woff │ │ │ └── 
SalesforceSans-ThinItalic.woff │ │ ├── merlion.css │ │ ├── merlion_small.svg │ │ ├── modal.css │ │ ├── resizing.js │ │ ├── styles.css │ │ └── upload.svg │ ├── callbacks │ │ ├── __init__.py │ │ ├── anomaly.py │ │ ├── data.py │ │ └── forecast.py │ ├── models │ │ ├── __init__.py │ │ ├── anomaly.py │ │ ├── data.py │ │ ├── forecast.py │ │ └── utils.py │ ├── pages │ │ ├── __init__.py │ │ ├── anomaly.py │ │ ├── data.py │ │ ├── forecast.py │ │ └── utils.py │ ├── server.py │ ├── settings.py │ └── utils │ │ ├── __init__.py │ │ ├── file_manager.py │ │ ├── layout.py │ │ ├── log.py │ │ └── plot.py ├── evaluate │ ├── anomaly.py │ ├── base.py │ └── forecast.py ├── models │ ├── anomaly │ │ ├── __init__.py │ │ ├── autoencoder.py │ │ ├── base.py │ │ ├── change_point │ │ │ ├── __init__.py │ │ │ └── bocpd.py │ │ ├── dagmm.py │ │ ├── dbl.py │ │ ├── deep_point_anomaly_detector.py │ │ ├── forecast_based │ │ │ ├── __init__.py │ │ │ ├── arima.py │ │ │ ├── base.py │ │ │ ├── ets.py │ │ │ ├── mses.py │ │ │ ├── prophet.py │ │ │ └── sarima.py │ │ ├── isolation_forest.py │ │ ├── lof.py │ │ ├── lstm_ed.py │ │ ├── random_cut_forest.py │ │ ├── spectral_residual.py │ │ ├── stat_threshold.py │ │ ├── vae.py │ │ ├── windstats.py │ │ ├── windstats_monthly.py │ │ ├── windstats_run.py │ │ └── zms.py │ ├── automl │ │ ├── __init__.py │ │ ├── autoets.py │ │ ├── autoprophet.py │ │ ├── autosarima.py │ │ ├── base.py │ │ ├── search.py │ │ └── seasonality.py │ ├── base.py │ ├── deep_base.py │ ├── defaults.py │ ├── ensemble │ │ ├── __init__.py │ │ ├── anomaly.py │ │ ├── base.py │ │ ├── combine.py │ │ └── forecast.py │ ├── factory.py │ ├── forecast │ │ ├── __init__.py │ │ ├── arima.py │ │ ├── autoformer.py │ │ ├── base.py │ │ ├── deep_ar.py │ │ ├── deep_base.py │ │ ├── ets.py │ │ ├── etsformer.py │ │ ├── informer.py │ │ ├── prophet.py │ │ ├── sarima.py │ │ ├── sklearn_base.py │ │ ├── smoother.py │ │ ├── transformer.py │ │ ├── trees.py │ │ └── vector_ar.py │ ├── layers.py │ └── utils │ │ ├── __init__.py │ │ ├── autosarima_utils.py │ │ ├── early_stopping.py │ │ ├── nn_modules │ │ ├── __init__.py │ │ ├── blocks.py │ │ ├── embed.py │ │ ├── enc_dec_autoformer.py │ │ ├── enc_dec_etsformer.py │ │ ├── enc_dec_transformer.py │ │ └── layers.py │ │ ├── rolling_window_dataset.py │ │ └── time_features.py ├── plot.py ├── post_process │ ├── base.py │ ├── calibrate.py │ ├── factory.py │ ├── sequence.py │ └── threshold.py ├── resources │ ├── gson-2.8.9.jar │ ├── randomcutforest-core-1.0.jar │ └── randomcutforest-serialization-json-1.0.jar ├── spark │ ├── dataset.py │ └── pandas_udf.py ├── transform │ ├── anomalize.py │ ├── base.py │ ├── bound.py │ ├── factory.py │ ├── moving_average.py │ ├── normalize.py │ ├── resample.py │ └── sequence.py └── utils │ ├── __init__.py │ ├── conj_priors.py │ ├── data_io.py │ ├── hts.py │ ├── istat.py │ ├── misc.py │ ├── resample.py │ ├── time_series.py │ └── ts_generator.py ├── merlion_logo.svg ├── pytest.ini ├── setup.py ├── spark_apps ├── anomaly.py └── forecast.py ├── tests ├── anomaly │ ├── __init__.py │ ├── forecast_based │ │ ├── __init__.py │ │ ├── test_arima.py │ │ ├── test_mses.py │ │ ├── test_prophet.py │ │ └── test_sarima.py │ ├── multivariate │ │ ├── test_autoencoder.py │ │ ├── test_dagmm.py │ │ ├── test_lstmed.py │ │ └── test_vae.py │ ├── test_anom_ensemble.py │ ├── test_dbl.py │ ├── test_default.py │ ├── test_dpad.py │ ├── test_isolation_forest.py │ ├── test_lof.py │ ├── test_random_cut_forest.py │ ├── test_spectral_residual.py │ ├── test_stat_threshold.py │ ├── test_windstats.py │ └── test_zms.py ├── 
change_point │ ├── __init__.py │ ├── test_bocpd.py │ └── test_conj_prior.py ├── evaluate │ ├── __init__.py │ ├── test_eval_anomaly.py │ └── test_eval_forecast.py ├── forecast │ ├── __init__.py │ ├── test_autoets.py │ ├── test_autosarima.py │ ├── test_baggingtrees.py │ ├── test_boostingtrees.py │ ├── test_deep_model.py │ ├── test_default.py │ ├── test_ets.py │ ├── test_exog.py │ ├── test_forecast_ensemble.py │ ├── test_istat.py │ ├── test_prophet.py │ ├── test_smoother.py │ └── test_vector_ar.py ├── spark │ ├── __init__.py │ ├── conftest.py │ ├── test_anomaly.py │ └── test_forecast.py ├── test_custom_dataset.py ├── test_generator.py ├── test_hts.py ├── test_plot.py └── transform │ ├── __init__.py │ ├── test_anomalize.py │ ├── test_inverse.py │ ├── test_moving_average.py │ ├── test_resample.py │ └── test_sequence.py └── ts_datasets ├── README.md ├── setup.py └── ts_datasets ├── __init__.py ├── anomaly ├── __init__.py ├── base.py ├── custom.py ├── iops_competition.py ├── msl.py ├── nab.py ├── smap.py ├── smd.py ├── synthetic.py └── ucr.py ├── base.py └── forecast ├── __init__.py ├── custom.py ├── energy_power.py ├── m4.py ├── seattle_trail.py └── solar_plant.py /.copyright.tmpl: -------------------------------------------------------------------------------- 1 | Copyright (c) ${years} ${owner} 2 | All rights reserved. 3 | SPDX-License-Identifier: BSD-3-Clause 4 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # package 2 | __pycache__ 3 | *.egg-info 4 | docs 5 | tmp 6 | ts_datasets 7 | # pytest 8 | .pytest_cache 9 | .coverage* 10 | htmlcov 11 | # IDE/system 12 | .idea 13 | *.swp 14 | .DS_Store 15 | sandbox 16 | .vscode 17 | Icon? 18 | # build files 19 | docs/build/* 20 | .ipynb_checkpoints 21 | venv/ -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 21 | 22 | **Desktop (please complete the following information):** 23 | - OS: [e.g. Ubuntu 16.04 LTS] 24 | - Merlion Version [e.g. 1.0.0] 25 | 26 | **Additional context** 27 | Add any other context about the problem here. 28 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEATURE REQUEST]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 
15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/badges/README.md: -------------------------------------------------------------------------------- 1 | Branch for automatically uploading status badges. -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | release: 9 | types: [ published ] 10 | 11 | jobs: 12 | docs: 13 | 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v3 18 | with: 19 | fetch-depth: 0 20 | - name: Set up Python 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: '3.10' 24 | - name: Install dependencies 25 | run: | 26 | sudo apt-get update -y 27 | sudo apt-get install -y openjdk-11-jre-headless pandoc --fix-missing 28 | python -m pip install --upgrade pip setuptools wheel 29 | - name: Build Sphinx docs 30 | run: | 31 | docs/build_docs.sh 32 | timeout-minutes: 10 33 | - name: Deploy to gh-pages 34 | uses: peaceiris/actions-gh-pages@v3 35 | if: ${{ github.ref == 'refs/heads/main' || github.event_name == 'release' }} 36 | with: 37 | github_token: ${{ secrets.GITHUB_TOKEN }} 38 | publish_dir: docs/build/html 39 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to pip 2 | 3 | on: 4 | release: 5 | types: [ published ] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: '3.10' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip setuptools build 19 | - name: Build package 20 | run: | 21 | python -m build 22 | - name: Publish package 23 | uses: pypa/gh-action-pypi-publish@release/v1 24 | with: 25 | user: __token__ 26 | password: ${{ secrets.PYPI_API_TOKEN }} 27 | verbose: true 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # package 2 | __pycache__ 3 | *.egg-info 4 | tmp 5 | # pytest 6 | .pytest_cache 7 | .coverage* 8 | htmlcov 9 | # IDE/system 10 | .idea 11 | *.swp 12 | .DS_Store 13 | sandbox 14 | .vscode 15 | Icon?
16 | # build files 17 | docs/build/* 18 | .ipynb_checkpoints 19 | venv/ -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: '22.10.0' 4 | hooks: 5 | - id: black 6 | args: ["--line-length", "120"] 7 | - repo: https://github.com/johann-petrak/licenseheaders.git 8 | rev: 'v0.8.8' 9 | hooks: 10 | - id: licenseheaders 11 | args: ["-t", ".copyright.tmpl", "-cy", "-o", "salesforce.com, inc.", 12 | "-E", ".py", "-x", "docs/source/conf.py", "-f"] 13 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | Aadyot Bhatnagar 2 | Paul Kassianik 3 | Chenghao Liu 4 | Tian Lan 5 | Wenzhuo Yang 6 | Rowan Cassius 7 | Doyen Sahoo 8 | Devansh Arpit 9 | Sri Subramanian 10 | Gerald Woo 11 | Amrita Saha 12 | Arun Kumar Jagota 13 | Gokulakrishnan Gopalakrishnan 14 | Manpreet Singh 15 | K C Krithika 16 | Sukumar Maddineni 17 | Daeki Cho 18 | Bo Zong 19 | Yingbo Zhou 20 | Caiming Xiong 21 | Silvio Savarese 22 | Steven Hoi 23 | Huan Wang -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Comment line immediately above ownership line is reserved for related gus information. Please be careful while editing. 2 | #ECCN:Open Source 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021, Salesforce.com, Inc. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | 8 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 9 | 10 | * Neither the name of Salesforce.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 11 | 12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
13 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.md CODE_OF_CONDUCT.md LICENSE SECURITY.md 2 | global-exclude *.py[cod] 3 | exclude benchmark*.py 4 | recursive-exclude conf * 5 | recursive-exclude data * 6 | recursive-exclude docs * 7 | recursive-exclude examples * 8 | recursive-exclude figures * 9 | recursive-exclude tests * 10 | recursive-exclude ts_datasets * 11 | recursive-exclude venv * 12 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Security 2 | 3 | Please report any security issue to [security@salesforce.com](mailto:security@salesforce.com) 4 | as soon as it is discovered. This library limits its runtime dependencies in 5 | order to reduce the total cost of ownership as much as possible, but all consumers 6 | should remain vigilant and have their security stakeholders review all third-party 7 | products (3PP) like this one and their dependencies. -------------------------------------------------------------------------------- /conf/benchmark_forecast.json: -------------------------------------------------------------------------------- 1 | { 2 | "ARIMA": {"alias": "Arima"}, 3 | "Arima": { 4 | "config": { 5 | "default": { 6 | "order": [30, 0, 10] 7 | } 8 | } 9 | }, 10 | 11 | "SARIMA": {"alias": "Sarima"}, 12 | "Sarima": { 13 | "config": { 14 | "default": { 15 | "order": [15, 1, 5], 16 | "seasonal_order": [2, 0, 1, 30] 17 | } 18 | } 19 | }, 20 | 21 | "AutoSARIMA": {"alias": "AutoSarima"}, 22 | "AutoSarima": { 23 | "model_type": "SeasonalityLayer", 24 | "config": { 25 | "default": { 26 | "model": {"name": "AutoSarima"}, 27 | "periodicity_strategy": "min" 28 | } 29 | } 30 | }, 31 | 32 | "ETS": { 33 | "config": { 34 | "default": { 35 | "damped_trend": true 36 | } 37 | } 38 | }, 39 | 40 | "AutoETS": { 41 | "config": { 42 | "default": { 43 | "damped_trend": true 44 | } 45 | } 46 | }, 47 | 48 | "MSES": { 49 | "config": { 50 | "default": { 51 | "max_forecast_steps": 100 52 | } 53 | } 54 | }, 55 | 56 | "Prophet": { 57 | "config": { 58 | "default": { 59 | "uncertainty_samples": 0 60 | } 61 | } 62 | }, 63 | 64 | "AutoProphet": { 65 | "config": { 66 | "default": { 67 | "uncertainty_samples": 0 68 | } 69 | } 70 | }, 71 | 72 | "Var": {"alias": "VectorAR"}, 73 | "VAR": {"alias": "VectorAR"}, 74 | "VectorAR" : { 75 | "config": { 76 | "default": { 77 | "target_seq_index": 0, 78 | "maxlags": 168, 79 | "max_forecast_steps": 3 80 | } 81 | }, 82 | "dataset": {} 83 | }, 84 | "RandomForestForecaster" : { 85 | "config": { 86 | "default": { 87 | "target_seq_index": 0, 88 | "maxlags": 21, 89 | "max_forecast_steps": 3, 90 | "n_estimators": 100, 91 | "max_depth": 9, 92 | "prediction_stride": 1 93 | }, 94 | "dataset": {} 95 | } 96 | }, 97 | "ExtraTreesForecaster" : { 98 | "config": { 99 | "default": { 100 | "target_seq_index": 0, 101 | "maxlags": 21, 102 | "max_forecast_steps": 3, 103 | "n_estimators": 100, 104 | "max_depth": 9, 105 | "prediction_stride": 1 106 | }, 107 | "dataset": {} 108 | } 109 | }, 110 | "LGBMForecaster" : { 111 | "config": { 112 | "default": { 113 | "target_seq_index": 0, 114 | "maxlags": 21, 115 | "max_forecast_steps": 3, 116 | "learning_rate": 0.1, 117 | "n_estimators": 100, 118 | "max_depth": 7, 119 | "prediction_stride": 1 120 | }, 121 | "dataset": {} 122 | } 123 | } 124 | } 125 |
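A note on ``conf/benchmark_forecast.json`` above: entries such as `"ARIMA": {"alias": "Arima"}` let several spellings of a model name resolve to one canonical entry. The sketch below illustrates how such aliases could be resolved; it is a minimal example of our own, and the helper name `resolve_benchmark_config` is hypothetical rather than code taken from `benchmark_forecast.py`:

```python
import json

def resolve_benchmark_config(conf_path: str, model_name: str):
    """Follow "alias" entries until a concrete model entry is found."""
    with open(conf_path) as f:
        conf = json.load(f)
    seen = set()
    while "alias" in conf.get(model_name, {}):
        if model_name in seen:  # guard against circular aliases
            raise ValueError(f"alias cycle at {model_name!r}")
        seen.add(model_name)
        model_name = conf[model_name]["alias"]
    # return the canonical name plus its default config, if any
    return model_name, conf.get(model_name, {}).get("config", {}).get("default", {})

# e.g. "ARIMA" resolves to "Arima", whose default config is {"order": [30, 0, 10]}
print(resolve_benchmark_config("conf/benchmark_forecast.json", "ARIMA"))
```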
-------------------------------------------------------------------------------- /data/iops_competition/phase2.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/data/iops_competition/phase2.zip -------------------------------------------------------------------------------- /data/multivariate/energy_power/est_hourly.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/data/multivariate/energy_power/est_hourly.csv.gz -------------------------------------------------------------------------------- /data/multivariate/solar_plant/merged.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/data/multivariate/solar_plant/merged.zip -------------------------------------------------------------------------------- /data/smap/SMAP.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/data/smap/SMAP.tar.gz -------------------------------------------------------------------------------- /data/test_transform.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/data/test_transform.pkl -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | WORKDIR /opt/Merlion 3 | # Install Java 4 | RUN rm -rf /var/lib/apt/lists/* && \ 5 | apt-get clean && \ 6 | apt-get update && \ 7 | apt-get upgrade -y && \ 8 | apt-get install -y --no-install-recommends openjdk-11-jre-headless && \ 9 | rm -rf /var/lib/apt/lists/* 10 | # Install Merlion from source 11 | COPY *.md ./ 12 | COPY setup.py ./ 13 | COPY merlion merlion 14 | RUN pip install "./" 15 | -------------------------------------------------------------------------------- /docker/dashboard/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | WORKDIR /opt/Merlion 3 | # Install Java 4 | RUN rm -rf /var/lib/apt/lists/* && \ 5 | apt-get clean && \ 6 | apt-get update && \ 7 | apt-get upgrade -y && \ 8 | apt-get install -y --no-install-recommends openjdk-11-jre-headless && \ 9 | rm -rf /var/lib/apt/lists/* 10 | # Install Merlion from source & set up a gunicorn server 11 | COPY *.md ./ 12 | COPY setup.py ./ 13 | COPY merlion merlion 14 | RUN pip install gunicorn "./[dashboard]" 15 | CMD gunicorn -b 0.0.0.0:80 merlion.dashboard.server:server 16 | -------------------------------------------------------------------------------- /docker/spark-on-k8s/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG spark_uid=185 2 | FROM gcr.io/spark-operator/spark-py:v3.1.1 3 | 4 | # Change to root user for installation steps 5 | USER 0 6 | 7 | # Install pyarrow (for spark-sql) and Merlion; get pyspark & py4j from the PYTHONPATH 8 | ENV PYTHONPATH="${SPARK_HOME}/python/lib/pyspark.zip:${SPARK_HOME}/python/lib/py4j-0.10.9-src.zip:${PYTHONPATH}" 9 | COPY *.md ./ 10 | COPY setup.py ./ 11 | COPY merlion
merlion 12 | RUN pip install pyarrow "./" 13 | 14 | # Copy Merlion pyspark apps 15 | COPY spark_apps /opt/spark/apps 16 | COPY data/walmart/walmart_mini.csv . 17 | RUN chmod g+w /opt/spark/apps 18 | USER ${spark_uid} 19 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | To generate documentation using [Sphinx](https://www.sphinx-doc.org/en/master/index.html), just run the script 2 | [`build_docs.sh`](build_docs.sh). The ``build/html`` directory will be populated with searchable, 3 | indexed HTML documentation. 4 | 5 | Note that our documentation also depends on [Pandoc](https://pandoc.org/installing.html) to render Jupyter notebooks. 6 | For Ubuntu, call ``sudo apt-get install pandoc``. For Mac OS, install [Homebrew](https://brew.sh/) 7 | and call ``brew install pandoc``. 8 | -------------------------------------------------------------------------------- /docs/process_old_docs.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Script which removes redirects from the HTML API docs & updates the version matrix on old files. 9 | """ 10 | import os 11 | import re 12 | import shutil 13 | 14 | from bs4 import BeautifulSoup as bs 15 | from git import Repo 16 | 17 | 18 | def create_version_dl(soup, prefix, current_version, all_versions): 19 | dl = soup.new_tag("dl") 20 | dt = soup.new_tag("dt") 21 | dt.string = "Versions" 22 | dl.append(dt) 23 | for version in all_versions: 24 | # Create the href for this version & bold it if it's the current version 25 | href = soup.new_tag("a", href=f"{prefix}/{version}/index.html") 26 | href.string = version 27 | if version == current_version: 28 | strong = soup.new_tag("strong") 29 | strong.append(href) 30 | href = strong 31 | # Create a list item & add it to the dl 32 | dd = soup.new_tag("dd") 33 | dd.append(href) 34 | dl.append(dd) 35 | return dl 36 | 37 | 38 | def main(): 39 | # Get all the versions 40 | repo = Repo(search_parent_directories=True) 41 | versions = sorted([tag.name for tag in repo.tags if re.match("v[0-9].*", tag.name)], reverse=True) 42 | versions = ["latest", *versions] 43 | 44 | dirname = os.path.join(os.path.dirname(os.path.abspath(__file__)), "build", "html") 45 | for version in os.listdir(dirname): 46 | # If this isn't a directory containing a numbered version's API docs, delete it 47 | version_root = os.path.join(dirname, version) 48 | if version == "latest" or version not in versions: 49 | shutil.rmtree(version_root) if os.path.isdir(version_root) else os.remove(version_root) 50 | continue 51 | 52 | # Update version matrix in HTML source versioned files 53 | for subdir, _, files in os.walk(version_root): 54 | html_files = [os.path.join(subdir, f) for f in files if f.endswith(".html")] 55 | 56 | # Determine how far the version root is from the files in this directory 57 | prefix = ".." 58 | while subdir and subdir != version_root: 59 | subdir = os.path.dirname(subdir) 60 | prefix += "/.." 
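# Annotation (added for clarity, not in the original file): after this loop,
# `prefix` is the relative path from the current file's directory back up to
# the directory holding all the version folders ("..", "../..", ...), so that
# f"{prefix}/{version}/index.html" below reaches any sibling version's docs.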
61 | 62 | # Create the new description list for the version & write the new file 63 | for file in html_files: 64 | with open(file) as f: 65 | soup = bs(f, "html.parser") 66 | version_dl = [dl for dl in soup.find_all("dl") if dl.find("dt", string="Versions")] 67 | if len(version_dl) == 0: 68 | continue 69 | version_dl[0].replace_with(create_version_dl(soup, prefix, version, versions)) 70 | with open(file, "w", encoding="utf-8") as f: 71 | f.write(str(soup)) 72 | 73 | 74 | if __name__ == "__main__": 75 | main() 76 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | GitPython 2 | beautifulsoup4 3 | ipykernel 4 | nbsphinx 5 | pandoc 6 | docutils==0.16 7 | sphinx<6 8 | sphinx_autodoc_typehints 9 | sphinx_rtd_theme 10 | -------------------------------------------------------------------------------- /docs/source/_static/figures: -------------------------------------------------------------------------------- 1 | ../../../figures -------------------------------------------------------------------------------- /docs/source/_templates/autosummary/module.rst: -------------------------------------------------------------------------------- 1 | {{ fullname }} 2 | {{ underline }} 3 | 4 | .. currentmodule:: {{fullname}} 5 | 6 | .. contents:: 7 | :local: 8 | 9 | .. automodule:: {{fullname}} 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | 14 | Members 15 | ======= 16 | -------------------------------------------------------------------------------- /docs/source/_templates/versions.html: -------------------------------------------------------------------------------- 1 | {% if display_lower_left %} 2 | {# Add rst-badge after rst-versions for small badge style. #} 3 | <div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions"> 4 | <span class="rst-current-version" data-toggle="rst-current-version"> 5 | <span class="fa fa-book"> Versions</span> 6 | {{ current_version }} 7 | <span class="fa fa-caret-down"></span> 8 | </span> 9 | <div class="rst-other-versions"> 10 | {% if versions|length >= 1 %} 11 | <dl> 12 | <dt>{{ _('Versions') }}</dt> 13 | {% for version in versions %} 14 | {% if version == current_version %} <strong> {% endif %} 15 | {% set rootdir = "/".join(pathto(root_doc).split("/")[:-1] + [".."]) %} 16 | <dd><a href="{{ rootdir }}/{{ version }}/index.html">{{ version }}</a></dd> 17 | {% if version == current_version %} </strong> {% endif %} 18 | {% endfor %} 19 | </dl> 20 | {% endif %} 21 | </div> 22 | </div> 23 | {% endif %} 24 | 25 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | from git import Repo 14 | import os 15 | import packaging.version 16 | import pkg_resources 17 | import re 18 | import sys 19 | 20 | sys.path.insert(0, os.path.abspath("..")) 21 | 22 | 23 | # -- Project information ----------------------------------------------------- 24 | 25 | project = "Merlion" 26 | copyright = "2021, salesforce.com, inc." 27 | 28 | # The full version, including alpha/beta/rc tags 29 | release = pkg_resources.get_distribution("salesforce-merlion").version 30 | 31 | default_role = "any" 32 | 33 | 34 | # -- General configuration --------------------------------------------------- 35 | 36 | # Add any Sphinx extension module names here, as strings. They can be 37 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 38 | # ones. 39 | extensions = [ 40 | "nbsphinx", 41 | "IPython.sphinxext.ipython_console_highlighting", 42 | "sphinx.ext.autodoc", 43 | "sphinx.ext.autosummary", 44 | "sphinx_autodoc_typehints", 45 | ] 46 | 47 | autoclass_content = "both" # include both class docstring and __init__ 48 | autodoc_default_options = { 49 | # Make sure that any autodoc declarations show the right members 50 | "members": True, 51 | "undoc-members": True, 52 | "inherited-members": False, 53 | "show-inheritance": True, 54 | } 55 | autodoc_member_order = "bysource" 56 | autosummary_generate = True # Make _autosummary files and include them 57 | 58 | # Add any paths that contain templates here, relative to this directory. 59 | templates_path = ["_templates"] 60 | 61 | 62 | # -- Options for HTML output ------------------------------------------------- 63 | 64 | # The theme to use for HTML and HTML Help pages. See the documentation for 65 | # a list of builtin themes. 66 | # 67 | html_theme = "sphinx_rtd_theme" 68 | 69 | html_theme_options = {"navigation_depth": -1} 70 | 71 | # Set up something to display versions, but only do it if the current version is set in the environment. 72 | if "current_version" in os.environ: 73 | current_version = os.environ["current_version"] 74 | stable_version = os.environ.get("stable_version", "latest") 75 | if current_version == stable_version != "latest": 76 | current_version = f"{current_version} (stable)" 77 | try: 78 | html_context 79 | except NameError: 80 | html_context = dict() 81 | html_context["display_lower_left"] = True 82 | 83 | repo = Repo(search_parent_directories=True) 84 | html_context["current_version"] = current_version 85 | html_context["version"] = current_version 86 | versions = sorted([tag.name for tag in repo.tags if re.match("v[0-9].*", tag.name)], reverse=True) 87 | versions = ["latest", *versions] 88 | html_context["versions"] = versions 89 | 90 | else: 91 | current_version = "latest" 92 | 93 | # List of patterns, relative to source directory, that match files and 94 | # directories to ignore when looking for source files. 95 | # This pattern also affects html_static_path and html_extra_path. 
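# Annotation (added for clarity, not in the original file): the check below
# assumes the notebooks moved under the "tutorials" symlink after v1.3.0, so
# newer builds exclude "examples" while v1.3.0 and earlier exclude "tutorials".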
96 | if current_version == "latest" or packaging.version.parse(current_version) > packaging.version.parse("1.3.0"): 97 | exclude_patterns = ["examples"] 98 | else: 99 | exclude_patterns = ["tutorials"] 100 | exclude_patterns += ["**.ipynb_checkpoints"] 101 | -------------------------------------------------------------------------------- /docs/source/merlion.evaluate.rst: -------------------------------------------------------------------------------- 1 | merlion.evaluate package 2 | ======================== 3 | This sub-package implements utilities and metrics for evaluating the performance 4 | of time series models on different tasks. 5 | 6 | .. automodule:: merlion.evaluate 7 | :members: 8 | :undoc-members: 9 | :show-inheritance: 10 | 11 | .. autosummary:: 12 | base 13 | anomaly 14 | forecast 15 | 16 | merlion.evaluate.base 17 | --------------------- 18 | 19 | .. automodule:: merlion.evaluate.base 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | 24 | merlion.evaluate.anomaly 25 | ------------------------ 26 | 27 | .. automodule:: merlion.evaluate.anomaly 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | 32 | merlion.evaluate.forecast 33 | ------------------------- 34 | 35 | .. automodule:: merlion.evaluate.forecast 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /docs/source/merlion.models.anomaly.change_point.rst: -------------------------------------------------------------------------------- 1 | anomaly.change\_point 2 | ===================== 3 | 4 | .. automodule:: merlion.models.anomaly.change_point 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | .. autosummary:: 10 | bocpd 11 | 12 | anomaly.change\_point.bocpd 13 | --------------------------- 14 | 15 | .. automodule:: merlion.models.anomaly.change_point.bocpd 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /docs/source/merlion.models.anomaly.forecast_based.rst: -------------------------------------------------------------------------------- 1 | anomaly.forecast\_based 2 | ======================= 3 | 4 | .. automodule:: merlion.models.anomaly.forecast_based 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | .. autosummary:: 10 | base 11 | arima 12 | sarima 13 | ets 14 | prophet 15 | mses 16 | 17 | anomaly.forecast\_based.base 18 | ---------------------------- 19 | 20 | .. automodule:: merlion.models.anomaly.forecast_based.base 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | anomaly.forecast\_based.arima 26 | ----------------------------- 27 | 28 | .. automodule:: merlion.models.anomaly.forecast_based.arima 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | 33 | anomaly.forecast\_based.sarima 34 | ------------------------------ 35 | 36 | .. automodule:: merlion.models.anomaly.forecast_based.sarima 37 | :members: 38 | :undoc-members: 39 | :show-inheritance: 40 | 41 | anomaly.forecast\_based.ets 42 | --------------------------- 43 | 44 | .. automodule:: merlion.models.anomaly.forecast_based.ets 45 | :members: 46 | :undoc-members: 47 | :show-inheritance: 48 | 49 | anomaly.forecast\_based.prophet 50 | ------------------------------- 51 | 52 | .. automodule:: merlion.models.anomaly.forecast_based.prophet 53 | :members: 54 | :undoc-members: 55 | :show-inheritance: 56 | 57 | anomaly.forecast\_based.mses 58 | ---------------------------- 59 | 60 | .. 
automodule:: merlion.models.anomaly.forecast_based.mses 61 | :members: 62 | :undoc-members: 63 | :show-inheritance: 64 | -------------------------------------------------------------------------------- /docs/source/merlion.models.anomaly.rst: -------------------------------------------------------------------------------- 1 | anomaly 2 | ======= 3 | 4 | .. automodule:: merlion.models.anomaly 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Base classes 10 | 11 | .. autosummary:: 12 | base 13 | 14 | Univariate models: 15 | 16 | .. autosummary:: 17 | dbl 18 | windstats 19 | spectral_residual 20 | stat_threshold 21 | zms 22 | 23 | `Multivariate <tutorials/anomaly/2_AnomalyMultivariate>` models: 24 | 25 | .. autosummary:: 26 | isolation_forest 27 | random_cut_forest 28 | autoencoder 29 | dagmm 30 | lstm_ed 31 | vae 32 | deep_point_anomaly_detector 33 | 34 | Subpackages 35 | ----------- 36 | 37 | .. toctree:: 38 | :maxdepth: 4 39 | 40 | merlion.models.anomaly.forecast_based 41 | merlion.models.anomaly.change_point 42 | 43 | Base classes 44 | ------------ 45 | 46 | anomaly.base 47 | ^^^^^^^^^^^^ 48 | .. automodule:: merlion.models.anomaly.base 49 | :members: 50 | :undoc-members: 51 | :show-inheritance: 52 | 53 | Univariate models 54 | ----------------- 55 | 56 | anomaly.dbl 57 | ^^^^^^^^^^^ 58 | .. automodule:: merlion.models.anomaly.dbl 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | anomaly.windstats 64 | ^^^^^^^^^^^^^^^^^ 65 | .. automodule:: merlion.models.anomaly.windstats 66 | :members: 67 | :undoc-members: 68 | :show-inheritance: 69 | 70 | anomaly.spectral\_residual 71 | ^^^^^^^^^^^^^^^^^^^^^^^^^^ 72 | .. automodule:: merlion.models.anomaly.spectral_residual 73 | :members: 74 | :undoc-members: 75 | :show-inheritance: 76 | 77 | anomaly.stat\_threshold 78 | ^^^^^^^^^^^^^^^^^^^^^^^ 79 | .. automodule:: merlion.models.anomaly.stat_threshold 80 | :members: 81 | :undoc-members: 82 | :show-inheritance: 83 | 84 | anomaly.zms 85 | ^^^^^^^^^^^ 86 | .. automodule:: merlion.models.anomaly.zms 87 | :members: 88 | :undoc-members: 89 | :show-inheritance: 90 | 91 | Multivariate models 92 | ------------------- 93 | 94 | anomaly.isolation\_forest 95 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 96 | .. automodule:: merlion.models.anomaly.isolation_forest 97 | :members: 98 | :undoc-members: 99 | :show-inheritance: 100 | 101 | anomaly.random\_cut\_forest 102 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^ 103 | .. automodule:: merlion.models.anomaly.random_cut_forest 104 | :members: 105 | :undoc-members: 106 | :show-inheritance: 107 | 108 | anomaly.autoencoder 109 | ^^^^^^^^^^^^^^^^^^^ 110 | .. automodule:: merlion.models.anomaly.autoencoder 111 | :members: 112 | :undoc-members: 113 | :show-inheritance: 114 | 115 | anomaly.vae 116 | ^^^^^^^^^^^ 117 | .. automodule:: merlion.models.anomaly.vae 118 | :members: 119 | :undoc-members: 120 | :show-inheritance: 121 | 122 | anomaly.dagmm 123 | ^^^^^^^^^^^^^ 124 | .. automodule:: merlion.models.anomaly.dagmm 125 | :members: 126 | :undoc-members: 127 | :show-inheritance: 128 | 129 | anomaly.lstm_ed 130 | ^^^^^^^^^^^^^^^ 131 | .. automodule:: merlion.models.anomaly.lstm_ed 132 | :members: 133 | :undoc-members: 134 | :show-inheritance: 135 | 136 | anomaly.lof 137 | ^^^^^^^^^^^^^^^ 138 | .. automodule:: merlion.models.anomaly.lof 139 | :members: 140 | :undoc-members: 141 | :show-inheritance: 142 | 143 | anomaly.deep\_point\_anomaly\_detector 144 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 145 | .. 
automodule:: merlion.models.anomaly.deep_point_anomaly_detector 146 | :members: 147 | :undoc-members: 148 | :show-inheritance: 149 | -------------------------------------------------------------------------------- /docs/source/merlion.models.automl.rst: -------------------------------------------------------------------------------- 1 | automl 2 | ====== 3 | 4 | .. automodule:: merlion.models.automl 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Base classes: 10 | 11 | .. autosummary:: 12 | base 13 | 14 | Models: 15 | 16 | .. autosummary:: 17 | autoets 18 | autoprophet 19 | autosarima 20 | 21 | Utilities: 22 | 23 | .. autosummary:: 24 | seasonality 25 | search 26 | 27 | Base classes 28 | ------------ 29 | automl.base 30 | ^^^^^^^^^^^ 31 | .. automodule:: merlion.models.automl.base 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | Models 37 | ------ 38 | automl.autoets 39 | ^^^^^^^^^^^^^^ 40 | .. automodule:: merlion.models.automl.autoets 41 | :members: 42 | :undoc-members: 43 | :show-inheritance: 44 | 45 | automl.autoprophet 46 | ^^^^^^^^^^^^^^^^^^ 47 | .. automodule:: merlion.models.automl.autoprophet 48 | :members: 49 | :undoc-members: 50 | :show-inheritance: 51 | 52 | automl.autosarima 53 | ^^^^^^^^^^^^^^^^^ 54 | .. automodule:: merlion.models.automl.autosarima 55 | :members: 56 | :undoc-members: 57 | :show-inheritance: 58 | 59 | 60 | Utilities 61 | --------- 62 | 63 | automl.seasonality 64 | ^^^^^^^^^^^^^^^^^^ 65 | .. automodule:: merlion.models.automl.seasonality 66 | :members: 67 | :undoc-members: 68 | :show-inheritance: 69 | 70 | automl.search 71 | ^^^^^^^^^^^^^ 72 | .. automodule:: merlion.models.automl.search 73 | :members: 74 | :undoc-members: 75 | :show-inheritance: 76 | -------------------------------------------------------------------------------- /docs/source/merlion.models.ensemble.rst: -------------------------------------------------------------------------------- 1 | ensemble 2 | ======== 3 | 4 | .. automodule:: merlion.models.ensemble 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | .. autosummary:: 10 | base 11 | combine 12 | anomaly 13 | forecast 14 | 15 | ensemble.base 16 | ------------- 17 | .. automodule:: merlion.models.ensemble.base 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | ensemble.combine 23 | ---------------- 24 | .. automodule:: merlion.models.ensemble.combine 25 | :members: 26 | :undoc-members: 27 | :show-inheritance: 28 | 29 | ensemble.anomaly 30 | ---------------- 31 | .. automodule:: merlion.models.ensemble.anomaly 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | ensemble.forecast 37 | ----------------- 38 | .. automodule:: merlion.models.ensemble.forecast 39 | :members: 40 | :undoc-members: 41 | :show-inheritance: 42 | -------------------------------------------------------------------------------- /docs/source/merlion.models.utils.rst: -------------------------------------------------------------------------------- 1 | utils 2 | ===== 3 | 4 | .. automodule:: merlion.models.utils 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | .. autosummary:: 10 | time_features 11 | rolling_window_dataset 12 | early_stopping 13 | autosarima_utils 14 | 15 | 16 | utils.time\_features 17 | -------------------- 18 | .. automodule:: merlion.models.utils.time_features 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | utils.rolling\_window\_dataset 25 | ------------------------------ 26 | 27 | .. 
automodule:: merlion.models.utils.rolling_window_dataset 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | 32 | 33 | utils.early\_stopping 34 | --------------------- 35 | .. automodule:: merlion.models.utils.early_stopping 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | 40 | 41 | utils.autosarima\_utils 42 | ----------------------- 43 | 44 | .. automodule:: merlion.models.utils.autosarima_utils 45 | :members: 46 | :undoc-members: 47 | :show-inheritance: -------------------------------------------------------------------------------- /docs/source/merlion.plot.rst: -------------------------------------------------------------------------------- 1 | merlion.plot package 2 | ==================== 3 | .. automodule:: merlion.plot 4 | :members: 5 | :undoc-members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/merlion.post_process.rst: -------------------------------------------------------------------------------- 1 | merlion.post\_process package 2 | ============================= 3 | This package implements some simple rules to post-process the output of an 4 | anomaly detection model. This includes rules for reshaping a sequence to follow 5 | a standard normal distribution (:py:mod:`merlion.post_process.calibrate`), sparsifying 6 | a sequence based on a threshold (:py:mod:`merlion.post_process.threshold`), and composing 7 | together sequences of post-processing rules (:py:mod:`merlion.post_process.sequence`). 8 | 9 | .. automodule:: merlion.post_process 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | 14 | .. autosummary:: 15 | base 16 | factory 17 | sequence 18 | calibrate 19 | threshold 20 | 21 | 22 | merlion.post\_process.base 23 | -------------------------- 24 | 25 | .. automodule:: merlion.post_process.base 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | merlion.post\_process.factory 31 | ----------------------------- 32 | 33 | .. automodule:: merlion.post_process.factory 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | merlion.post\_process.sequence 39 | ------------------------------ 40 | 41 | .. automodule:: merlion.post_process.sequence 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | .. _merlion.post_process.calibrate: 47 | 48 | merlion.post\_process.calibrate 49 | ------------------------------- 50 | 51 | .. automodule:: merlion.post_process.calibrate 52 | :members: 53 | :undoc-members: 54 | :show-inheritance: 55 | 56 | merlion.post\_process.threshold 57 | ------------------------------- 58 | 59 | .. automodule:: merlion.post_process.threshold 60 | :members: 61 | :undoc-members: 62 | :show-inheritance: 63 | -------------------------------------------------------------------------------- /docs/source/merlion.transform.rst: -------------------------------------------------------------------------------- 1 | merlion.transform package 2 | ========================= 3 | This package provides a number of useful data pre-processing transforms. Each 4 | transform is a callable object that inherits either from `TransformBase` or 5 | `InvertibleTransformBase`. 6 | 7 | We will introduce the key features of transform objects using the `Rescale` 8 | class. You may initialize a ``transform`` in three ways: 9 | 10 | .. 
code-block:: python 11 | 12 | from merlion.transform.factory import TransformFactory 13 | from merlion.transform.normalize import Rescale 14 | 15 | # Use the initializer 16 | transform = Rescale(bias=5.0, scale=3.2) 17 | 18 | # Use the class's from_dict() method with the arguments you would normally 19 | # give to the initializer 20 | kwargs = dict(bias=5.0, scale=3.2) 21 | transform = Rescale.from_dict(kwargs) 22 | 23 | # Use the TransformFactory with the class's name, and the keyword arguments 24 | # you would normally give to the initializer 25 | transform = TransformFactory.create("Rescale", **kwargs) 26 | 27 | After initializing a ``transform``, one may use it as follows: 28 | 29 | .. code-block:: python 30 | 31 | transform.train(time_series) # set any trainable params 32 | transformed = transform(time_series) # apply the transform to the time series 33 | inverted = transform.invert(transformed) # invert the transform 34 | state_dict = transform.to_dict() # serialize to a JSON-compatible dict 35 | 36 | Note that ``transform.invert()`` is supported even if the transform doesn't 37 | inherit from `InvertibleTransformBase`! In this case, ``transform.invert()`` 38 | implements a *pseudo*-inverse that may not recover the original ``time_series`` 39 | exactly. Additionally, the dict returned by ``transform.to_dict()`` is exactly 40 | the same as the dict expected by the class method ``TransformCls.from_dict()``. 41 | 42 | .. automodule:: merlion.transform 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | Base primitives: 48 | 49 | .. autosummary:: 50 | factory 51 | base 52 | sequence 53 | 54 | Resampling: 55 | 56 | .. autosummary:: 57 | resample 58 | moving_average 59 | 60 | Normalization: 61 | 62 | .. autosummary:: 63 | bound 64 | normalize 65 | 66 | Miscellaneous: 67 | 68 | .. autosummary:: 69 | anomalize 70 | 71 | Base primitives 72 | --------------- 73 | 74 | transform.factory 75 | ^^^^^^^^^^^^^^^^^ 76 | .. automodule:: merlion.transform.factory 77 | :members: 78 | :undoc-members: 79 | :show-inheritance: 80 | 81 | transform.base 82 | ^^^^^^^^^^^^^^ 83 | .. automodule:: merlion.transform.base 84 | :members: 85 | :undoc-members: 86 | :show-inheritance: 87 | 88 | transform.sequence 89 | ^^^^^^^^^^^^^^^^^^ 90 | .. automodule:: merlion.transform.sequence 91 | :members: 92 | :undoc-members: 93 | :show-inheritance: 94 | 95 | Resampling 96 | ---------- 97 | 98 | transform.resample 99 | ^^^^^^^^^^^^^^^^^^ 100 | .. automodule:: merlion.transform.resample 101 | :members: 102 | :undoc-members: 103 | :show-inheritance: 104 | 105 | transform.moving\_average 106 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 107 | .. automodule:: merlion.transform.moving_average 108 | :members: 109 | :undoc-members: 110 | :show-inheritance: 111 | 112 | Normalization 113 | ------------- 114 | 115 | transform.normalize 116 | ^^^^^^^^^^^^^^^^^^^ 117 | .. automodule:: merlion.transform.normalize 118 | :members: 119 | :undoc-members: 120 | :show-inheritance: 121 | 122 | transform.bound 123 | ^^^^^^^^^^^^^^^ 124 | .. automodule:: merlion.transform.bound 125 | :members: 126 | :undoc-members: 127 | :show-inheritance: 128 | 129 | 130 | Miscellaneous 131 | ------------- 132 | 133 | transform.anomalize 134 | ^^^^^^^^^^^^^^^^^^^ 135 | ..
automodule:: merlion.transform.anomalize 136 | :members: 137 | :undoc-members: 138 | :show-inheritance: 139 | -------------------------------------------------------------------------------- /docs/source/merlion.utils.rst: -------------------------------------------------------------------------------- 1 | 2 | merlion.utils package 3 | ===================== 4 | This package contains various utilities, including the `TimeSeries` class and 5 | utilities for resampling time series. 6 | 7 | .. automodule:: merlion.utils 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | .. autosummary:: 13 | time_series 14 | resample 15 | data_io 16 | hts 17 | ts_generator 18 | conj_priors 19 | istat 20 | 21 | merlion.utils.time\_series 22 | -------------------------- 23 | .. automodule:: merlion.utils.time_series 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | merlion.utils.resample 29 | ---------------------- 30 | .. automodule:: merlion.utils.resample 31 | :members: 32 | :undoc-members: 33 | :show-inheritance: 34 | 35 | merlion.utils.data\_io 36 | ---------------------- 37 | .. automodule:: merlion.utils.data_io 38 | :members: 39 | :undoc-members: 40 | :show-inheritance: 41 | 42 | merlion.utils.hts 43 | ----------------- 44 | .. automodule:: merlion.utils.hts 45 | :members: 46 | :undoc-members: 47 | :show-inheritance: 48 | 49 | merlion.utils.ts\_generator 50 | --------------------------- 51 | .. automodule:: merlion.utils.ts_generator 52 | :members: 53 | :undoc-members: 54 | :show-inheritance: 55 | 56 | merlion.utils.conj_priors 57 | ------------------------- 58 | .. automodule:: merlion.utils.conj_priors 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | merlion.utils.istat 64 | ------------------- 65 | .. automodule:: merlion.utils.istat 66 | :members: 67 | :undoc-members: 68 | :show-inheritance: 69 | -------------------------------------------------------------------------------- /docs/source/ts_datasets.anomaly.rst: -------------------------------------------------------------------------------- 1 | ts_datasets.anomaly package 2 | =========================== 3 | 4 | .. automodule:: ts_datasets.anomaly 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/ts_datasets.forecast.rst: -------------------------------------------------------------------------------- 1 | ts_datasets.forecast package 2 | ============================ 3 | 4 | .. automodule:: ts_datasets.forecast 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/source/ts_datasets.rst: -------------------------------------------------------------------------------- 1 | ts_datasets: Easy Data Loading 2 | ============================== 3 | 4 | :py:mod:`ts_datasets` implements Python classes that manipulate numerous time series datasets 5 | into standardized ``pandas.DataFrame`` s. The sub-modules are :py:mod:`ts_datasets.anomaly` 6 | for time series anomaly detection, and :py:mod:`ts_datasets.forecast` for time series forecasting. 7 | Simply install the package by calling ``pip install -e ts_datasets/`` from the root directory of Merlion. 8 | Then, you can load a dataset (e.g. the "realAWSCloudwatch" split of the Numenta Anomaly Benchmark 9 | or the "Hourly" subset of the M4 dataset) by calling 10 | 11 | .. 
code-block:: python 12 | 13 | from ts_datasets.anomaly import NAB 14 | from ts_datasets.forecast import M4 15 | anom_dataset = NAB(subset="realAWSCloudwatch", rootdir=path_to_NAB) 16 | forecast_dataset = M4(subset="Hourly", rootdir=path_to_M4) 17 | 18 | If you install this package in editable mode (i.e. specify ``-e`` when calling ``pip install -e ts_datasets/``), 19 | there is no need to specify a ``rootdir`` for any of the data loaders. 20 | 21 | The core features of general data loaders (e.g. for forecasting) are outlined in the API doc for 22 | :py:class:`ts_datasets.base.BaseDataset`, and the features for time series anomaly detection data loaders 23 | are outlined in the API doc for :py:class:`ts_datasets.anomaly.TSADBaseDataset`. 24 | 25 | The easiest way to load a custom dataset is to use either the :py:class:`ts_datasets.forecast.CustomDataset` or 26 | :py:class:`ts_datasets.anomaly.CustomAnomalyDataset` classes. Please review the `tutorial <tutorials/CustomDataset>` 27 | to get started. 28 | 29 | .. automodule:: ts_datasets 30 | :members: 31 | :undoc-members: 32 | :show-inheritance: 33 | 34 | Subpackages 35 | ----------- 36 | 37 | .. toctree:: 38 | :maxdepth: 4 39 | 40 | ts_datasets.anomaly 41 | ts_datasets.forecast 42 | 43 | datasets.base module 44 | -------------------- 45 | 46 | .. automodule:: ts_datasets.base 47 | :members: 48 | :undoc-members: 49 | :show-inheritance: 50 | -------------------------------------------------------------------------------- /docs/source/tutorials: -------------------------------------------------------------------------------- 1 | ../../examples -------------------------------------------------------------------------------- /docs/source/tutorials.rst: -------------------------------------------------------------------------------- 1 | Tutorials & Example Code 2 | ======================== 3 | 4 | Basics 5 | ------ 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | :glob: 10 | 11 | tutorials/TimeSeries.ipynb 12 | tutorials/CustomDataset.ipynb 13 | 14 | Anomaly Detection 15 | ----------------- 16 | .. toctree:: 17 | :maxdepth: 2 18 | :glob: 19 | 20 | tutorials/anomaly/* 21 | 22 | Forecasting 23 | ----------- 24 | .. toctree:: 25 | :maxdepth: 2 26 | :glob: 27 | 28 | tutorials/forecast/* 29 | 30 | Advanced Features 31 | ----------------- 32 | .. toctree:: 33 | :maxdepth: 2 34 | :glob: 35 | 36 | tutorials/advanced/* 37 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | This file outlines how you should navigate the Jupyter notebooks in this folder. 2 | All new users should start with [`TimeSeries.ipynb`](TimeSeries.ipynb), which explains 3 | how to use Merlion's `UnivariateTimeSeries` and `TimeSeries` classes. These classes are 4 | the core data format used throughout the repo. 5 | 6 | If you are interested in anomaly detection, you should next read 7 | [`anomaly/AnomalyIntro.ipynb`](anomaly/0_AnomalyIntro.ipynb) to understand how to use 8 | anomaly detection models in Merlion. Afterwards, if you want to implement a new 9 | anomaly detection model in Merlion, please read [`CONTRIBUTING.md`](../CONTRIBUTING.md) 10 | and [`anomaly/AnomalyNewModel.ipynb`](anomaly/3_AnomalyNewModel.ipynb). 11 | 12 | If you are interested in forecasting, you should next read 13 | [`forecast/ForecastIntro.ipynb`](forecast/0_ForecastIntro.ipynb) to understand how to use 14 | forecasting models in Merlion. 
Afterward, if you want to implement a new forecasting 15 | model in Merlion, please read [`CONTRIBUTING.md`](../CONTRIBUTING.md) 16 | and [`forecast/ForecastNewModel.ipynb`](forecast/4_ForecastNewModel.ipynb). 17 | 18 | We offer more advanced tutorials on specific high-performing models (e.g. AutoSARIMA) 19 | in the [`advanced`](advanced) subdirectory. If you are interested in other utilities offered by the `merlion` 20 | package, look at the resources inside the [`misc`](misc) subdirectory. For example, 21 | [`misc/generate_synthetic_tsad_dataset.py`](misc/generate_synthetic_tsad_dataset.py) 22 | is a script for generating an artificial anomaly detection dataset using `merlion`'s time series 23 | generation and anomaly injection modules. This particular dataset may be loaded using the data 24 | loader `ts_datasets.anomaly.Synthetic`. 25 | -------------------------------------------------------------------------------- /examples/misc/generate_synthetic_tsad_dataset.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2022 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | from os.path import abspath, dirname, join 8 | from collections import OrderedDict 9 | import os 10 | 11 | import numpy as np 12 | from math import floor, ceil 13 | 14 | from merlion.utils.ts_generator import GeneratorConcatenator, TimeSeriesGenerator 15 | from merlion.transform.anomalize import LevelShift, Shock, TrendChange 16 | 17 | MERLION_ROOT = dirname(dirname(dirname(abspath(__file__)))) 18 | DATADIR = join(MERLION_ROOT, "data") 19 | 20 | 21 | def main(): 22 | np.random.seed(12345) 23 | n = 10000 24 | 25 | # Generate Synthetic Time Series 26 | ts_generators = [ 27 | # generates a time series that trends upward before 28 | # trending downward 29 | GeneratorConcatenator( 30 | generators=[ 31 | # upward trend 32 | TimeSeriesGenerator(f=lambda x: x ** 1.6, n=floor(0.6 * n)), 33 | # downward trend 34 | TimeSeriesGenerator(f=lambda x: -x ** 1.2, n=ceil(0.4 * n)), 35 | ], 36 | noise=lambda: np.random.normal(0, 500), 37 | string_outputs=True, 38 | name="upward_downward", 39 | ), 40 | # generates a white noise series 41 | TimeSeriesGenerator(f=lambda x: 0, n=n, name="horizontal"), 42 | # generates a time series with multiple seasonalities 43 | TimeSeriesGenerator(f=lambda x: 2 * np.sin(x * 0.1) + np.sin(x * 0.02), n=n, name="seasonal"), 44 | ] 45 | 46 | ts_list = [generator.generate(return_ts=True) for generator in ts_generators] 47 | 48 | # Initialize Anomaly Injection Transforms 49 | anomalize_kwargs = dict(anom_prob=0.002, anom_width_range=(20, 200), alpha=0.5) 50 | 51 | anomalizers = OrderedDict( 52 | shock=Shock(pos_prob=0.5, sd_range=(4, 8), **anomalize_kwargs), 53 | spike=Shock(pos_prob=1.0, sd_range=(4, 8), **anomalize_kwargs), 54 | dip=Shock(pos_prob=0.0, sd_range=(4, 8), **anomalize_kwargs), 55 | level=LevelShift(pos_prob=0.5, sd_range=(3, 6), **anomalize_kwargs), 56 | trend=TrendChange(anom_prob=0.01, pos_prob=0.5, scale_range=(2.5, 5)), 57 | ) 58 | 59 | # make directory for writing anomalized data 60 | anom_dir = join(DATADIR, "synthetic_anomaly") 61 | os.makedirs(anom_dir, exist_ok=True) 62 | 63 | for i, ts in enumerate(ts_list): 64 | # write original ts 65 | csv = join(anom_dir, f"{ts.names[0]}.csv") 66 | ts.to_csv(csv) 67 | # anomalize ts with each anomalizer
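# (the per-iteration seed below makes each (series i, anomalizer j) pair reproducible on its own)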
68 | for j, (name, anom) in enumerate(anomalizers.items()): 69 | np.random.seed(1000 * i + j) 70 | anom_ts = anom(ts) 71 | csv = join(anom_dir, f"{anom_ts.names[0]}_{name}_anomaly.csv") 72 | anom_ts.to_csv(csv) 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /figures/anom_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/figures/anom_example.png -------------------------------------------------------------------------------- /figures/dashboard_anomaly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/figures/dashboard_anomaly.png -------------------------------------------------------------------------------- /figures/dashboard_file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/figures/dashboard_file.png -------------------------------------------------------------------------------- /figures/dashboard_forecast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/figures/dashboard_forecast.png -------------------------------------------------------------------------------- /figures/forecast_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/figures/forecast_example.png -------------------------------------------------------------------------------- /k8s-spec/anomaly.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2018 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | # Support for Python is experimental, and requires building SNAPSHOT image of Apache Spark, 17 | # with `imagePullPolicy` set to Always 18 | 19 | # Install the spark operator as follows: 20 | # helm install spark-operator spark-operator/spark-operator --namespace spark-operator --create-namespace --set sparkJobNamespace=spark-apps 21 | 22 | apiVersion: "sparkoperator.k8s.io/v1beta2" 23 | kind: SparkApplication 24 | metadata: 25 | name: anomaly 26 | namespace: spark-apps 27 | spec: 28 | sparkVersion: "3.1.1" 29 | sparkConf: 30 | spark.sql.execution.arrow.pyspark.enabled: "true" 31 | 32 | restartPolicy: 33 | type: Never 34 | 35 | driver: 36 | cores: 1 37 | memory: "1G" 38 | serviceAccount: spark-operator-spark 39 | labels: 40 | version: 3.1.1 41 | 42 | executor: 43 | cores: 1 44 | instances: 2 45 | memory: "2G" 46 | podSecurityContext: 47 | runAsNonRoot: true 48 | runAsUser: 185 49 | labels: 50 | version: 3.1.1 51 | 52 | type: Python 53 | pythonVersion: "3" 54 | mode: cluster 55 | image: "merlion-spark:latest" 56 | imagePullPolicy: Always 57 | mainApplicationFile: local:///opt/spark/apps/anomaly.py 58 | arguments: 59 | - "--data" 60 | - "/opt/spark/work-dir/walmart_mini.csv" # can be on the cloud if you configure Spark appropriately 61 | - "--output_path" 62 | - "results" # can be on the cloud if you configure Spark appropriately 63 | - "--train_test_split" 64 | - "2012-08-01" 65 | - "--data_cols" 66 | - '[ 67 | "Weekly_Sales", 68 | "Unemployment", 69 | "CPI", 70 | "Fuel_Price", 71 | "Temperature" 72 | ]' 73 | - "--index_cols" 74 | - '["Store", "Dept"]' 75 | - "--time_col" 76 | - "Date" 77 | - "--model" 78 | - '{"name": "DefaultDetector"}' 79 | -------------------------------------------------------------------------------- /k8s-spec/forecast.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2018 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | # Support for Python is experimental, and requires building SNAPSHOT image of Apache Spark, 17 | # with `imagePullPolicy` set to Always 18 | 19 | # Install the spark operator as follows: 20 | # helm install spark-operator spark-operator/spark-operator --namespace spark-operator --create-namespace --set sparkJobNamespace=spark-apps 21 | 22 | apiVersion: "sparkoperator.k8s.io/v1beta2" 23 | kind: SparkApplication 24 | metadata: 25 | name: forecast 26 | namespace: spark-apps 27 | spec: 28 | sparkVersion: "3.1.1" 29 | sparkConf: 30 | spark.sql.execution.arrow.pyspark.enabled: "true" 31 | 32 | restartPolicy: 33 | type: Never 34 | 35 | driver: 36 | cores: 1 37 | memory: "1G" 38 | serviceAccount: spark-operator-spark 39 | labels: 40 | version: 3.1.1 41 | 42 | executor: 43 | cores: 1 44 | instances: 2 45 | memory: "2G" 46 | podSecurityContext: 47 | runAsNonRoot: true 48 | runAsUser: 185 49 | labels: 50 | version: 3.1.1 51 | 52 | type: Python 53 | pythonVersion: "3" 54 | mode: cluster 55 | image: "merlion-spark:latest" 56 | imagePullPolicy: Always 57 | mainApplicationFile: local:///opt/spark/apps/forecast.py 58 | arguments: 59 | - "--train_data" 60 | - "/opt/spark/work-dir/walmart_mini.csv" # can be on the cloud if you configure Spark appropriately 61 | - "--output_path" 62 | - "results" # can be on the cloud if you configure Spark appropriately 63 | - "--target_col" 64 | - "Weekly_Sales" 65 | - "--data_cols" 66 | - '[ 67 | "Weekly_Sales", 68 | "Unemployment", 69 | "CPI", 70 | "Fuel_Price", 71 | "Temperature" 72 | ]' 73 | - "--index_cols" 74 | - '["Store", "Dept"]' 75 | - "--time_col" 76 | - "Date" 77 | - "--hierarchical" 78 | - "--agg_dict" 79 | - '{ 80 | "Weekly_Sales": "sum" 81 | }' 82 | - "--model" 83 | - '{"name": "DefaultForecaster"}' 84 | - "--time_stamps" 85 | - '[ 86 | "2012-11-02", 87 | "2012-11-09", 88 | "2012-11-16", 89 | "2012-11-23", 90 | "2012-11-30", 91 | "2012-12-07", 92 | "2012-12-14", 93 | "2012-12-21", 94 | "2012-12-28", 95 | "2013-01-04", 96 | "2013-01-11", 97 | "2013-01-18", 98 | "2013-01-25", 99 | "2013-02-01", 100 | "2013-02-08", 101 | "2013-02-15", 102 | "2013-02-22", 103 | "2013-03-01", 104 | "2013-03-08", 105 | "2013-03-15", 106 | "2013-03-22", 107 | "2013-03-29", 108 | "2013-04-05", 109 | "2013-04-12", 110 | "2013-04-19", 111 | "2013-04-26", 112 | "2013-05-03" 113 | ]' 114 | -------------------------------------------------------------------------------- /merlion/dashboard/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | try: 8 | import dash 9 | import diskcache 10 | import dash_bootstrap_components 11 | except ImportError as e: 12 | err = ( 13 | "Try installing Merlion with optional dependencies using `pip install salesforce-merlion[dashboard]` or " 14 | "`pip install salesforce-merlion[all]`" 15 | ) 16 | raise ImportError(str(e) + ". " + err) 17 | -------------------------------------------------------------------------------- /merlion/dashboard/__main__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved.
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | from merlion.dashboard.server import app 8 | 9 | if __name__ == "__main__": 10 | app.run_server(debug=False) 11 | -------------------------------------------------------------------------------- /merlion/dashboard/assets/Acumin-BdPro.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/merlion/dashboard/assets/Acumin-BdPro.otf -------------------------------------------------------------------------------- /merlion/dashboard/assets/fonts/SalesforceSans-Bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/merlion/dashboard/assets/fonts/SalesforceSans-Bold.woff -------------------------------------------------------------------------------- /merlion/dashboard/assets/fonts/SalesforceSans-BoldItalic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/merlion/dashboard/assets/fonts/SalesforceSans-BoldItalic.woff -------------------------------------------------------------------------------- /merlion/dashboard/assets/fonts/SalesforceSans-Italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/merlion/dashboard/assets/fonts/SalesforceSans-Italic.woff -------------------------------------------------------------------------------- /merlion/dashboard/assets/fonts/SalesforceSans-Light.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/merlion/dashboard/assets/fonts/SalesforceSans-Light.woff -------------------------------------------------------------------------------- /merlion/dashboard/assets/fonts/SalesforceSans-LightItalic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/merlion/dashboard/assets/fonts/SalesforceSans-LightItalic.woff -------------------------------------------------------------------------------- /merlion/dashboard/assets/fonts/SalesforceSans-Regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/merlion/dashboard/assets/fonts/SalesforceSans-Regular.woff -------------------------------------------------------------------------------- /merlion/dashboard/assets/fonts/SalesforceSans-Thin.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/merlion/dashboard/assets/fonts/SalesforceSans-Thin.woff -------------------------------------------------------------------------------- /merlion/dashboard/assets/fonts/SalesforceSans-ThinItalic.woff: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/merlion/dashboard/assets/fonts/SalesforceSans-ThinItalic.woff -------------------------------------------------------------------------------- /merlion/dashboard/assets/resizing.js: -------------------------------------------------------------------------------- 1 | /* resize figures in table upon callback get fires */ 2 | 3 | if(!window.dash_clientside) {window.dash_clientside = {};} 4 | window.dash_clientside.clientside = { 5 | resize: function (value) { 6 | console.log("resizing..."); 7 | window.dispatchEvent(new Event('resize')); 8 | return null 9 | } 10 | } -------------------------------------------------------------------------------- /merlion/dashboard/assets/styles.css: -------------------------------------------------------------------------------- 1 | 2 | a:link { 3 | color: #696969; 4 | text-decoration: none; 5 | } 6 | 7 | /* visited link */ 8 | a:visited { 9 | color: #696969; 10 | text-decoration: none; 11 | } 12 | 13 | /* mouse over link */ 14 | a:hover { 15 | opacity: 0.6; 16 | } 17 | 18 | /* selected link */ 19 | a:active { 20 | color: lightgrey; 21 | text-decoration: underline; 22 | } 23 | 24 | .greyline { 25 | width: 90%; 26 | border-bottom: 1px solid lightgrey; 27 | } 28 | #tabs{ 29 | filter:drop-shadow(0px 4px 6px rgba(0, 0, 0, 0.2)); 30 | } 31 | .tab { 32 | /* border-style: solid; 33 | border-color: rgb(0, 0, 0, 0.2); */ 34 | border-bottom-style: none; 35 | border-top-style: none; 36 | border-right-style: none; 37 | padding: 5px 10px; 38 | border:none !important; 39 | } 40 | 41 | .rowrow { 42 | margin: auto; 43 | text-align: center; 44 | width: 97%; 45 | } 46 | 47 | .rowrow2 { 48 | margin: auto; 49 | width: 97%; 50 | } 51 | 52 | .tablast { 53 | border-style: solid; 54 | border-color: rgb(0, 0, 0, 0.2); 55 | border-bottom-style: none; 56 | border-top-style: none; 57 | color: black; 58 | padding: 6px 20px; 59 | text-align: center; 60 | text-decoration: none; 61 | display: inline-block; 62 | } 63 | 64 | #learn-more-button { 65 | float: right; 66 | padding-left: 15px; 67 | padding-right: 15px; 68 | text-transform: none; 69 | margin: 25px 25px; 70 | } 71 | 72 | -------------------------------------------------------------------------------- /merlion/dashboard/assets/upload.svg: -------------------------------------------------------------------------------- 1 | <svg xmlns="http://www.w3.org/2000/svg" x="0px" y="0px" 2 | width="20px" height="20px" viewBox="0 0 52 52" enable-background="new 0 0 52 52" xml:space="preserve"> 3 | <g> 4 | <path fill="#1B96FF" d="M48.5,31h-3c-0.8,0-1.5,0.8-1.5,1.5v10c0,0.8-0.7,1.5-1.5,1.5h-33C8.7,44,8,43.3,8,42.5v-10 5 | C8,31.8,7.3,31,6.5,31h-3C2.7,31,2,31.8,2,32.5V46c0,2.2,1.8,4,4,4h40c2.2,0,4-1.8,4-4V32.5C50,31.8,49.3,31,48.5,31z"/> 6 | <path fill="#1B96FF" d="M27,2.4c-0.6-0.6-1.5-0.6-2.1,0L11.4,15.9c-0.6,0.6-0.6,1.5,0,2.1l2.1,2.1c0.6,0.6,1.5,0.6,2.1,0l5.6-5.6 7 | c0.6-0.6,1.8-0.2,1.8,0.7v21.2c0,0.8,0.6,1.5,1.4,1.5h3c0.8,0,1.6-0.8,1.6-1.5V15.3c0-0.9,1-1.3,1.7-0.7l5.6,5.6 8 | c0.6,0.6,1.5,0.6,2.1,0l2.1-2.1c0.6-0.6,0.6-1.5,0-2.1L27,2.4z"/> 9 | </g> 10 | </svg> 11 | -------------------------------------------------------------------------------- /merlion/dashboard/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | -------------------------------------------------------------------------------- /merlion/dashboard/models/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | -------------------------------------------------------------------------------- /merlion/dashboard/models/data.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import sys 8 | import logging 9 | from collections import OrderedDict 10 | from merlion.dashboard.models.utils import DataMixin 11 | from merlion.dashboard.pages.utils import create_empty_figure 12 | from merlion.dashboard.utils.log import DashLogger 13 | from merlion.dashboard.utils.plot import data_table, plot_timeseries 14 | 15 | dash_logger = DashLogger(stream=sys.stdout) 16 | 17 | 18 | class DataAnalyzer(DataMixin): 19 | def __init__(self): 20 | self.logger = logging.getLogger(__name__) 21 | self.logger.setLevel(logging.DEBUG) 22 | self.logger.addHandler(dash_logger) 23 | 24 | @staticmethod 25 | def get_stats(df): 26 | stats = { 27 | "@global": OrderedDict( 28 | { 29 | "NO. of Variables": len(df.columns), 30 | "Time Series Length": len(df), 31 | "Has NaNs": bool(df.isnull().values.any()), 32 | } 33 | ), 34 | "@columns": list(df.columns), 35 | } 36 | for col in df.columns: 37 | stats[col] = df[col].describe().to_dict(into=OrderedDict) 38 | return stats 39 | 40 | @staticmethod 41 | def get_data_table(df): 42 | return data_table(df) 43 | 44 | @staticmethod 45 | def get_data_figure(df): 46 | if df is None: 47 | return create_empty_figure() 48 | else: 49 | return plot_timeseries(df) 50 | -------------------------------------------------------------------------------- /merlion/dashboard/pages/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | -------------------------------------------------------------------------------- /merlion/dashboard/pages/utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import dash_bootstrap_components as dbc 8 | from dash import html, dash_table 9 | import pandas as pd 10 | from merlion.dashboard.settings import * 11 | from merlion.dashboard.utils.plot import plot_timeseries 12 | 13 | styles = { 14 | "json-output": {"overflow-y": "scroll", "height": "calc(90% - 25px)", "border": "thin lightgrey solid"}, 15 | "tab": {"height": "calc(98vh - 80px)"}, 16 | "log-output": { 17 | "overflow-y": "scroll", 18 | "height": "calc(90% - 25px)", 19 | "border": "thin lightgrey solid", 20 | "white-space": "pre-wrap", 21 | }, 22 | } 23 | 24 | 25 | def create_modal(modal_id, header, content, content_id, button_id): 26 | modal = html.Div( 27 | [ 28 | dbc.Modal( 29 | [ 30 | dbc.ModalHeader(dbc.ModalTitle(header)), 31 | dbc.ModalBody(content, id=content_id), 32 | dbc.ModalFooter(dbc.Button("Close", id=button_id, className="ml-auto", n_clicks=0)), 33 | ], 34 | id=modal_id, 35 | is_open=False, 36 | ) 37 | ] 38 | ) 39 | return modal 40 | 41 | 42 | def create_param_table(params=None, height=100): 43 | if params is None or len(params) == 0: 44 | data = [{"Parameter": "", "Value": ""}] 45 | else: 46 | data = [{"Parameter": key, "Value": str(value["default"])} for key, value in params.items()] 47 | 48 | table = dash_table.DataTable( 49 | data=data, 50 | columns=[{"id": "Parameter", "name": "Parameter"}, {"id": "Value", "name": "Value"}], 51 | editable=True, 52 | style_header_conditional=[{"textAlign": "center", "font-family": "Salesforce Sans"}], 53 | style_cell_conditional=[{"textAlign": "center", "font-family": "Salesforce Sans"}], 54 | style_table={"overflowX": "scroll", "overflowY": "scroll", "height": height}, 55 | style_header=dict(backgroundColor=TABLE_HEADER_COLOR, color="white"), 56 | style_data=dict(backgroundColor=TABLE_DATA_COLOR), 57 | ) 58 | return table 59 | 60 | 61 | def create_metric_table(metrics=None): 62 | if metrics is None or len(metrics) == 0: 63 | data, columns = {}, [] 64 | for i in range(4): 65 | data[f"Metric {i}"] = "-" 66 | columns.append({"id": f"Metric {i}", "name": f"Metric {i}"}) 67 | 68 | else: 69 | data = metrics 70 | columns = [{"id": key, "name": key} for key in metrics.keys()] 71 | 72 | if not isinstance(data, list): 73 | data = [data] 74 | table = dash_table.DataTable( 75 | data=data, 76 | columns=columns, 77 | editable=False, 78 | style_header_conditional=[{"textAlign": "center", "font-family": "Salesforce Sans"}], 79 | style_cell_conditional=[{"textAlign": "center", "font-family": "Salesforce Sans"}], 80 | style_table={"overflowX": "scroll"}, 81 | style_header=dict(backgroundColor=TABLE_HEADER_COLOR, color="white"), 82 | style_data=dict(backgroundColor=TABLE_DATA_COLOR), 83 | ) 84 | return table 85 | 86 | 87 | def create_empty_figure(): 88 | return plot_timeseries(pd.DataFrame(index=pd.DatetimeIndex([]))) 89 | -------------------------------------------------------------------------------- /merlion/dashboard/server.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import dash 8 | import dash_bootstrap_components as dbc 9 | from dash import dcc 10 | from dash import html 11 | from dash.dependencies import Input, Output, State 12 | import logging 13 | 14 | from merlion.dashboard.utils.layout import create_banner, create_layout 15 | from merlion.dashboard.pages.data import create_data_layout 16 | from merlion.dashboard.pages.forecast import create_forecasting_layout 17 | from merlion.dashboard.pages.anomaly import create_anomaly_layout 18 | 19 | from merlion.dashboard.callbacks import data 20 | from merlion.dashboard.callbacks import forecast 21 | from merlion.dashboard.callbacks import anomaly 22 | 23 | logging.basicConfig(format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", level=logging.INFO) 24 | 25 | app = dash.Dash( 26 | __name__, 27 | meta_tags=[{"name": "viewport", "content": "width=device-width, initial-scale=1"}], 28 | external_stylesheets=[dbc.themes.BOOTSTRAP], 29 | title="Merlion Dashboard", 30 | ) 31 | app.config["suppress_callback_exceptions"] = True 32 | app.layout = html.Div( 33 | [ 34 | dcc.Location(id="url", refresh=False), 35 | html.Div(id="page-content"), 36 | dcc.Store(id="data-state"), 37 | dcc.Store(id="anomaly-state"), 38 | dcc.Store(id="forecasting-state"), 39 | ] 40 | ) 41 | server = app.server 42 | 43 | 44 | @app.callback(Output("page-content", "children"), [Input("url", "pathname")]) 45 | def _display_page(pathname): 46 | return html.Div(id="app-container", children=[create_banner(app), html.Br(), create_layout()]) 47 | 48 | 49 | @app.callback( 50 | Output("plots", "children"), 51 | Input("tabs", "value"), 52 | [State("data-state", "data"), State("anomaly-state", "data"), State("forecasting-state", "data")], 53 | ) 54 | def _click_tab(tab, data_state, anomaly_state, forecasting_state): 55 | if tab == "file-manager": 56 | return create_data_layout() 57 | elif tab == "forecasting": 58 | return create_forecasting_layout() 59 | elif tab == "anomaly": 60 | return create_anomaly_layout() 61 | -------------------------------------------------------------------------------- /merlion/dashboard/settings.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | TABLE_HEADER_COLOR = "#014486" 8 | TABLE_DATA_COLOR = "white" 9 | -------------------------------------------------------------------------------- /merlion/dashboard/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | -------------------------------------------------------------------------------- /merlion/dashboard/utils/file_manager.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import os 8 | import base64 9 | import zipfile 10 | import diskcache 11 | from pathlib import Path 12 | from dash.long_callback import DiskcacheLongCallbackManager 13 | 14 | 15 | class SingletonClass: 16 | def __new__(cls): 17 | if not hasattr(cls, "instance"): 18 | cls.instance = super(SingletonClass, cls).__new__(cls) 19 | return cls.instance 20 | 21 | 22 | class FileManager(SingletonClass): 23 | def __init__(self, directory=None): 24 | self.directory = os.path.join(str(Path.home()), "merlion") if directory is None else directory 25 | if not os.path.exists(self.directory): 26 | os.makedirs(self.directory) 27 | 28 | self.data_folder = os.path.join(self.directory, "data") 29 | if not os.path.exists(self.data_folder): 30 | os.makedirs(self.data_folder) 31 | 32 | self.model_folder = os.path.join(self.directory, "models") 33 | if not os.path.exists(self.model_folder): 34 | os.makedirs(self.model_folder) 35 | 36 | self.cache_folder = os.path.join(self.directory, "cache") 37 | self.long_callback_manager = DiskcacheLongCallbackManager(diskcache.Cache(self.cache_folder)) 38 | 39 | def save_file(self, name, content): 40 | data = content.encode("utf8").split(b";base64,")[1] 41 | with open(os.path.join(self.data_folder, name), "wb") as fp: 42 | fp.write(base64.decodebytes(data)) 43 | 44 | def uploaded_files(self): 45 | files = [] 46 | for filename in os.listdir(self.data_folder): 47 | path = os.path.join(self.data_folder, filename) 48 | if os.path.isfile(path): 49 | files.append(filename) 50 | return files 51 | 52 | def get_model_download_path(self, model_name): 53 | path = os.path.join(self.model_folder, model_name) 54 | zip_file = os.path.join(path, f"{model_name}.zip") 55 | with zipfile.ZipFile(zip_file, mode="w") as f: 56 | for file in Path(path).iterdir(): 57 | if Path(file).name != f"{model_name}.zip": 58 | f.write(file, arcname=file.name) 59 | return zip_file 60 | 61 | def get_model_list(self): 62 | models = [] 63 | for name in os.listdir(self.model_folder): 64 | folder = os.path.join(self.model_folder, name) 65 | if os.path.isdir(folder): 66 | models.append(name) 67 | return models 68 | 69 | @property 70 | def base_directory(self): 71 | return self.directory 72 | 73 | @property 74 | def data_directory(self): 75 | return self.data_folder 76 | 77 | @property 78 | def model_directory(self): 79 | return self.model_folder 80 | 81 | def get_long_callback_manager(self): 82 | return self.long_callback_manager 83 | -------------------------------------------------------------------------------- /merlion/dashboard/utils/layout.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | from dash import dcc 8 | from dash import html 9 | 10 | 11 | tab_style = { 12 | "borderBottom": "1px solid #d6d6d6", 13 | "padding": "6px", 14 | "fontWeight": "bold", 15 | "backgroundColor": "#1B96FF", 16 | "color": "white", 17 | } 18 | 19 | tab_selected_style = { 20 | "borderTop": "1px solid #d6d6d6", 21 | "borderBottom": "1px solid #d6d6d6", 22 | "backgroundColor": "#0176D3", 23 | "color": "white", 24 | "padding": "6px", 25 | "fontWeight": "bold", 26 | } 27 | 28 | 29 | def create_banner(app): 30 | return html.Div( 31 | id="banner", 32 | className="banner", 33 | children=[ 34 | html.Img(src=app.get_asset_url("merlion_small.svg")), 35 | html.Plaintext(" Powered by Salesforce AI Research"), 36 | ], 37 | ) 38 | 39 | 40 | def create_layout() -> html.Div: 41 | children, values = [], [] 42 | # Data analysis tab 43 | children.append( 44 | dcc.Tab(label="File Manager", value="file-manager", style=tab_style, selected_style=tab_selected_style) 45 | ) 46 | values.append("file-manager") 47 | # Anomaly detection tab 48 | children.append( 49 | dcc.Tab(label="Anomaly Detection", value="anomaly", style=tab_style, selected_style=tab_selected_style) 50 | ) 51 | values.append("anomaly") 52 | # Forecasting tab 53 | children.append( 54 | dcc.Tab(label="Forecasting", value="forecasting", style=tab_style, selected_style=tab_selected_style) 55 | ) 56 | values.append("forecasting") 57 | 58 | layout = html.Div( 59 | id="app-content", 60 | children=[dcc.Tabs(id="tabs", value=values[0] if values else "none", children=children), html.Div(id="plots")], 61 | ) 62 | return layout 63 | -------------------------------------------------------------------------------- /merlion/dashboard/utils/log.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import logging 8 | 9 | 10 | class DashLogger(logging.StreamHandler): 11 | def __init__(self, stream=None): 12 | super().__init__(stream=stream) 13 | self.logs = list() 14 | 15 | def emit(self, record): 16 | try: 17 | msg = self.format(record) 18 | self.logs.append(msg) 19 | self.logs = self.logs[-1000:] 20 | self.flush() 21 | except Exception: 22 | self.handleError(record) 23 | -------------------------------------------------------------------------------- /merlion/dashboard/utils/plot.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import plotly 8 | import plotly.graph_objects as go 9 | from plotly.subplots import make_subplots 10 | from dash import dash_table, dcc 11 | from merlion.dashboard.settings import * 12 | 13 | 14 | def data_table(df, n=1000, page_size=10): 15 | if df is not None: 16 | df = df.head(n) 17 | columns = [{"name": "Index", "id": "Index"}] + [{"name": c, "id": c} for c in df.columns] 18 | data = [] 19 | for i in range(df.shape[0]): 20 | d = {c: v for c, v in zip(df.columns, df.values[i])} 21 | d.update({"Index": df.index[i]}) 22 | data.append(d) 23 | 24 | table = dash_table.DataTable( 25 | id="table", 26 | columns=columns, 27 | data=data, 28 | style_cell_conditional=[{"textAlign": "center"}], 29 | style_table={"overflowX": "scroll"}, 30 | editable=False, 31 | column_selectable="single", 32 | page_action="native", 33 | page_size=page_size, 34 | page_current=0, 35 | style_header=dict(backgroundColor=TABLE_HEADER_COLOR), 36 | style_data=dict(backgroundColor=TABLE_DATA_COLOR), 37 | ) 38 | return table 39 | else: 40 | return dash_table.DataTable() 41 | 42 | 43 | def plot_timeseries(ts, figure_height=500): 44 | traces = [] 45 | color_list = plotly.colors.qualitative.Dark24 46 | for i, col in enumerate(ts.columns): 47 | v = ts[col] 48 | if v.dtype in ["int", "float", "bool"]: 49 | v = v.astype(float) 50 | color = color_list[i % len(color_list)] 51 | traces.append(go.Scatter(name=col, x=v.index, y=v.values.flatten(), mode="lines", line=dict(color=color))) 52 | 53 | layout = dict( 54 | showlegend=True, 55 | xaxis=dict( 56 | title="Time", 57 | type="date", 58 | rangeselector=dict( 59 | buttons=list( 60 | [ 61 | dict(count=7, label="1w", step="day", stepmode="backward"), 62 | dict(count=1, label="1m", step="month", stepmode="backward"), 63 | dict(count=6, label="6m", step="month", stepmode="backward"), 64 | dict(count=1, label="1y", step="year", stepmode="backward"), 65 | dict(step="all"), 66 | ] 67 | ) 68 | ), 69 | ), 70 | ) 71 | fig = make_subplots(figure=go.Figure(layout=layout)) 72 | fig.update_yaxes(title_text="Time Series") 73 | for trace in traces: 74 | fig.add_trace(trace) 75 | fig.update_layout( 76 | height=figure_height, 77 | xaxis_rangeselector_font_color="white", 78 | xaxis_rangeselector_activecolor="#0176D3", 79 | xaxis_rangeselector_bgcolor="#1B96FF", 80 | xaxis_rangeselector_font_family="Salesforce Sans", 81 | ) 82 | return dcc.Graph(figure=fig) 83 | -------------------------------------------------------------------------------- /merlion/models/anomaly/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Contains all anomaly detection models. Forecaster-based anomaly detection models 9 | may be found in :py:mod:`merlion.models.anomaly.forecast_based`. Change-point detection models may be 10 | found in :py:mod:`merlion.models.anomaly.change_point`. 11 | 12 | For anomaly detection, we define an abstract `DetectorBase` class which inherits from `ModelBase` and supports the 13 | following interface, in addition to ``model.save`` and ``DetectorClass.load`` defined for `ModelBase`: 14 | 15 | 1. 
``model = DetectorClass(config)`` 16 | 17 | - initialization with a model-specific config 18 | - configs contain: 19 | 20 | - a (potentially trainable) data pre-processing transform from :py:mod:`merlion.transform`; 21 | note that ``model.transform`` is a property which refers to ``model.config.transform`` 22 | - **a (potentially trainable) post-processing rule** from :py:mod:`merlion.post_process`; 23 | note that ``model.post_rule`` is a property which refers to ``model.config.post_rule``. 24 | In general, this post-rule will have two stages: :py:mod:`calibration <merlion.post_process.calibrate>` 25 | and :py:mod:`thresholding <merlion.post_process.threshold>`. 26 | - booleans ``enable_calibrator`` and ``enable_threshold`` (both defaulting to ``True``) indicating 27 | whether to enable calibration and thresholding in the post-rule. 28 | - model-specific hyperparameters 29 | 30 | 2. ``model.get_anomaly_score(time_series, time_series_prev=None)`` 31 | 32 | - returns a time series of anomaly scores for each timestamp in ``time_series`` 33 | - ``time_series_prev`` (optional): the most recent context, only used for some models. If not provided, the 34 | training data is used as the context instead. 35 | 36 | 3. ``model.get_anomaly_label(time_series, time_series_prev=None)`` 37 | 38 | - returns a time series of post-processed anomaly scores for each timestamp in ``time_series``. These scores 39 | are calibrated to correspond to z-scores if ``enable_calibrator`` is ``True``, and they have also been filtered 40 | by a thresholding rule (``model.threshold``) if ``enable_threshold`` is ``True``. ``threshold`` is specified 41 | manually in the config (though it may be modified by `DetectorBase.train`). 42 | - ``time_series_prev`` (optional): the most recent context, only used for some models. If not provided, the 43 | training data is used as the context instead. 44 | 45 | 4. ``model.train(train_data, anomaly_labels=None, train_config=None, post_rule_train_config=None)`` 46 | 47 | - trains the model on the time series ``train_data`` 48 | - ``anomaly_labels`` (optional): a time series aligned with ``train_data``, which indicates whether each 49 | time stamp is anomalous 50 | - ``train_config`` (optional): extra configuration describing how the model should be trained. 51 | Not used for all models. Class-level default provided for models which do use it. 52 | - ``post_rule_train_config``: extra configuration describing how to train the model's post-rule. Class-level 53 | default is provided for all models. 54 | - returns a time series of anomaly scores produced by the model on ``train_data``. 55 | """ 56 | -------------------------------------------------------------------------------- /merlion/models/anomaly/change_point/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Contains all change point detection algorithms. These models implement the anomaly detector interface, but 9 | they are specialized for detecting change points in time series.
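For example, the Bayesian online change point detection model in :py:mod:`merlion.models.anomaly.change_point.bocpd`
is used through the same interface as any other detector. A minimal sketch, assuming ``train_data`` and ``test_data``
are Merlion ``TimeSeries`` and that the default hyperparameters are acceptable:

.. code-block:: python

    from merlion.models.anomaly.change_point.bocpd import BOCPD

    model = BOCPD(BOCPD.config_class())
    model.train(train_data)
    change_point_labels = model.get_anomaly_label(test_data)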
10 | """ 11 | -------------------------------------------------------------------------------- /merlion/models/anomaly/forecast_based/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Contains all forecaster-based anomaly detectors. These models support all functionality 9 | of both anomaly detectors (:py:mod:`merlion.models.anomaly`) and forecasters 10 | (:py:mod:`merlion.models.forecast`). 11 | 12 | Forecasting-based anomaly detectors are instances of an abstract `ForecastingDetectorBase` 13 | class. Many forecasting models support anomaly detection variants, where the anomaly score 14 | is based on the difference between the predicted and true time series value, and optionally 15 | the model's uncertainty in its own prediction. 16 | 17 | Note that the model will detect anomalies in only one target univariate, though the underlying 18 | forecaster may model the full multivariate time series to predict said univariate. 19 | """ 20 | -------------------------------------------------------------------------------- /merlion/models/anomaly/forecast_based/arima.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Classic ARIMA (AutoRegressive Integrated Moving Average) forecasting model, 9 | adapted for anomaly detection. 10 | """ 11 | from merlion.models.anomaly.forecast_based.base import ForecastingDetectorBase 12 | from merlion.models.anomaly.base import DetectorConfig 13 | from merlion.models.forecast.arima import ArimaConfig, Arima 14 | from merlion.post_process.threshold import AggregateAlarms 15 | 16 | 17 | class ArimaDetectorConfig(ArimaConfig, DetectorConfig): 18 | _default_threshold = AggregateAlarms(alm_threshold=2.5) 19 | 20 | 21 | class ArimaDetector(ForecastingDetectorBase, Arima): 22 | config_class = ArimaDetectorConfig 23 | -------------------------------------------------------------------------------- /merlion/models/anomaly/forecast_based/ets.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | ETS (error, trend, seasonal) forecasting model, adapted for anomaly detection. 9 | """ 10 | from merlion.models.anomaly.base import NoCalibrationDetectorConfig 11 | from merlion.models.anomaly.forecast_based.base import ForecastingDetectorBase 12 | from merlion.models.forecast.ets import ETSConfig, ETS 13 | from merlion.post_process.threshold import AggregateAlarms 14 | 15 | 16 | class ETSDetectorConfig(ETSConfig, NoCalibrationDetectorConfig): 17 | # Because the errors & residuals returned by ETS.train() are not 18 | # representative of the test-time errors & residuals, ETSDetector inherits 19 | # from NoCalibrationDetectorConfig and uses the model-predicted z-scores 20 | # directly as anomaly scores. 
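# (With calibration disabled, the default threshold below is applied to these raw z-scores.)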
21 | _default_threshold = AggregateAlarms(alm_threshold=3.0) 22 | 23 | 24 | class ETSDetector(ForecastingDetectorBase, ETS): 25 | config_class = ETSDetectorConfig 26 | -------------------------------------------------------------------------------- /merlion/models/anomaly/forecast_based/mses.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | MSES (Multi-Scale Exponential Smoother) forecasting model adapted for anomaly detection. 9 | """ 10 | import pandas as pd 11 | 12 | from merlion.models.anomaly.forecast_based.base import ForecastingDetectorBase 13 | from merlion.models.anomaly.base import DetectorConfig 14 | from merlion.models.forecast.smoother import MSESConfig, MSES, MSESTrainConfig 15 | from merlion.post_process.threshold import AggregateAlarms 16 | from merlion.utils.time_series import TimeSeries 17 | 18 | 19 | class MSESDetectorConfig(MSESConfig, DetectorConfig): 20 | """ 21 | Configuration class for an MSES forecasting model adapted for anomaly detection. 22 | """ 23 | 24 | _default_threshold = AggregateAlarms(alm_threshold=2) 25 | 26 | def __init__(self, max_forecast_steps: int, online_updates: bool = True, **kwargs): 27 | super().__init__(max_forecast_steps=max_forecast_steps, **kwargs) 28 | self.online_updates = online_updates 29 | 30 | 31 | class MSESDetector(ForecastingDetectorBase, MSES): 32 | config_class = MSESDetectorConfig 33 | 34 | @property 35 | def online_updates(self): 36 | return self.config.online_updates 37 | 38 | @property 39 | def _default_train_config(self): 40 | return MSESTrainConfig(train_cadence=1 if self.online_updates else None) 41 | 42 | def get_anomaly_score( 43 | self, time_series: TimeSeries, time_series_prev: TimeSeries = None, exog_data=None 44 | ) -> TimeSeries: 45 | if self.online_updates: 46 | time_series, time_series_prev = self.transform_time_series(time_series, time_series_prev) 47 | if time_series_prev is None: 48 | full_ts = time_series 49 | else: 50 | full_ts = time_series_prev + time_series 51 | forecast, err = self.update(full_ts.to_pd(), train_cadence=pd.to_timedelta(0)) 52 | forecast, err = [x.bisect(time_series.t0, t_in_left=False)[1] for x in [forecast, err]] 53 | return TimeSeries.from_pd(self.forecast_to_anom_score(time_series, forecast, err)) 54 | else: 55 | return super().get_anomaly_score(time_series, time_series_prev) 56 | -------------------------------------------------------------------------------- /merlion/models/anomaly/forecast_based/prophet.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Adaptation of Facebook's Prophet forecasting model to anomaly detection. 
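As with the other forecast-based detectors, anomaly scores are derived from the discrepancy between Prophet's
forecast and the observed values.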
9 | """ 10 | 11 | from merlion.models.anomaly.forecast_based.base import ForecastingDetectorBase 12 | from merlion.models.anomaly.base import DetectorConfig 13 | from merlion.models.forecast.prophet import ProphetConfig, Prophet 14 | from merlion.post_process.threshold import AggregateAlarms 15 | 16 | 17 | class ProphetDetectorConfig(ProphetConfig, DetectorConfig): 18 | _default_threshold = AggregateAlarms(alm_threshold=3) 19 | 20 | 21 | class ProphetDetector(ForecastingDetectorBase, Prophet): 22 | config_class = ProphetDetectorConfig 23 | -------------------------------------------------------------------------------- /merlion/models/anomaly/forecast_based/sarima.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Seasonal ARIMA (SARIMA) forecasting model, adapted for anomaly detection. 9 | """ 10 | from merlion.models.anomaly.base import DetectorConfig 11 | from merlion.models.anomaly.forecast_based.base import ForecastingDetectorBase 12 | from merlion.models.forecast.sarima import SarimaConfig, Sarima 13 | from merlion.post_process.threshold import AggregateAlarms 14 | 15 | 16 | class SarimaDetectorConfig(SarimaConfig, DetectorConfig): 17 | _default_threshold = AggregateAlarms(alm_threshold=2.5) 18 | 19 | 20 | class SarimaDetector(ForecastingDetectorBase, Sarima): 21 | config_class = SarimaDetectorConfig 22 | -------------------------------------------------------------------------------- /merlion/models/anomaly/isolation_forest.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | The classic isolation forest model for anomaly detection. 9 | """ 10 | import logging 11 | 12 | import numpy as np 13 | import pandas as pd 14 | from sklearn.ensemble import IsolationForest as skl_IsolationForest 15 | 16 | from merlion.models.anomaly.base import DetectorConfig, DetectorBase 17 | from merlion.transform.moving_average import DifferenceTransform 18 | from merlion.transform.sequence import TransformSequence 19 | from merlion.transform.resample import Shingle 20 | from merlion.utils import UnivariateTimeSeries, TimeSeries 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class IsolationForestConfig(DetectorConfig): 26 | """ 27 | Configuration class for `IsolationForest`. 28 | """ 29 | 30 | _default_transform = TransformSequence([DifferenceTransform(), Shingle(size=2, stride=1)]) 31 | 32 | def __init__(self, max_n_samples: int = None, n_estimators: int = 100, n_jobs=-1, **kwargs): 33 | """ 34 | :param max_n_samples: Maximum number of samples to allow the isolation 35 | forest to train on. Specify ``None`` to use all samples in the 36 | training data. 37 | :param n_estimators: number of trees in the isolation forest. 
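:param n_jobs: the number of parallel jobs sklearn may use to fit the forest; ``-1`` uses all available processors.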
38 | """ 39 | self.max_n_samples = 1.0 if max_n_samples is None else max_n_samples 40 | self.n_estimators = n_estimators 41 | self.n_jobs = n_jobs 42 | # Isolation forest's uncalibrated scores are between 0 and 1 43 | kwargs["max_score"] = 1.0 44 | super().__init__(**kwargs) 45 | 46 | 47 | class IsolationForest(DetectorBase): 48 | """ 49 | The classic isolation forest algorithm, proposed in 50 | `Liu et al. 2008 <https://ieeexplore.ieee.org/document/4781136>`_ 51 | """ 52 | 53 | config_class = IsolationForestConfig 54 | 55 | def __init__(self, config: IsolationForestConfig): 56 | super().__init__(config) 57 | self.model = skl_IsolationForest( 58 | max_samples=config.max_n_samples, n_estimators=config.n_estimators, random_state=0, n_jobs=config.n_jobs 59 | ) 60 | 61 | @property 62 | def require_even_sampling(self) -> bool: 63 | return False 64 | 65 | @property 66 | def require_univariate(self) -> bool: 67 | return False 68 | 69 | def _train(self, train_data: pd.DataFrame, train_config=None) -> pd.DataFrame: 70 | times, train_values = train_data.index, train_data.values 71 | self.model.fit(train_values) 72 | train_scores = -self.model.score_samples(train_values) 73 | return pd.DataFrame(train_scores, index=times, columns=["anom_score"]) 74 | 75 | def _get_anomaly_score(self, time_series: pd.DataFrame, time_series_prev: pd.DataFrame = None) -> pd.DataFrame: 76 | # Return the negative of model's score, since model scores are in [-1, 0), where more negative = more anomalous 77 | scores = -self.model.score_samples(np.array(time_series.values)) 78 | return pd.DataFrame(scores, index=time_series.index) 79 | -------------------------------------------------------------------------------- /merlion/models/anomaly/stat_threshold.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Simple static thresholding model for anomaly detection. 9 | """ 10 | import pandas as pd 11 | 12 | from merlion.models.base import NormalizingConfig 13 | from merlion.models.anomaly.base import DetectorConfig, DetectorBase 14 | from merlion.transform.moving_average import DifferenceTransform 15 | 16 | 17 | class StatThresholdConfig(DetectorConfig, NormalizingConfig): 18 | """ 19 | Config class for `StatThreshold`. 20 | """ 21 | 22 | def __init__(self, target_seq_index: int = None, **kwargs): 23 | """ 24 | :param target_seq_index: (optional) The index of the univariate whose value we are considering thresholds of. 25 | If not provided, the model only works for univariate data. 26 | """ 27 | super().__init__(**kwargs) 28 | self.target_seq_index = target_seq_index 29 | 30 | 31 | class StatThreshold(DetectorBase): 32 | """ 33 | Anomaly detection based on a static threshold.
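The raw anomaly score is simply the (normalized) value of the target univariate, so it is the post-rule's
calibrator and threshold which actually flag unusually high or low values.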
34 | """ 35 | 36 | config_class = StatThresholdConfig 37 | 38 | @property 39 | def require_even_sampling(self) -> bool: 40 | return False 41 | 42 | @property 43 | def require_univariate(self) -> bool: 44 | return self.config.target_seq_index is None 45 | 46 | def _train(self, train_data: pd.DataFrame, train_config=None) -> pd.DataFrame: 47 | return pd.DataFrame(train_data.iloc[:, self.config.target_seq_index or 0]) 48 | 49 | def _get_anomaly_score(self, time_series: pd.DataFrame, time_series_prev: pd.DataFrame = None) -> pd.DataFrame: 50 | return pd.DataFrame(time_series.iloc[:, self.config.target_seq_index or 0]) 51 | -------------------------------------------------------------------------------- /merlion/models/automl/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Contains all AutoML model variants & some utilities. 9 | """ 10 | -------------------------------------------------------------------------------- /merlion/models/automl/search.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Abstractions for hyperparameter search. 9 | """ 10 | from collections import OrderedDict 11 | import itertools 12 | from typing import Any, Dict, List 13 | 14 | 15 | class GridSearch: 16 | """ 17 | Iterator over a grid of parameter values, skipping any restricted combinations of values. 18 | """ 19 | 20 | def __init__(self, param_values: Dict[str, List], restrictions: List[Dict[str, Any]] = None): 21 | """ 22 | :param param_values: a dict mapping a set of parameter names to lists of values they can take on. 23 | :param restrictions: a list of dicts indicating inadmissible combinations of parameter values. 24 | For example, an ETS model has parameters error (add/mul), trend (add/mul/none), seasonal (add/mul), 25 | and damped_trend (True/False). If we are only considering additive models, we would impose the restrictions 26 | ``[{"error": "mul"}, {"trend": "mul"}, {"seasonal": "mul"}]``. Since a damped trend is only possible if 27 | the model has a trend, we would add the restriction ``{"trend": None, "damped_trend": True}``. 28 | """ 29 | self.param_values = param_values 30 | self.restrictions = [] if restrictions is None else restrictions 31 | 32 | def __iter__(self): 33 | for val_tuples in itertools.product(*(itertools.product([k], v) for k, v in self.param_values.items())): 34 | val_dict = OrderedDict(val_tuples) 35 | if not any(all(k in val_dict and val_dict[k] == v for k, v in r.items()) for r in self.restrictions): 36 | yield val_dict 37 | -------------------------------------------------------------------------------- /merlion/models/ensemble/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Ensembles of models and automated model selection. 9 | """ 10 | -------------------------------------------------------------------------------- /merlion/models/forecast/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Contains all forecasting models, including those which support 9 | `exogenous regressors <tutorials/forecast/3_ForecastExogenous>`. 10 | 11 | For forecasting, we define an abstract base `ForecasterBase` class which inherits from `ModelBase` and supports the 12 | following interface, in addition to ``model.save()`` and ``ForecasterClass.load`` defined for ``ModelBase``: 13 | 14 | 1. ``model = ForecasterClass(config)`` 15 | 16 | - initialization with a model-specific config (which inherits from `ForecasterConfig`) 17 | - configs contain: 18 | 19 | - a (potentially trainable) data pre-processing transform from :py:mod:`merlion.transform`; 20 | note that ``model.transform`` is a property which refers to ``model.config.transform`` 21 | - model-specific hyperparameters 22 | - **optionally, a maximum number of steps the model can forecast for** 23 | 24 | 2. ``model.forecast(time_stamps, time_series_prev=None)`` 25 | 26 | - returns the forecast (`TimeSeries`) for future values at the time stamps specified by ``time_stamps``, 27 | as well as the standard error of that forecast (`TimeSeries`, may be ``None``) 28 | - if ``time_series_prev`` is specified, it is used as the most recent context. Otherwise, the training data is used. 29 | 30 | 3. ``model.train(train_data, train_config=None)`` 31 | 32 | - trains the model on the `TimeSeries` ``train_data`` 33 | - ``train_config`` (optional): extra configuration describing how the model should be trained. 34 | Not used by all models. A class-level default is provided for models which do use it. 35 | - returns the model's prediction of ``train_data``, in the same format as if you called `ForecasterBase.forecast` 36 | on the time stamps of ``train_data`` 37 | """ 38 | -------------------------------------------------------------------------------- /merlion/models/forecast/arima.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | The classic statistical forecasting model ARIMA (AutoRegressive Integrated 9 | Moving Average). 10 | """ 11 | import logging 12 | from typing import Tuple 13 | 14 | from merlion.models.forecast.sarima import SarimaConfig, Sarima 15 | from merlion.transform.resample import TemporalResample 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class ArimaConfig(SarimaConfig): 21 | """ 22 | Configuration class for `Arima`. Just a `Sarima` model with seasonal order ``(0, 0, 0, 0)``.
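    A minimal configuration sketch (the hyperparameter values are illustrative, and ``train_data`` /
    ``time_stamps`` are hypothetical placeholders):

    .. code-block:: python

        from merlion.models.forecast.arima import Arima, ArimaConfig

        # order = (p, d, q): AR lags, degree of differencing, and MA lags
        model = Arima(ArimaConfig(order=(4, 1, 2), max_forecast_steps=100))
        model.train(train_data)                         # train_data: a merlion TimeSeries
        forecast, stderr = model.forecast(time_stamps)  # point forecast + standard error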
23 | """ 24 | 25 | _default_transform = TemporalResample(granularity=None, trainable_granularity=True) 26 | 27 | def __init__(self, order=(4, 1, 2), seasonal_order=(0, 0, 0, 0), **kwargs): 28 | """ 29 | :param seasonal_order: (0, 0, 0, 0) because ARIMA has no seasonal order. 30 | """ 31 | super().__init__(order=order, seasonal_order=seasonal_order, **kwargs) 32 | 33 | @property 34 | def seasonal_order(self) -> Tuple[int, int, int, int]: 35 | """ 36 | :return: (0, 0, 0, 0) because ARIMA has no seasonal order. 37 | """ 38 | return 0, 0, 0, 0 39 | 40 | @seasonal_order.setter 41 | def seasonal_order(self, seasonal_order: Tuple[int, int, int, int]): 42 | assert tuple(seasonal_order) == (0, 0, 0, 0), "Seasonal order must be (0, 0, 0, 0) for ARIMA" 43 | 44 | 45 | class Arima(Sarima): 46 | """ 47 | Implementation of the classic statistical model ARIMA (AutoRegressive Integrated Moving Average) for forecasting. 48 | """ 49 | 50 | config_class = ArimaConfig 51 | -------------------------------------------------------------------------------- /merlion/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Contains various utility files & functions useful for different models. 9 | """ 10 | -------------------------------------------------------------------------------- /merlion/models/utils/early_stopping.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Earlying Stopping 9 | """ 10 | import logging 11 | 12 | try: 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | except ImportError as e: 17 | err = ( 18 | "Try installing Merlion with optional dependencies using `pip install salesforce-merlion[deep-learning]` or " 19 | "`pip install `salesforce-merlion[all]`" 20 | ) 21 | raise ImportError(str(e) + ". " + err) 22 | 23 | import numpy as np 24 | 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | class EarlyStopping: 30 | """ 31 | Early stopping for deep model training 32 | """ 33 | 34 | def __init__(self, patience=7, delta=0): 35 | """ 36 | :param patience: Number of epochs with no improvement after which training will be stopped. 37 | :param delta: Minimum change in the monitored quantity to qualify as an improvement, 38 | i.e. an absolute change of less than min_delta, will count as no improvement. 
39 | """ 40 | 41 | self.patience = patience 42 | self.counter = 0 43 | self.best_score = None 44 | self.early_stop = False 45 | self.val_loss_min = np.Inf 46 | self.delta = delta 47 | self.best_model_state_dict = None 48 | 49 | def __call__(self, val_loss, model): 50 | score = -val_loss 51 | if self.best_score is None: 52 | self.best_score = score 53 | self.save_best_state_and_dict(val_loss, model) 54 | elif score < self.best_score + self.delta: 55 | self.counter += 1 56 | logger.info(f"EarlyStopping counter: {self.counter} out of {self.patience}") 57 | if self.counter >= self.patience: 58 | self.early_stop = True 59 | else: 60 | self.best_score = score 61 | self.save_best_state_and_dict(val_loss, model) 62 | self.counter = 0 63 | 64 | def save_best_state_and_dict(self, val_loss, model): 65 | self.best_model_state_dict = model.state_dict() 66 | 67 | self.val_loss_min = val_loss 68 | 69 | def load_best_model(self, model): 70 | model.load_state_dict(self.best_model_state_dict) 71 | -------------------------------------------------------------------------------- /merlion/models/utils/nn_modules/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | from .blocks import ( 8 | AutoCorrelation, 9 | SeasonalLayernorm, 10 | SeriesDecomposeBlock, 11 | MovingAverageBlock, 12 | FullAttention, 13 | ProbAttention, 14 | ) 15 | from .layers import AutoCorrelationLayer, ConvLayer, AttentionLayer 16 | 17 | from .embed import DataEmbedding, DataEmbeddingWoPos, ETSEmbedding 18 | -------------------------------------------------------------------------------- /merlion/post_process/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Base class for post-processing rules in Merlion. 9 | """ 10 | from abc import abstractmethod 11 | from copy import copy, deepcopy 12 | import inspect 13 | 14 | from merlion.utils import TimeSeries 15 | from merlion.utils.misc import AutodocABCMeta 16 | 17 | 18 | class PostRuleBase(metaclass=AutodocABCMeta): 19 | """ 20 | Base class for post-processing rules in Merlion. These objects are primarily 21 | for post-processing the sequence of anomaly scores returned by anomaly detection 22 | models. All post-rules are callable objects, and they have a ``train()`` method 23 | which may accept additional implementation-specific keyword arguments. 
24 | """ 25 | 26 | def to_dict(self): 27 | params = inspect.signature(self.__init__).parameters 28 | d = {k: deepcopy(getattr(self, k)) for k in params} 29 | d["name"] = type(self).__name__ 30 | return d 31 | 32 | @classmethod 33 | def from_dict(cls, state_dict): 34 | state_dict = copy(state_dict) 35 | state_dict.pop("name", None) 36 | return cls(**state_dict) 37 | 38 | def __copy__(self): 39 | return self.from_dict(self.to_dict()) 40 | 41 | def __deepcopy__(self, memodict={}): 42 | return self.__copy__() 43 | 44 | def __repr__(self): 45 | kwargs = self.to_dict() 46 | name = kwargs.pop("name") 47 | kwargs_str = ", ".join(f"{k}={v}" for k, v in sorted(kwargs.items())) 48 | return f"{name}({kwargs_str})" 49 | 50 | @abstractmethod 51 | def train(self, anomaly_scores: TimeSeries): 52 | raise NotImplementedError 53 | 54 | @abstractmethod 55 | def __call__(self, anomaly_scores: TimeSeries): 56 | raise NotImplementedError 57 | -------------------------------------------------------------------------------- /merlion/post_process/factory.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Contains the `PostRuleFactory`. 9 | """ 10 | from typing import Type 11 | from merlion.post_process.base import PostRuleBase 12 | from merlion.utils import dynamic_import 13 | 14 | import_alias = dict( 15 | Threshold="merlion.post_process.threshold:Threshold", 16 | AggregateAlarms="merlion.post_process.threshold:AggregateAlarms", 17 | AdaptiveThreshold="merlion.post_process.threshold:AdaptiveThreshold", 18 | AdaptiveAggregateAlarms="merlion.post_process.threshold:AdaptiveAggregateAlarms", 19 | AnomScoreCalibrator="merlion.post_process.calibrate:AnomScoreCalibrator", 20 | PostRuleSequence="merlion.post_process.sequence:PostRuleSequence", 21 | ) 22 | 23 | 24 | class PostRuleFactory(object): 25 | @classmethod 26 | def get_post_rule_class(cls, name: str) -> Type[PostRuleBase]: 27 | return dynamic_import(name, import_alias) 28 | 29 | @classmethod 30 | def create(cls, name: str, **kwargs) -> PostRuleBase: 31 | """ 32 | Uses the given ``kwargs`` to create a post-rule of the given name 33 | """ 34 | post_rule_class = cls.get_post_rule_class(name) 35 | return post_rule_class.from_dict(kwargs) 36 | -------------------------------------------------------------------------------- /merlion/post_process/sequence.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Class to compose a sequence of post-rules into a single post-rule. 
9 | """ 10 | import inspect 11 | from typing import Iterable 12 | 13 | from merlion.post_process.base import PostRuleBase 14 | from merlion.post_process.factory import PostRuleFactory 15 | from merlion.utils import TimeSeries 16 | 17 | 18 | class PostRuleSequence(PostRuleBase): 19 | def __init__(self, post_rules: Iterable): 20 | self.post_rules = list(post_rules) 21 | 22 | def train(self, anomaly_scores: TimeSeries, **kwargs) -> TimeSeries: 23 | for post_rule in self.post_rules: 24 | params = inspect.signature(post_rule.train).parameters 25 | if not any(v.kind.name == "VAR_KEYWORD" for v in params.values()): 26 | local_kwargs = {k: v for k, v in kwargs.items() if k in params} 27 | anomaly_scores = post_rule.train(anomaly_scores, **local_kwargs) 28 | return anomaly_scores 29 | 30 | def __call__(self, anomaly_scores: TimeSeries) -> TimeSeries: 31 | for post_rule in self.post_rules: 32 | anomaly_scores = post_rule(anomaly_scores) 33 | return anomaly_scores 34 | 35 | def to_dict(self): 36 | return {"name": type(self).__name__, "post_rules": [p.to_dict() for p in self.post_rules]} 37 | 38 | @classmethod 39 | def from_dict(cls, state_dict): 40 | post_rules = [ 41 | d if isinstance(d, PostRuleBase) else PostRuleFactory.create(**d) for d in state_dict["post_rules"] 42 | ] 43 | return cls(post_rules) 44 | 45 | def __repr__(self): 46 | return "PostRuleSequence(\n " + ",\n ".join([repr(f) for f in self.post_rules]) + "\n)" 47 | -------------------------------------------------------------------------------- /merlion/resources/gson-2.8.9.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/merlion/resources/gson-2.8.9.jar -------------------------------------------------------------------------------- /merlion/resources/randomcutforest-core-1.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/merlion/resources/randomcutforest-core-1.0.jar -------------------------------------------------------------------------------- /merlion/resources/randomcutforest-serialization-json-1.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/Merlion/085ef8a69e5dcdfb9dcaa394cc21e087cccbb8f0/merlion/resources/randomcutforest-serialization-json-1.0.jar -------------------------------------------------------------------------------- /merlion/transform/bound.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Transforms that clip the input. 9 | """ 10 | 11 | from collections import OrderedDict 12 | import logging 13 | import numpy as np 14 | 15 | from merlion.transform.base import TransformBase 16 | from merlion.utils import UnivariateTimeSeries, TimeSeries 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class LowerUpperClip(TransformBase): 22 | """ 23 | Clips the values of a time series to lie between lower and upper. 
24 | """ 25 | 26 | def __init__(self, lower=None, upper=None): 27 | super().__init__() 28 | assert not (lower is None and upper is None), "Must provide at least one of lower or upper" 29 | if lower is not None and upper is not None: 30 | assert lower < upper 31 | self.lower = lower 32 | self.upper = upper 33 | 34 | @property 35 | def requires_inversion_state(self): 36 | """ 37 | ``False`` because "inverting" value clipping is stateless. 38 | """ 39 | return False 40 | 41 | def train(self, time_series: TimeSeries): 42 | pass 43 | 44 | def __call__(self, time_series: TimeSeries) -> TimeSeries: 45 | new_vars = OrderedDict() 46 | for name, var in time_series.items(): 47 | x = np.clip(var.np_values, self.lower, self.upper) 48 | new_vars[name] = UnivariateTimeSeries(var.index, x) 49 | 50 | return TimeSeries(new_vars) 51 | -------------------------------------------------------------------------------- /merlion/transform/factory.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Contains the `TransformFactory` for instantiating transforms. 9 | """ 10 | 11 | from typing import Type 12 | from merlion.transform.base import TransformBase 13 | from merlion.utils import dynamic_import 14 | 15 | 16 | import_alias = dict( 17 | Identity="merlion.transform.base:Identity", 18 | MovingAverage="merlion.transform.moving_average:MovingAverage", 19 | ExponentialMovingAverage="merlion.transform.moving_average:ExponentialMovingAverage", 20 | DifferenceTransform="merlion.transform.moving_average:DifferenceTransform", 21 | LagTransform="merlion.transform.moving_average:LagTransform", 22 | LowerUpperClip="merlion.transform.bound:LowerUpperClip", 23 | Rescale="merlion.transform.normalize:Rescale", 24 | AbsVal="merlion.transform.normalize:AbsVal", 25 | MeanVarNormalize="merlion.transform.normalize:MeanVarNormalize", 26 | MinMaxNormalize="merlion.transform.normalize:MinMaxNormalize", 27 | BoxCoxTransform="merlion.transform.normalize:BoxCoxTransform", 28 | TemporalResample="merlion.transform.resample:TemporalResample", 29 | Shingle="merlion.transform.resample:Shingle", 30 | TransformSequence="merlion.transform.sequence:TransformSequence", 31 | TransformStack="merlion.transform.sequence:TransformStack", 32 | InvertibleTransformSequence="merlion.transform.sequence:InvertibleTransformSequence", 33 | ) 34 | 35 | 36 | class TransformFactory(object): 37 | @classmethod 38 | def get_transform_class(cls, name: str) -> Type[TransformBase]: 39 | return dynamic_import(name, import_alias) 40 | 41 | @classmethod 42 | def create(cls, name: str, **kwargs) -> TransformBase: 43 | transform_class = cls.get_transform_class(name) 44 | return transform_class.from_dict(kwargs) 45 | -------------------------------------------------------------------------------- /merlion/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | from merlion.utils.misc import dynamic_import 8 | from merlion.utils.resample import to_pd_datetime, to_timestamp 9 | from merlion.utils.time_series import UnivariateTimeSeries, TimeSeries 10 | -------------------------------------------------------------------------------- /merlion_logo.svg: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="utf-8"?> 2 | <svg viewBox="0 0 475 125" xmlns="http://www.w3.org/2000/svg"> 3 | <defs> 4 | <mask id="merlion"> 5 | <circle style="fill: rgb(255, 255, 255); fill-rule: evenodd;" cx="1059.161" cy="-552.178" r="436.487"/> 6 | <path d="M 1065.97 743.828 C 1065.97 743.828 1074.82 756.641 1104.97 742.152 C 1104.97 742.152 1104.79 726.934 1076.8 726.191 C 1048.82 725.449 1060.55 729.531 1060.55 729.531 L 1065.97 743.828 Z M 1019.692 105.788 C 1186.644 50.493 1280.918 121.345 1361.865 159.466 C 1413.367 183.72 1460.719 212.499 1408.14 290.586 C 1403.2 325.277 1370.85 326.828 1350.36 434.133 C 1364.9 434.266 1375.96 433.473 1392.43 432.742 C 1390.98 437.477 1385.78 445.145 1380.83 453.422 L 1380.14 453.043 C 1367.93 475.609 1359.83 506.273 1354.92 530.113 L 1354.86 530.352 L 1354.5 532.121 L 1354.48 532.223 C 1350.48 551.68 1348.61 566.539 1348.38 568.461 L 1348.37 568.465 L 1348.36 568.617 L 1348.35 568.672 C 1347.4 575.406 1346.8 581.598 1346.46 586.926 L 1379.69 579.527 C 1351.5 668.84 1366.73 719.148 1303.68 775.535 L 1310.32 803.211 C 1268.22 798.031 1216.68 835.66 1141.58 820.383 C 1115.8 806.504 1111.68 815.742 1081.47 781.215 C 1061.97 758.918 1038.42 764.828 1031.22 731.887 C 999.563 730.07 962.398 727.082 964.949 691.66 C 965.844 679.148 967.336 648.309 971.137 640.098 C 979.477 622.129 989.816 630.41 1007.63 624.125 C 1010.59 627.766 1013.58 630.992 1016.61 633.848 L 1024.87 620.465 C 1024.87 620.465 1028.61 635.293 1040.57 647.945 L 1041.32 649.398 C 1093.13 667.285 1134.97 596.297 1062.68 588.473 L 1045.56 588.574 C 1035.25 596.699 1032.21 608.77 1032.21 608.77 L 1026.25 599.117 L 1006.96 588.813 C 1006.71 567.852 999.313 563.59 1003.22 542.496 C 1019.03 535.438 1036.63 534.711 1058.25 534.066 C 974.945 346.973 912.406 521.961 1006.97 322.227 C 1041.41 246.652 1037.3 179.41 1024.37 124.531 M 962.121 595.965 C 962.121 595.965 749.273 571.44 619.827 426.444 C 640.883 383.102 637.567 365.548 665.799 354.572 C 788.567 533.219 960.399 597.222 956.138 594.244" style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none'" transform="matrix(1, 0, 0, -1, 0, 0)"/> 7 | </mask> 8 | </defs> 9 | <g id="g10" transform="matrix(0.13, 0, 0, 0.13, -80.57, 135.32)"> 10 | <path d="m 2274.48,441.102 -0.96,337.796 h -64.27 L 2084.84,568.984 1958.48,778.898 h -64.66 V 441.102 h 73.35 v 198.336 l 98.85,-162.625 h 35.22 l 99.33,166.964 0.48,-202.675 z M 4122.79,778.898 V 573.805 l -167.93,205.093 h -64.67 V 441.102 h 77.21 V 646.191 L 4135.82,441.102 H 4200 V 778.898 Z M 3646.5,435.309 c 106.16,0 184.82,73.832 184.82,174.691 0,100.859 -78.66,174.688 -184.82,174.688 -106.65,0 -184.83,-74.313 -184.83,-174.688 0,-100.375 78.18,-174.691 184.83,-174.691 z m 0,66.593 c -60.32,0 -105.68,43.914 -105.68,108.098 0,64.18 45.36,108.094 105.68,108.094 60.32,0 105.68,-43.914 105.68,-108.094 0,-64.184 -45.36,-108.098 -105.68,-108.098 z m -321.87,-60.8 h 78.17 v 337.796 h -78.17 z m -291.47,0 h 247.55 v 63.699 h -169.38 v 274.097 h -78.17 z m -55.5,0 
-75.76,108.574 c 43.92,18.82 69.49,55.98 69.49,106.656 0,75.754 -56.46,122.566 -146.7,122.566 H 2678.47 V 441.102 h 78.18 v 94.097 h 68.04 3.86 l 65.15,-94.097 z m -85.41,215.23 c 0,-36.684 -24.13,-58.879 -71.9,-58.879 h -63.7 v 117.746 h 63.7 c 47.77,0 71.9,-21.715 71.9,-58.867 z m -459.89,-152.5 v 78.18 h 156.84 v 60.801 h -156.84 v 73.351 h 177.59 v 62.734 H 2354.67 V 441.102 h 261.55 v 62.73 h -183.86" style="fill:#02144a;fill-opacity:1;fill-rule:evenodd;stroke:#ffffff;stroke-opacity:1;stroke-width:4;shape-rendering='geometricPrecision'" id="path14" transform="matrix(1, 0, 0, -1, -160, 35)"/> 11 | <circle style="fill: rgb(0, 161, 224); fill-rule: evenodd;" cx="1059.161" cy="-552.178" r="436.487" mask="url(#merlion)"/> 12 | </g> 13 | </svg> -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | log_format = %(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s 3 | log_date_format = %Y-%m-%d %H:%M:%S 4 | log_cli=true 5 | log_cli_level=INFO 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | from setuptools import setup, find_namespace_packages 8 | 9 | MERLION_JARS = [ 10 | "resources/gson-2.8.9.jar", 11 | "resources/randomcutforest-core-1.0.jar", 12 | "resources/randomcutforest-serialization-json-1.0.jar", 13 | ] 14 | 15 | MERLION_DASHBOARD_ASSETS = [ 16 | "dashboard/assets/fonts/SalesforceSans-Bold.woff", 17 | "dashboard/assets/fonts/SalesforceSans-BoldItalic.woff", 18 | "dashboard/assets/fonts/SalesforceSans-Italic.woff", 19 | "dashboard/assets/fonts/SalesforceSans-Light.woff", 20 | "dashboard/assets/fonts/SalesforceSans-LightItalic.woff", 21 | "dashboard/assets/fonts/SalesforceSans-Regular.woff", 22 | "dashboard/assets/fonts/SalesforceSans-Thin.woff", 23 | "dashboard/assets/fonts/SalesforceSans-ThinItalic.woff", 24 | "dashboard/assets/Acumin-BdPro.otf", 25 | "dashboard/assets/base.css", 26 | "dashboard/assets/merlion.css", 27 | "dashboard/assets/merlion_small.svg", 28 | "dashboard/assets/modal.css", 29 | "dashboard/assets/resizing.js", 30 | "dashboard/assets/styles.css", 31 | "dashboard/assets/upload.svg", 32 | ] 33 | 34 | # optional dependencies 35 | extra_require = { 36 | "dashboard": ["dash[diskcache]>=2.4", "dash_bootstrap_components>=1.0", "diskcache"], 37 | "deep-learning": ["torch>=1.9.0", "einops>=0.4.0"], 38 | "spark": ["pyspark[sql]>=3"], 39 | } 40 | extra_require["all"] = sum(extra_require.values(), []) 41 | 42 | 43 | def read_file(fname): 44 | with open(fname, "r", encoding="utf-8") as f: 45 | return f.read() 46 | 47 | 48 | setup( 49 | name="salesforce-merlion", 50 | version="2.0.2", 51 | author=", ".join(read_file("AUTHORS.md").split("\n")), 52 | author_email="abhatnagar@salesforce.com", 53 | description="Merlion: A Machine Learning Framework for Time Series Intelligence", 54 | long_description=read_file("README.md"), 55 | long_description_content_type="text/markdown", 56 | keywords="time series, forecasting, anomaly detection, machine learning, autoML, " 57 | "ensemble learning, benchmarking, Python, scientific toolkit", 58 | 
url="https://github.com/salesforce/Merlion", 59 | license="3-Clause BSD", 60 | packages=find_namespace_packages(include="merlion.*"), 61 | package_dir={"merlion": "merlion"}, 62 | package_data={"merlion": MERLION_JARS + MERLION_DASHBOARD_ASSETS}, 63 | install_requires=[ 64 | "cython", 65 | "dill", 66 | "GitPython", 67 | "py4j", 68 | "matplotlib", 69 | "plotly>=4.13", 70 | "numpy>=1.21,<2.0", # 1.21 remediates a security risk 71 | "packaging", 72 | "pandas>=1.1.0", # >=1.1.0 for origin kwarg to df.resample() 73 | "prophet>=1.1", # 1.1 removes dependency on pystan 74 | "scikit-learn>=0.22", # >=0.22 for changes to isolation forest algorithm 75 | "scipy>=1.6.0", # 1.6.0 adds multivariate_t density to scipy.stats 76 | "statsmodels>=0.12.2", 77 | "lightgbm", # if running at MacOS, need OpenMP: "brew install libomp" 78 | "tqdm", 79 | ], 80 | extras_require=extra_require, 81 | python_requires=">=3.7.0", 82 | zip_safe=False, 83 | ) 84 | -------------------------------------------------------------------------------- /tests/anomaly/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | -------------------------------------------------------------------------------- /tests/anomaly/forecast_based/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | -------------------------------------------------------------------------------- /tests/anomaly/multivariate/test_autoencoder.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import sys 8 | import logging 9 | import unittest 10 | import torch 11 | import random 12 | import numpy as np 13 | import pandas as pd 14 | from os.path import abspath, dirname, join 15 | from merlion.utils import TimeSeries 16 | from ts_datasets.anomaly import * 17 | from merlion.models.anomaly.autoencoder import AutoEncoder 18 | 19 | rootdir = dirname(dirname(dirname(dirname(abspath(__file__))))) 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def set_random_seeds(): 24 | torch.manual_seed(12345) 25 | random.seed(12345) 26 | np.random.seed(12345) 27 | 28 | 29 | def get_train_test_splits(df: pd.DataFrame, metadata: pd.DataFrame, n: int) -> (pd.DataFrame, pd.DataFrame, np.ndarray): 30 | train_df = df[metadata.trainval] 31 | test_df = df[~metadata.trainval] 32 | test_labels = metadata[~metadata.trainval].anomaly.values 33 | return train_df.tail(n), test_df.head(n), test_labels[:n] 34 | 35 | 36 | class TestAutoEncoder(unittest.TestCase): 37 | def __init__(self, *args, **kwargs): 38 | super().__init__(*args, **kwargs) 39 | set_random_seeds() 40 | 41 | self.model = AutoEncoder(config=AutoEncoder.config_class(num_epochs=10)) 42 | self.dataset = MSL(rootdir=join(rootdir, "data", "smap")) 43 | df, metadata = self.dataset[0] 44 | self.train_df, self.test_df, self.test_labels = get_train_test_splits(df, metadata, 2000) 45 | 46 | logger.info("Training model...\n") 47 | train_ts = TimeSeries.from_pd(self.train_df) 48 | self.model.train(train_ts) 49 | 50 | def test_score(self): 51 | print("-" * 80) 52 | logger.info("test_score\n" + "-" * 80 + "\n") 53 | test_ts = TimeSeries.from_pd(self.test_df) 54 | score_ts = self.model.get_anomaly_score(test_ts) 55 | scores = score_ts.to_pd().values.flatten() 56 | min_score, max_score, sum_score = min(scores), max(scores), sum(scores) 57 | 58 | logger.info(f"scores look like: {scores[:10]}") 59 | logger.info(f"min score = {min_score}") 60 | logger.info(f"max score = {max_score}") 61 | logger.info(f"sum score = {sum_score}") 62 | 63 | def test_save_load(self): 64 | print("-" * 80) 65 | logger.info("test_save_load\n" + "-" * 80 + "\n") 66 | self.model.save(dirname=join(rootdir, "tmp", "ae")) 67 | loaded_model = AutoEncoder.load(dirname=join(rootdir, "tmp", "ae")) 68 | 69 | test_ts = TimeSeries.from_pd(self.test_df) 70 | scores = self.model.get_anomaly_score(test_ts) 71 | loaded_model_scores = loaded_model.get_anomaly_score(test_ts) 72 | self.assertSequenceEqual(list(scores), list(loaded_model_scores)) 73 | 74 | alarms = self.model.get_anomaly_label(test_ts) 75 | loaded_model_alarms = loaded_model.get_anomaly_label(test_ts) 76 | self.assertSequenceEqual(list(alarms), list(loaded_model_alarms)) 77 | 78 | 79 | if __name__ == "__main__": 80 | logging.basicConfig( 81 | format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", stream=sys.stdout, level=logging.DEBUG 82 | ) 83 | unittest.main() 84 | -------------------------------------------------------------------------------- /tests/anomaly/multivariate/test_dagmm.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import sys 8 | import logging 9 | import unittest 10 | import torch 11 | import random 12 | import numpy as np 13 | import pandas as pd 14 | from os.path import abspath, dirname, join 15 | from merlion.utils import TimeSeries 16 | from ts_datasets.anomaly import * 17 | from merlion.models.anomaly.dagmm import DAGMM 18 | 19 | rootdir = dirname(dirname(dirname(dirname(abspath(__file__))))) 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def set_random_seeds(): 24 | torch.manual_seed(12345) 25 | random.seed(12345) 26 | np.random.seed(12345) 27 | 28 | 29 | def get_train_test_splits(df: pd.DataFrame, metadata: pd.DataFrame, n: int) -> (pd.DataFrame, pd.DataFrame, np.ndarray): 30 | train_df = df[metadata.trainval] 31 | test_df = df[~metadata.trainval] 32 | test_labels = metadata[~metadata.trainval].anomaly.values 33 | return train_df.tail(n), test_df.head(n), test_labels[:n] 34 | 35 | 36 | class TestDAGMM(unittest.TestCase): 37 | def __init__(self, *args, **kwargs): 38 | super().__init__(*args, **kwargs) 39 | set_random_seeds() 40 | 41 | self.model = DAGMM(config=DAGMM.config_class(num_epochs=10)) 42 | self.dataset = MSL(rootdir=join(rootdir, "data", "smap")) 43 | df, metadata = self.dataset[0] 44 | self.train_df, self.test_df, self.test_labels = get_train_test_splits(df, metadata, 500) 45 | 46 | logger.info("Training model...\n") 47 | train_ts = TimeSeries.from_pd(self.train_df) 48 | self.model.train(train_ts) 49 | 50 | logger.info("Training multiple timeseries model...\n") 51 | self.model.train_multiple([train_ts] * 10) 52 | 53 | def test_score(self): 54 | print("-" * 80) 55 | logger.info("test_score\n" + "-" * 80 + "\n") 56 | test_ts = TimeSeries.from_pd(self.test_df) 57 | score_ts = self.model.get_anomaly_score(test_ts) 58 | scores = score_ts.to_pd().values.flatten() 59 | min_score, max_score, sum_score = min(scores), max(scores), sum(scores) 60 | 61 | logger.info(f"scores look like: {scores[:10]}") 62 | logger.info(f"min score = {min_score}") 63 | logger.info(f"max score = {max_score}") 64 | logger.info(f"sum score = {sum_score}") 65 | 66 | def test_save_load(self): 67 | print("-" * 80) 68 | logger.info("test_save_load\n" + "-" * 80 + "\n") 69 | self.model.save(dirname=join(rootdir, "tmp", "dagmm")) 70 | loaded_model = DAGMM.load(dirname=join(rootdir, "tmp", "dagmm")) 71 | 72 | test_ts = TimeSeries.from_pd(self.test_df) 73 | scores = self.model.get_anomaly_score(test_ts) 74 | loaded_model_scores = loaded_model.get_anomaly_score(test_ts) 75 | self.assertSequenceEqual(list(scores), list(loaded_model_scores)) 76 | 77 | alarms = self.model.get_anomaly_label(test_ts) 78 | loaded_model_alarms = loaded_model.get_anomaly_label(test_ts) 79 | self.assertSequenceEqual(list(alarms), list(loaded_model_alarms)) 80 | 81 | 82 | if __name__ == "__main__": 83 | logging.basicConfig( 84 | format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", stream=sys.stdout, level=logging.DEBUG 85 | ) 86 | unittest.main() 87 | -------------------------------------------------------------------------------- /tests/anomaly/multivariate/test_lstmed.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import sys 8 | import logging 9 | import unittest 10 | import torch 11 | import random 12 | import numpy as np 13 | import pandas as pd 14 | from os.path import abspath, dirname, join 15 | from merlion.utils import TimeSeries 16 | from ts_datasets.anomaly import * 17 | from merlion.models.anomaly.lstm_ed import LSTMED 18 | 19 | rootdir = dirname(dirname(dirname(dirname(abspath(__file__))))) 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def set_random_seeds(): 24 | torch.manual_seed(12345) 25 | random.seed(12345) 26 | np.random.seed(12345) 27 | 28 | 29 | def get_train_test_splits(df: pd.DataFrame, metadata: pd.DataFrame, n: int) -> (pd.DataFrame, pd.DataFrame, np.ndarray): 30 | train_df = df[metadata.trainval] 31 | test_df = df[~metadata.trainval] 32 | test_labels = metadata[~metadata.trainval].anomaly.values 33 | return train_df.tail(n), test_df.head(n), test_labels[:n] 34 | 35 | 36 | class TestLSTMED(unittest.TestCase): 37 | def __init__(self, *args, **kwargs): 38 | super().__init__(*args, **kwargs) 39 | set_random_seeds() 40 | 41 | self.model = LSTMED(config=LSTMED.config_class(num_epochs=5)) 42 | self.dataset = MSL(rootdir=join(rootdir, "data", "smap")) 43 | df, metadata = self.dataset[0] 44 | self.train_df, self.test_df, self.test_labels = get_train_test_splits(df, metadata, 1000) 45 | 46 | logger.info("Training model...\n") 47 | train_ts = TimeSeries.from_pd(self.train_df) 48 | self.model.train(train_ts) 49 | 50 | def test_score(self): 51 | print("-" * 80) 52 | logger.info("test_score\n" + "-" * 80 + "\n") 53 | test_ts = TimeSeries.from_pd(self.test_df) 54 | score_ts = self.model.get_anomaly_label(test_ts) 55 | scores = score_ts.to_pd().values.flatten() 56 | min_score, max_score, sum_score = min(scores), max(scores), sum(scores) 57 | 58 | logger.info(f"scores look like: {scores[:10]}") 59 | logger.info(f"min score = {min_score}") 60 | logger.info(f"max score = {max_score}") 61 | logger.info(f"sum score = {sum_score}") 62 | 63 | def test_save_load(self): 64 | print("-" * 80) 65 | logger.info("test_save_load\n" + "-" * 80 + "\n") 66 | self.model.save(dirname=join(rootdir, "tmp", "lstmed")) 67 | loaded_model = LSTMED.load(dirname=join(rootdir, "tmp", "lstmed")) 68 | 69 | test_ts = TimeSeries.from_pd(self.test_df) 70 | scores = self.model.get_anomaly_score(test_ts) 71 | loaded_model_scores = loaded_model.get_anomaly_score(test_ts) 72 | self.assertSequenceEqual(list(scores), list(loaded_model_scores)) 73 | 74 | alarms = self.model.get_anomaly_label(test_ts) 75 | loaded_model_alarms = loaded_model.get_anomaly_label(test_ts) 76 | self.assertSequenceEqual(list(alarms), list(loaded_model_alarms)) 77 | 78 | 79 | if __name__ == "__main__": 80 | logging.basicConfig( 81 | format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", stream=sys.stdout, level=logging.DEBUG 82 | ) 83 | unittest.main() 84 | -------------------------------------------------------------------------------- /tests/anomaly/multivariate/test_vae.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import sys 8 | import logging 9 | import unittest 10 | import torch 11 | import random 12 | import numpy as np 13 | import pandas as pd 14 | from os.path import abspath, dirname, join 15 | from merlion.utils import TimeSeries 16 | from ts_datasets.anomaly import * 17 | from merlion.models.anomaly.vae import VAE 18 | 19 | rootdir = dirname(dirname(dirname(dirname(abspath(__file__))))) 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def set_random_seeds(): 24 | torch.manual_seed(12345) 25 | random.seed(12345) 26 | np.random.seed(12345) 27 | 28 | 29 | def get_train_test_splits(df: pd.DataFrame, metadata: pd.DataFrame, n: int) -> (pd.DataFrame, pd.DataFrame, np.ndarray): 30 | train_df = df[metadata.trainval] 31 | test_df = df[~metadata.trainval] 32 | test_labels = metadata[~metadata.trainval].anomaly.values 33 | return train_df.tail(n), test_df.head(n), test_labels[:n] 34 | 35 | 36 | class TestVAE(unittest.TestCase): 37 | def __init__(self, *args, **kwargs): 38 | super().__init__(*args, **kwargs) 39 | set_random_seeds() 40 | 41 | self.model = VAE(config=VAE.config_class(num_epochs=5)) 42 | self.dataset = MSL(rootdir=join(rootdir, "data", "smap")) 43 | df, metadata = self.dataset[0] 44 | self.train_df, self.test_df, self.test_labels = get_train_test_splits(df, metadata, 5000) 45 | 46 | logger.info("Training model...\n") 47 | train_ts = TimeSeries.from_pd(self.train_df) 48 | self.model.train(train_ts) 49 | 50 | def test_score(self): 51 | print("-" * 80) 52 | logger.info("test_score\n" + "-" * 80 + "\n") 53 | test_ts = TimeSeries.from_pd(self.test_df) 54 | 55 | set_random_seeds() 56 | score_ts = self.model.get_anomaly_score(test_ts) 57 | scores = score_ts.to_pd().values.flatten() 58 | min_score, max_score, sum_score = min(scores), max(scores), sum(scores) 59 | 60 | logger.info(f"scores look like: {scores[:10]}") 61 | logger.info(f"min score = {min_score}") 62 | logger.info(f"max score = {max_score}") 63 | logger.info(f"sum score = {sum_score}") 64 | 65 | def test_save_load(self): 66 | print("-" * 80) 67 | logger.info("test_save_load\n" + "-" * 80 + "\n") 68 | self.model.save(dirname=join(rootdir, "tmp", "vae")) 69 | loaded_model = VAE.load(dirname=join(rootdir, "tmp", "vae")) 70 | 71 | test_ts = TimeSeries.from_pd(self.test_df) 72 | set_random_seeds() 73 | scores = self.model.get_anomaly_score(test_ts) 74 | set_random_seeds() 75 | loaded_model_scores = loaded_model.get_anomaly_score(test_ts) 76 | self.assertSequenceEqual(list(scores), list(loaded_model_scores)) 77 | 78 | set_random_seeds() 79 | alarms = self.model.get_anomaly_label(test_ts) 80 | set_random_seeds() 81 | loaded_model_alarms = loaded_model.get_anomaly_label(test_ts) 82 | self.assertSequenceEqual(list(alarms), list(loaded_model_alarms)) 83 | 84 | 85 | if __name__ == "__main__": 86 | logging.basicConfig( 87 | format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", stream=sys.stdout, level=logging.DEBUG 88 | ) 89 | unittest.main() 90 | -------------------------------------------------------------------------------- /tests/anomaly/test_spectral_residual.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import logging 8 | import sys 9 | import unittest 10 | from os.path import join, dirname, abspath 11 | 12 | import numpy as np 13 | 14 | from merlion.models.anomaly.spectral_residual import SpectralResidual, SpectralResidualConfig 15 | from merlion.post_process.threshold import AggregateAlarms 16 | from merlion.utils.data_io import csv_to_time_series 17 | 18 | rootdir = dirname(dirname(dirname(abspath(__file__)))) 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | class TestSpectralResidual(unittest.TestCase): 23 | def __init__(self, *args, **kwargs) -> None: 24 | super().__init__(*args, **kwargs) 25 | self.csv_name = join(rootdir, "data", "example.csv") 26 | self.test_len = 32768 27 | self.data = csv_to_time_series(self.csv_name, timestamp_unit="ms", data_cols="kpi") 28 | logger.info(f"Data looks like:\n{self.data[:5]}") 29 | self.vals_train = self.data[: -self.test_len] 30 | self.vals_test = self.data[-self.test_len :] 31 | self.model = SpectralResidual( 32 | SpectralResidualConfig( 33 | local_wind_sz=21, 34 | estimated_points=5, 35 | predicting_points=5, 36 | target_seq_index=0, 37 | threshold=AggregateAlarms(alm_threshold=3.5, min_alm_in_window=1), 38 | ) 39 | ) 40 | print() 41 | logger.info("Training model...\n") 42 | self.model.train(self.vals_train) 43 | 44 | def test_score(self): 45 | # score function returns the raw anomaly scores 46 | print("-" * 80) 47 | logger.info("test_score\n" + "-" * 80 + "\n") 48 | scores = self.model.get_anomaly_score(self.vals_test) 49 | logger.info(f"Scores look like:\n{scores[:5]}") 50 | scores = scores.to_pd().values.flatten() 51 | logger.info("max score = " + str(max(scores))) 52 | logger.info("min score = " + str(min(scores)) + "\n") 53 | 54 | self.assertEqual(len(scores), len(self.model.transform(self.vals_test))) 55 | 56 | def test_alarm(self): 57 | # alarm function returns the post-rule processed anomaly scores 58 | print("-" * 80) 59 | logger.info("test_alarm\n" + "-" * 80 + "\n") 60 | alarms = self.model.get_anomaly_label(self.vals_test) 61 | n_alarms = np.sum(alarms.to_pd().values != 0) 62 | logger.info(f"Alarms look like:\n{alarms[:5]}") 63 | logger.info(f"Number of alarms: {n_alarms}\n") 64 | self.assertLessEqual(n_alarms, 6) 65 | self.assertGreaterEqual(n_alarms, 1) 66 | 67 | def test_save_load(self): 68 | print("-" * 80) 69 | logger.info("test_save_load\n" + "-" * 80 + "\n") 70 | self.model.save(dirname=join(rootdir, "tmp", "spectral_residual")) 71 | loaded_model = SpectralResidual.load(dirname=join(rootdir, "tmp", "spectral_residual")) 72 | 73 | scores = self.model.get_anomaly_score(self.vals_test) 74 | loaded_model_scores = loaded_model.get_anomaly_score(self.vals_test) 75 | self.assertSequenceEqual(list(scores), list(loaded_model_scores)) 76 | 77 | alarms = self.model.get_anomaly_label(self.vals_test) 78 | loaded_model_alarms = loaded_model.get_anomaly_label(self.vals_test) 79 | self.assertSequenceEqual(list(alarms), list(loaded_model_alarms)) 80 | 81 | 82 | if __name__ == "__main__": 83 | logging.basicConfig( 84 | format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", stream=sys.stdout, level=logging.DEBUG 85 | ) 86 | unittest.main() 87 | -------------------------------------------------------------------------------- /tests/anomaly/test_stat_threshold.py: -------------------------------------------------------------------------------- 1 | # 2 | 
# Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import logging 8 | from os.path import abspath, dirname, join 9 | import sys 10 | import unittest 11 | 12 | import numpy as np 13 | 14 | from merlion.models.anomaly.stat_threshold import StatThreshold, StatThresholdConfig 15 | from merlion.post_process.threshold import AggregateAlarms 16 | from merlion.utils.data_io import csv_to_time_series 17 | 18 | rootdir = dirname(dirname(dirname(abspath(__file__)))) 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | class TestStatThreshold(unittest.TestCase): 23 | def __init__(self, *args, **kwargs): 24 | super().__init__(*args, **kwargs) 25 | self.csv_name = join(rootdir, "data", "example.csv") 26 | self.test_len = 32768 27 | self.data = csv_to_time_series(self.csv_name, timestamp_unit="ms", data_cols=["kpi"]) 28 | logger.info(f"Data looks like:\n{self.data[:5]}") 29 | self.vals_train = self.data[: -self.test_len] 30 | self.vals_test = self.data[-self.test_len :] 31 | self.model = StatThreshold( 32 | StatThresholdConfig(enable_calibrator=True, threshold=AggregateAlarms(alm_threshold=3.5)) 33 | ) 34 | print() 35 | logger.info("Training model...\n") 36 | self.model.train(self.vals_train) 37 | 38 | def test_score(self): 39 | # score function returns the raw anomaly scores 40 | print("-" * 80) 41 | logger.info("test_score\n" + "-" * 80 + "\n") 42 | scores = self.model.get_anomaly_score(self.vals_test) 43 | logger.info(f"Scores look like:\n{scores[:5]}") 44 | scores = scores.to_pd().values.flatten() 45 | logger.info("max score = " + str(max(scores))) 46 | logger.info("min score = " + str(min(scores)) + "\n") 47 | 48 | self.assertEqual(len(scores), len(self.model.transform(self.vals_test))) 49 | 50 | def test_alarm(self): 51 | # alarm function returns the post-rule processed anomaly scores 52 | print("-" * 80) 53 | logger.info("test_alarm\n" + "-" * 80 + "\n") 54 | alarms = self.model.get_anomaly_label(self.vals_test) 55 | n_alarms = np.sum(alarms.to_pd().values != 0) 56 | logger.info(f"Alarms look like:\n{alarms[:5]}") 57 | logger.info(f"Number of alarms: {n_alarms}\n") 58 | scores = alarms.to_pd().values.flatten() 59 | logger.info("max score = " + str(max(scores))) 60 | logger.info("min score = " + str(min(scores)) + "\n") 61 | self.assertLessEqual(n_alarms, 6) 62 | 63 | def test_save_load(self): 64 | print("-" * 80) 65 | logger.info("test_save_load\n" + "-" * 80 + "\n") 66 | self.model.save(dirname=join(rootdir, "tmp", "stat_threshold")) 67 | loaded_model = StatThreshold.load(dirname=join(rootdir, "tmp", "stat_threshold")) 68 | 69 | scores = self.model.get_anomaly_score(self.vals_test) 70 | loaded_model_scores = loaded_model.get_anomaly_score(self.vals_test) 71 | self.assertSequenceEqual(list(scores), list(loaded_model_scores)) 72 | 73 | alarms = self.model.get_anomaly_label(self.vals_test) 74 | loaded_model_alarms = loaded_model.get_anomaly_label(self.vals_test) 75 | self.assertSequenceEqual(list(alarms), list(loaded_model_alarms)) 76 | 77 | 78 | if __name__ == "__main__": 79 | logging.basicConfig( 80 | format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", stream=sys.stdout, level=logging.DEBUG 81 | ) 82 | unittest.main() 83 | -------------------------------------------------------------------------------- /tests/anomaly/test_windstats.py: 
-------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import logging 8 | from os.path import abspath, dirname, join 9 | import sys 10 | import unittest 11 | 12 | import numpy as np 13 | 14 | from merlion.models.anomaly.windstats import WindStatsConfig, WindStats 15 | from merlion.post_process.threshold import AggregateAlarms 16 | from merlion.utils.data_io import csv_to_time_series 17 | 18 | rootdir = dirname(dirname(dirname(abspath(__file__)))) 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | class TestWindStats(unittest.TestCase): 23 | def __init__(self, *args, **kwargs): 24 | super().__init__(*args, **kwargs) 25 | self.csv_name = join(rootdir, "data", "example.csv") 26 | self.test_len = 32768 27 | self.data = csv_to_time_series(self.csv_name, timestamp_unit="ms", data_cols=["kpi"]) 28 | logger.info(f"Data looks like:\n{self.data[:5]}") 29 | self.vals_train = self.data[: -self.test_len] 30 | self.vals_test = self.data[-self.test_len :] 31 | self.model = WindStats( 32 | WindStatsConfig( 33 | wind_sz=30, threshold=AggregateAlarms(alm_threshold=4, alm_window_minutes=30, alm_suppress_minutes=300) 34 | ) 35 | ) 36 | print() 37 | logger.info("Training model...\n") 38 | self.model.train(self.vals_train) 39 | 40 | def test_score(self): 41 | # score function returns the raw anomaly scores 42 | print("-" * 80) 43 | logger.info("test_score\n" + "-" * 80 + "\n") 44 | scores = self.model.get_anomaly_score(self.vals_test) 45 | logger.info(f"Scores look like:\n{scores[:5]}") 46 | scores = scores.to_pd().values.flatten() 47 | logger.info("max score = " + str(max(scores))) 48 | logger.info("min score = " + str(min(scores)) + "\n") 49 | 50 | self.assertEqual(len(scores), len(self.model.transform(self.vals_test))) 51 | 52 | def test_alarm(self): 53 | # alarm function returns the post-rule processed anomaly scores 54 | print("-" * 80) 55 | logger.info("test_alarm\n" + "-" * 80 + "\n") 56 | alarms = self.model.get_anomaly_label(self.vals_test) 57 | n_alarms = np.sum(alarms.to_pd().values != 0) 58 | logger.info(f"Alarms look like:\n{alarms[:5]}") 59 | logger.info(f"Number of alarms: {n_alarms}\n") 60 | self.assertLessEqual(n_alarms, 6) 61 | 62 | def test_save_load(self): 63 | print("-" * 80) 64 | logger.info("test_save_load\n" + "-" * 80 + "\n") 65 | self.model.save(dirname=join(rootdir, "tmp", "windstats")) 66 | loaded_model = WindStats.load(dirname=join(rootdir, "tmp", "windstats")) 67 | 68 | scores = self.model.get_anomaly_score(self.vals_test) 69 | loaded_model_scores = loaded_model.get_anomaly_score(self.vals_test) 70 | self.assertSequenceEqual(list(scores), list(loaded_model_scores)) 71 | 72 | alarms = self.model.get_anomaly_label(self.vals_test) 73 | loaded_model_alarms = loaded_model.get_anomaly_label(self.vals_test) 74 | self.assertSequenceEqual(list(alarms), list(loaded_model_alarms)) 75 | 76 | # serialize and deserialize 77 | obj = self.model.to_bytes() 78 | loaded_model = WindStats.from_bytes(obj) 79 | loaded_model_scores = loaded_model.get_anomaly_score(self.vals_test) 80 | self.assertSequenceEqual(list(scores), list(loaded_model_scores)) 81 | loaded_model_alarms = loaded_model.get_anomaly_label(self.vals_test) 82 | self.assertSequenceEqual(list(alarms), list(loaded_model_alarms)) 83 | 84 | 85 | if __name__ == 
"__main__": 86 | logging.basicConfig( 87 | format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", stream=sys.stdout, level=logging.DEBUG 88 | ) 89 | unittest.main() 90 | -------------------------------------------------------------------------------- /tests/change_point/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | -------------------------------------------------------------------------------- /tests/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | -------------------------------------------------------------------------------- /tests/forecast/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | -------------------------------------------------------------------------------- /tests/forecast/test_prophet.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import os 8 | import logging 9 | import sys 10 | import unittest 11 | 12 | import pandas as pd 13 | import numpy as np 14 | 15 | from merlion.evaluate.forecast import ForecastMetric 16 | from merlion.models.automl.autoprophet import AutoProphet, AutoProphetConfig 17 | from merlion.models.anomaly.forecast_based.prophet import ProphetDetector, ProphetDetectorConfig 18 | from merlion.models.forecast.prophet import Prophet, ProphetConfig 19 | from merlion.utils.resample import to_timestamp 20 | from merlion.utils.time_series import TimeSeries 21 | from ts_datasets.forecast import CustomDataset 22 | 23 | logger = logging.getLogger(__name__) 24 | rootdir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 25 | 26 | 27 | class TestProphet(unittest.TestCase): 28 | def test_resample_time_stamps(self): 29 | # arrange 30 | config = ProphetConfig() 31 | prophet = Prophet(config) 32 | prophet.last_train_time = pd.Timestamp(year=2022, month=1, day=1) 33 | prophet.timedelta = pd.Timedelta(days=1) 34 | target = np.array([to_timestamp(pd.Timestamp(year=2022, month=1, day=2))]) 35 | 36 | # act 37 | output = prophet.resample_time_stamps(time_stamps=1) 38 | 39 | # assert 40 | assert output == target 41 | 42 | def _test_exog(self, auto: bool): 43 | print("-" * 80) 44 | logger.info(f"TestProphet.test_exog{'_auto' if auto else ''}\n" + "-" * 80) 45 | # Get train, test, and exogenous data 46 | csv = os.path.join(rootdir, "data", "walmart", "walmart_mini.csv") 47 | index_cols = ["Store", "Dept"] 48 | target = ["Weekly_Sales"] 49 | ts, md = CustomDataset(rootdir=csv, test_frac=0.25, index_cols=index_cols)[0] 50 | train = TimeSeries.from_pd(ts.loc[md.trainval, target]) 51 | test = TimeSeries.from_pd(ts.loc[~md.trainval, target]) 52 | exog = TimeSeries.from_pd(ts[[c for c in ts.columns if "MarkDown" in c or "Holiday" in c]]) 53 | 54 | # Train model & get prediction 55 | model = Prophet(ProphetConfig()) 56 | exog_model = ProphetDetector(ProphetDetectorConfig()) 57 | if auto: 58 | model = AutoProphet(model=model) 59 | exog_model = AutoProphet(model=exog_model) 60 | model.train(train_data=train) 61 | exog_model.train(train_data=train, exog_data=exog) 62 | pred, _ = model.forecast(time_stamps=test.time_stamps) 63 | exog_pred, _ = exog_model.forecast(time_stamps=test.time_stamps, exog_data=exog) 64 | 65 | # Evaluate model 66 | smape = ForecastMetric.sMAPE.value(test, pred) 67 | exog_smape = ForecastMetric.sMAPE.value(test, exog_pred) 68 | logger.info(f"sMAPE = {smape:.2f} (no exog)") 69 | logger.info(f"sMAPE = {exog_smape:.2f} (with exog)") 70 | 71 | # Test that exog model can also get anomaly scores 72 | anomaly_labels = exog_model.get_anomaly_label(test, exog_data=exog).to_pd() 73 | logger.info(f"Alarms detected (anomaly detection): {anomaly_labels.sum().sum().item()}") 74 | 75 | def test_exog(self): 76 | self._test_exog(auto=False) 77 | 78 | def test_exog_auto(self): 79 | self._test_exog(auto=True) 80 | 81 | 82 | if __name__ == "__main__": 83 | logging.basicConfig( 84 | format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", stream=sys.stdout, level=logging.DEBUG 85 | ) 86 | unittest.main() 87 | -------------------------------------------------------------------------------- /tests/spark/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 
salesforce.com, inc.
3 | # All rights reserved.
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | #
7 |
-------------------------------------------------------------------------------- /tests/spark/conftest.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2023 salesforce.com, inc.
3 | # All rights reserved.
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | #
7 | import pytest
8 | from pyspark import SparkConf
9 | from pyspark.sql import SparkSession
10 |
11 |
12 | @pytest.fixture(scope="session")
13 | def spark_session():
14 |     # Creates more helpful debug messages if Spark tests fail for some Java-related reason
15 |     try:
16 |         import faulthandler
17 |
18 |         faulthandler.enable()
19 |         faulthandler.disable()
20 |     except Exception:
21 |         pass
22 |     # Set timeout & heartbeat interval to 10 minutes to ensure tests can run to completion
23 |     conf = SparkConf(False).setMaster("local[2]").setAppName("unit-tests")
24 |     conf = conf.set("spark.network.timeout", "600000").set("spark.executor.heartbeatInterval", "600000")
25 |     return SparkSession.builder.config(conf=conf).getOrCreate()
26 |
-------------------------------------------------------------------------------- /tests/spark/test_anomaly.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2023 salesforce.com, inc.
3 | # All rights reserved.
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | #
7 | from os.path import abspath, dirname, join
8 | import logging
9 |
10 | from pyspark.sql.types import DateType, FloatType, StructField, StructType
11 | from merlion.spark.dataset import read_dataset, write_dataset, TSID_COL_NAME
12 | from merlion.spark.pandas_udf import anomaly
13 |
14 | logger = logging.getLogger(__name__)
15 | rootdir = dirname(dirname(dirname(abspath(__file__))))
16 |
17 |
18 | def _run_job(spark, name: str, data_cols: list, model: dict, robust: bool = False):
19 |     logger.info(f"test_spark_anomaly_{name}\n{'-' * 80}")
20 |     index_cols = ["Store", "Dept"]
21 |     time_col = "Date"
22 |     train_test_split = "2012-09-15" if robust else "2012-06-01"
23 |
24 |     df = read_dataset(
25 |         spark=spark,
26 |         file_format="csv",
27 |         path=join(rootdir, "data", "walmart", "walmart_mini_error.csv" if robust else "walmart_mini.csv"),
28 |         index_cols=index_cols,
29 |         time_col=time_col,
30 |         data_cols=data_cols,
31 |     )
32 |     index_cols = index_cols + [TSID_COL_NAME]
33 |
34 |     index_fields = [df.schema[c] for c in index_cols]
35 |     pred_fields = [StructField(time_col, DateType()), StructField("anom_score", FloatType())]
36 |     output_schema = StructType(index_fields + pred_fields)
37 |     anomaly_df = df.groupBy(index_cols).applyInPandas(
38 |         lambda pdf: anomaly(
39 |             pdf,
40 |             index_cols=index_cols,
41 |             time_col=time_col,
42 |             train_test_split=train_test_split,
43 |             model=model,
44 |             predict_on_train=robust,
45 |         ),
46 |         schema=output_schema,
47 |     )
48 |     df.unpersist()
49 |
50 |     output_path = join(rootdir, "tmp", "spark", "anomaly", name)
51 |     write_dataset(df=anomaly_df, time_col=time_col, path=output_path, file_format="csv")
52 |     anomaly_df.unpersist()
53 |
54 |
55 | def test_univariate(spark_session):
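    # The `model` dict passed below is a JSON-style model spec: its "name" key selects the
    # Merlion model class to instantiate inside the pandas UDF (here StatThreshold). Based on
    # how `_run_job` forwards the dict, any additional keys would presumably act as model
    # config kwargs (an assumption about merlion.spark.pandas_udf, not verified here).
56 |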
_run_job(spark=spark_session, name="univariate", data_cols=["Weekly_Sales"], model={"name": "StatThreshold"}) 57 | 58 | 59 | def test_multivariate(spark_session): 60 | _run_job( 61 | spark=spark_session, 62 | name="multivariate", 63 | data_cols=["Weekly_Sales", "Temperature", "CPI"], 64 | model={"name": "IsolationForest"}, 65 | ) 66 | 67 | 68 | def test_robust(spark_session): 69 | _run_job( 70 | spark=spark_session, 71 | name="robust", 72 | data_cols=["Weekly_Sales", "Temperature", "CPI"], 73 | model={"name": "IsolationForest"}, 74 | robust=True, 75 | ) 76 | -------------------------------------------------------------------------------- /tests/test_custom_dataset.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import glob 8 | import os 9 | import pandas as pd 10 | from ts_datasets.forecast import CustomDataset 11 | from ts_datasets.anomaly import CustomAnomalyDataset 12 | 13 | rootdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 14 | 15 | 16 | def test_custom_anom_dataset(): 17 | data_dir = os.path.join(rootdir, "data", "synthetic_anomaly") 18 | dataset = CustomAnomalyDataset(rootdir=data_dir, test_frac=0.75, time_unit="s", assume_no_anomaly=True) 19 | assert len(dataset) == len(glob.glob(os.path.join(data_dir, "*.csv"))) 20 | assert all("anomaly" in md.columns and "trainval" in md.columns for ts, md in dataset) 21 | assert all(abs((~md.trainval).mean() - dataset.test_frac) < 2 / len(ts) for ts, md in dataset) 22 | 23 | 24 | def test_custom_dataset(): 25 | csv = os.path.join(rootdir, "data", "walmart", "walmart_mini.csv") 26 | index_cols = ["Store", "Dept"] 27 | data_cols = ["Weekly_Sales", "Temperature", "CPI"] 28 | df = pd.read_csv(csv, index_col=[0, 1, 2], parse_dates=True) 29 | dataset = CustomDataset(rootdir=csv, test_frac=0.25, data_cols=data_cols, index_cols=index_cols) 30 | assert len(dataset) == len(df.groupby(index_cols).groups) 31 | assert all(list(ts.columns) == data_cols for ts, md in dataset) 32 | assert all((c in md.columns for c in ["trainval"] + index_cols) for ts, md in dataset) 33 | assert all(abs((~md.trainval).mean() - dataset.test_frac) < 2 / len(ts) for ts, md in dataset) 34 | -------------------------------------------------------------------------------- /tests/test_generator.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | from os.path import abspath, dirname 8 | import sys 9 | import logging 10 | import unittest 11 | 12 | import numpy as np 13 | from operator import mul 14 | from math import exp, log, sin 15 | 16 | from merlion.utils.ts_generator import TimeSeriesGenerator, GeneratorComposer, GeneratorConcatenator 17 | 18 | logger = logging.getLogger(__name__) 19 | rootdir = dirname(dirname(abspath(__file__))) 20 | 21 | 22 | class TestTimeSeriesGenerator(unittest.TestCase): 23 | def test_generator_sequence(self): 24 | logger.info("test_generator_sequence\n" + "-" * 80 + "\n") 25 | 26 | np.random.seed(1234) 27 | y_generated = GeneratorComposer( 28 | generators=[ 29 | TimeSeriesGenerator(f=lambda x: x**1.3, n=3), 30 | TimeSeriesGenerator(f=lambda x: 4.5 / (1 + exp(-x)), scale=4.5, n=7), 31 | TimeSeriesGenerator(f=lambda x: sin(x) * sin(3 * x), n=11), 32 | ], 33 | n=20, 34 | x0=-7, 35 | step=1.5, 36 | per_generator_noise=False, 37 | ).generate(return_ts=False) 38 | 39 | np.random.seed(1234) 40 | x = np.arange(20) * 1.5 - 7 41 | y_expected = (4.5 / (1.0 + np.exp(-np.sin(x) * np.sin(3 * x)))) ** 1.3 + np.random.normal(size=20) 42 | 43 | self.assertAlmostEqual(np.max(np.abs(y_expected - y_generated)), 0, places=8) 44 | 45 | def test_generator_series(self): 46 | logger.info("test_generator_series\n" + "-" * 80 + "\n") 47 | 48 | np.random.seed(1234) 49 | y_generated = GeneratorConcatenator( 50 | generators=[ 51 | TimeSeriesGenerator(f=lambda x: x**2, n=3, x0=0), 52 | TimeSeriesGenerator(f=lambda x: exp(-(x % 5)), n=7, x0=10), 53 | TimeSeriesGenerator(f=lambda x: 4 * log(x), n=11, x0=-99), 54 | ], 55 | n=20, 56 | x0=-7, 57 | step=1.5, 58 | noise=np.random.uniform, 59 | distort=mul, 60 | string_outputs=False, 61 | per_generator_noise=False, 62 | ).generate(return_ts=False) 63 | 64 | np.random.seed(1234) 65 | x = np.arange(21) * 1.5 - 7 66 | y_expected = np.hstack((x[:3] ** 2, np.exp(-(x[3:10] % 5)), np.log(x[10:21]) * 4)) * np.random.uniform(size=21) 67 | 68 | self.assertAlmostEqual(np.max(np.abs(y_expected - y_generated)), 0, places=8) 69 | 70 | 71 | if __name__ == "__main__": 72 | logging.basicConfig( 73 | format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", stream=sys.stdout, level=logging.DEBUG 74 | ) 75 | unittest.main() 76 | -------------------------------------------------------------------------------- /tests/transform/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | -------------------------------------------------------------------------------- /tests/transform/test_anomalize.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import logging 8 | from os.path import abspath, dirname 9 | import sys 10 | import unittest 11 | 12 | import numpy as np 13 | 14 | from merlion.utils.ts_generator import TimeSeriesGenerator 15 | from merlion.transform.anomalize import Shock, TrendChange 16 | 17 | logger = logging.getLogger(__name__) 18 | rootdir = dirname(dirname(dirname(abspath(__file__)))) 19 | 20 | 21 | class TestAnomalize(unittest.TestCase): 22 | def __init__(self, *args, **kwargs): 23 | super().__init__(*args, **kwargs) 24 | logger.info("Generating Data...\n") 25 | np.random.seed(111) 26 | self.ts = TimeSeriesGenerator(f=lambda x: x**1.6, n=200, name="metric").generate(return_ts=True) 27 | 28 | def test_shock(self): 29 | print("-" * 80) 30 | logger.info("test_shock\n" + "-" * 80 + "\n") 31 | 32 | # test anomalies are statistically deviant from preceding values 33 | shock = Shock(anom_prob=0.2, pos_prob=0.5, sd_range=(5, 5), anom_width_range=(1, 3)) 34 | anom_ts = shock(self.ts) 35 | vals = anom_ts.univariates["metric"].values 36 | labs = anom_ts.univariates["anomaly"].values 37 | ems = self.ts.univariates["metric"].to_pd().ewm(alpha=shock.alpha, adjust=False).std(bias=True) 38 | 39 | for i, (x, is_anom, sd) in enumerate(zip(vals, labs, ems)): 40 | if is_anom == 1.0 and labs[i - 1] == 0.0: 41 | shift = np.abs(x - vals[i - 1]) 42 | assert shift > 3 * sd 43 | 44 | def test_trend_change(self): 45 | print("-" * 80) 46 | logger.info("test_trend_change\n" + "-" * 80 + "\n") 47 | 48 | # test strictly positive trend changes 49 | trend = TrendChange(anom_prob=0.2, pos_prob=1.0, scale_range=(2, 3)) 50 | anom_ts = trend(self.ts) 51 | self.assertTrue(all(self.ts.univariates["metric"].np_values <= anom_ts.univariates["metric"].np_values)) 52 | 53 | # test strictly negative trend changes 54 | trend = TrendChange(anom_prob=0.2, pos_prob=0.0, scale_range=(2, 3)) 55 | anom_ts = trend(self.ts) 56 | self.assertTrue(all(self.ts.univariates["metric"].np_values >= anom_ts.univariates["metric"].np_values)) 57 | 58 | def test_natural_bounds(self): 59 | print("-" * 80) 60 | logger.info("test_natural_bounds\n" + "-" * 80 + "\n") 61 | 62 | # generate data 63 | np.random.seed(111) 64 | ts = TimeSeriesGenerator(f=np.sin, n=200, name="metric").generate(return_ts=True) 65 | 66 | shock = Shock(anom_prob=0.5, sd_range=(5, 5), natural_bounds=(-1, 1)) 67 | anom_vals = shock(ts).univariates["metric"].values 68 | self.assertTrue(all(np.abs(anom_vals) <= 1)) 69 | 70 | def test_anom_prob(self): 71 | print("-" * 80) 72 | logger.info("test_anom_prob\n" + "-" * 80 + "\n") 73 | 74 | # test no anoms when anom_prob is 0 75 | for anomaly in (Shock(anom_prob=0.0), TrendChange(anom_prob=0.0)): 76 | anom_ts = anomaly(self.ts) 77 | self.assertEqual(self.ts.univariates["metric"], anom_ts.univariates["metric"]) 78 | self.assertTrue(all(0.0 == anom_ts.univariates["anomaly"].np_values)) 79 | 80 | 81 | if __name__ == "__main__": 82 | logging.basicConfig( 83 | format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", stream=sys.stdout, level=logging.DEBUG 84 | ) 85 | unittest.main() 86 | -------------------------------------------------------------------------------- /tests/transform/test_inverse.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import logging 8 | from os.path import abspath, dirname, join 9 | import pickle 10 | import sys 11 | import unittest 12 | 13 | from merlion.utils import TimeSeries 14 | from merlion.transform.bound import LowerUpperClip 15 | from merlion.transform.moving_average import DifferenceTransform, ExponentialMovingAverage, LagTransform, MovingAverage 16 | from merlion.transform.normalize import MinMaxNormalize 17 | from merlion.transform.resample import TemporalResample, Shingle 18 | from merlion.transform.sequence import TransformSequence, TransformStack 19 | 20 | 21 | logger = logging.getLogger(__name__) 22 | rootdir = dirname(dirname(dirname(abspath(__file__)))) 23 | 24 | 25 | class TestInverse(unittest.TestCase): 26 | """Tests a number of transforms & their inverses.""" 27 | 28 | def test_full(self): 29 | with open(join(rootdir, "data", "test_transform.pkl"), "rb") as f: 30 | df = pickle.load(f).drop(columns=["anomaly", "trainval"]) 31 | 32 | ts = TimeSeries.from_pd(df) 33 | transform = TransformSequence( 34 | [ 35 | MinMaxNormalize(), 36 | LowerUpperClip(0, 1), 37 | TemporalResample(), 38 | DifferenceTransform(), 39 | MovingAverage(weights=[0.1, 0.2, 0.3, 0.4]), 40 | LagTransform(k=20, pad=True), 41 | LagTransform(k=3, pad=False), 42 | TransformStack( 43 | [ExponentialMovingAverage(alpha=0.7), MovingAverage(weights=[0.1, 0.2, 0.3, 0.4])], 44 | check_aligned=False, 45 | ), 46 | Shingle(size=10, stride=7), 47 | ] 48 | ) 49 | transform.train(ts) 50 | ts1 = transform(ts) 51 | ts2 = transform.invert(ts1, retain_inversion_state=True) 52 | df, df2 = ts.to_pd(), ts2.to_pd() 53 | rae = ((df - df2).abs() / ((df - df.mean()).abs() + 1e-8)).mean().mean() 54 | self.assertLess(rae, 1e-6) 55 | 56 | df2_prime = transform.invert(ts1).to_pd() 57 | rae = ((df2_prime - df2) / ((df2 - df2.mean()).abs() + 1e-8)).mean().mean() 58 | self.assertLess(rae, 1e-6) 59 | 60 | with self.assertRaises(RuntimeError) as context: 61 | transform.invert(ts1) 62 | self.assertTrue("Inversion state not set" in str(context.exception)) 63 | 64 | 65 | if __name__ == "__main__": 66 | logging.basicConfig( 67 | format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", stream=sys.stdout, level=logging.DEBUG 68 | ) 69 | unittest.main() 70 | -------------------------------------------------------------------------------- /tests/transform/test_moving_average.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import numpy as np 8 | import unittest 9 | 10 | from merlion.utils.time_series import UnivariateTimeSeries 11 | from merlion.transform.moving_average import ( 12 | DifferenceTransform, 13 | LagTransform, 14 | MovingPercentile, 15 | ExponentialMovingAverage, 16 | ) 17 | from merlion.utils.ts_generator import TimeSeriesGenerator 18 | 19 | 20 | class TestMovingAverage(unittest.TestCase): 21 | def test_difference_transform(self): 22 | n = 8 23 | ts = UnivariateTimeSeries(range(n), range(n)).to_ts() 24 | diff = DifferenceTransform() 25 | 26 | transformed_ts = diff(ts) 27 | expected_ts = UnivariateTimeSeries(range(1, n), np.ones(n - 1)).to_ts() 28 | self.assertEqual(expected_ts, transformed_ts) 29 | 30 | def test_lag_transform(self): 31 | n = 8 32 | ts = UnivariateTimeSeries(range(n), range(n)).to_ts() 33 | 34 | for k in range(1, 9): 35 | lag = LagTransform(k) 36 | transformed_ts = lag(ts) 37 | expected_ts = UnivariateTimeSeries(range(k, n), np.repeat(k, n - k)).to_ts() 38 | self.assertEqual(expected_ts, transformed_ts) 39 | 40 | lag = LagTransform(k=3, pad=True) 41 | transformed_ts = lag(ts) 42 | expected_vals = list(range(3)) + [3] * (n - 3) 43 | expected_ts = UnivariateTimeSeries(range(n), expected_vals).to_ts() 44 | self.assertEqual(expected_ts, transformed_ts) 45 | 46 | def test_moving_percentile(self): 47 | n = 20 48 | ts = UnivariateTimeSeries(range(n), range(n)).to_ts() 49 | 50 | transformed_ts = MovingPercentile(n_steps=1, q=23)(ts) 51 | expected_ts = UnivariateTimeSeries(range(n), range(n)).to_ts() 52 | self.assertEqual(expected_ts, transformed_ts) 53 | 54 | transformed_ts = MovingPercentile(n_steps=4, q=100)(ts) 55 | expected_ts = UnivariateTimeSeries(range(n), range(n)).to_ts() 56 | self.assertEqual(expected_ts, transformed_ts) 57 | 58 | transformed_ts = MovingPercentile(n_steps=6, q=0)(ts) 59 | expected_ts = UnivariateTimeSeries(range(n), [0] * 6 + list(range(1, 14 + 1))).to_ts() 60 | self.assertEqual(expected_ts, transformed_ts) 61 | 62 | transformed_ts = MovingPercentile(n_steps=3, q=50)(ts) 63 | expected_ts = UnivariateTimeSeries(range(n), [0, 0.5] + list(range(1, 18 + 1))).to_ts() 64 | self.assertEqual(expected_ts, transformed_ts) 65 | 66 | def test_exponential_moving_average_ci(self): 67 | np.random.seed(12345) 68 | name = "metric" 69 | ts = TimeSeriesGenerator(f=lambda x: x, n=100, name=name).generate() 70 | ema = ExponentialMovingAverage(alpha=0.1, ci=True)(ts) 71 | y = ema.univariates[name] 72 | lb = ema.univariates[f"{name}_lb"] 73 | ub = ema.univariates[f"{name}_ub"] 74 | self.assertTrue(all(l <= x <= u for (l, x, u) in zip(lb.values, y.values, ub.values))) 75 | 76 | 77 | if __name__ == "__main__": 78 | unittest.main() 79 | -------------------------------------------------------------------------------- /tests/transform/test_sequence.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | #
7 | import logging
8 | from merlion.transform.base import Identity
9 | from merlion.transform.sequence import TransformSequence, TransformStack
10 | import unittest
11 |
12 | from merlion.utils import TimeSeries, UnivariateTimeSeries
13 | from merlion.transform.moving_average import LagTransform, MovingAverage
14 |
15 |
16 | class TestSequence(unittest.TestCase):
17 |     def test_transform_sequence(self):
18 |         n = 25
19 |         ts = TimeSeries([UnivariateTimeSeries(range(n), range(n))])
20 |
21 |         f, g, h = Identity(), MovingAverage(n_steps=3), LagTransform(k=2)
22 |         seq = TransformSequence([f, g, h])
23 |         seq.train(ts)
24 |
25 |         transformed_ts = seq(ts)
26 |         expected_ts = h(g(f(ts)))
27 |         self.assertEqual(expected_ts, transformed_ts)
28 |
29 |     def test_transform_stack(self):
30 |         n = 25
31 |         ts = TimeSeries([UnivariateTimeSeries(range(n), range(n))])
32 |
33 |         f, g, h = Identity(), MovingAverage(n_steps=3), LagTransform(k=2)
34 |         stack = TransformStack([f, g, h])
35 |         stack.train(ts)
36 |
37 |         transformed_ts = stack(ts)
38 |         expected_ts = TimeSeries.from_ts_list([f(ts), g(ts), h(ts)])
39 |         self.assertEqual(expected_ts, transformed_ts)
40 |
41 |
42 | if __name__ == "__main__":
43 |     unittest.main()
44 |
-------------------------------------------------------------------------------- /ts_datasets/README.md: --------------------------------------------------------------------------------
1 | # ts_datasets
2 | This library implements Python classes that load numerous time series datasets
3 | into standardized `pandas` DataFrames. The sub-modules are `ts_datasets.anomaly` for time series anomaly detection, and
4 | `ts_datasets.forecast` for time series forecasting. Simply install the package by calling `pip install -e .` from the
5 | command line. Then, you can load a dataset (e.g. the "realAWSCloudwatch" split of the Numenta Anomaly Benchmark) by
6 | calling
7 | ```python
8 | from ts_datasets.anomaly import NAB
9 | dataset = NAB(subset="realAWSCloudwatch", rootdir=path_to_NAB)
10 | ```
11 | Note that if you have installed this package in editable mode (i.e. by specifying `-e`), the root directory
12 | need not be specified.
13 |
14 | Each dataset supports the following features:
15 | 1. ``__getitem__``: you may call ``ts, metadata = dataset[i]``. ``ts`` is a time-indexed ``pandas`` DataFrame, with
16 |    each column representing a different variable (in the case of multivariate time series). ``metadata`` is a dict or
17 |    ``pd.DataFrame`` with the same index as ``ts``, with different keys indicating different dataset-specific
18 |    metadata (train/test split, anomaly labels, etc.) for each timestamp.
19 | 2. ``__len__``: Calling ``len(dataset)`` will return the number of time series in the dataset.
20 | 3. ``__iter__``: You may iterate over the `pandas` representations of the time series in the dataset with
21 |    ``for ts, metadata in dataset: ...``
22 |
23 | For each time series in the dataset, `metadata` is a dict or `pd.DataFrame` that will always have the following keys:
24 | - ``trainval``: (``bool``) a `pd.Series` indicating whether each timestamp of the time series should be used for
25 |   training/validation (if `True`) or testing (if `False`)
26 |
27 | For anomaly detection datasets, ``metadata`` will also have the key:
28 | - ``anomaly``: (``bool``) a `pd.Series` indicating whether each timestamp is anomalous
29 |
30 | We currently support the following datasets for time series anomaly detection (`ts_datasets.anomaly`):
31 | - [IOps Competition](http://iops.ai/competition_detail/?competition_id=5)
32 | - [Numenta Anomaly Benchmark](https://github.com/numenta/NAB)
33 | - Synthetic (synthetic data generated using [this script](../examples/misc/generate_synthetic_tsad_dataset.py))
34 | - [SMAP & MSL](https://github.com/khundman/telemanom/) (multivariate time series anomaly detection datasets from NASA)
35 | - [SMD](https://github.com/NetManAIOps/OmniAnomaly) (server machine dataset)
36 |
37 | We currently support the following datasets for time series forecasting (`ts_datasets.forecast`):
38 | - [M4 Competition](https://github.com/Mcompetitions/M4-methods/tree/master/Dataset)
39 |   - There are 100,000 univariate time series of different granularities, including Yearly (23,000 sequences),
40 |     Quarterly (24,000 sequences), Monthly (48,000 sequences), Weekly (359 sequences), Daily (4,227 sequences) and
41 |     Hourly (414 sequences) data.
42 | - [Energy Power Grid](https://www.kaggle.com/robikscube/hourly-energy-consumption)
43 |   - There is one 10-variable time series.
44 |   - Each univariate records the energy power usage in a particular region.
45 | - [Seattle Trail for Bike and Pedestrian](https://www.kaggle.com/city-of-seattle/seattle-burke-gilman-trail)
46 |   - There is one 5-variable time series.
47 |   - Each univariate records the bicycle/pedestrian flow along a different
48 |     direction on the trail.
49 | - [Solar Energy Plant](https://www.nrel.gov/grid/solar-power-data.html)
50 |   - There is one 405-variable time series.
51 |   - Each univariate records the solar energy power at each detector in the plant.
52 |   - By default, the data loader returns only the first 100 of the 405 univariates.
53 |
54 | More details on each dataset can be found in their class-level docstrings, or in the API doc.
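
As a quick sketch of the features above (assuming the package is installed in editable mode, so `rootdir` may be omitted), each time series can be split into its train and test sections via the `trainval` metadata:
```python
from ts_datasets.anomaly import NAB

dataset = NAB(subset="realAWSCloudwatch")
for ts, metadata in dataset:
    train = ts[metadata["trainval"]]   # timestamps marked for training/validation
    test = ts[~metadata["trainval"]]   # timestamps held out for testing
    labels = metadata["anomaly"]       # anomaly labels (anomaly detection datasets only)
```
55 |
-------------------------------------------------------------------------------- /ts_datasets/setup.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2023 salesforce.com, inc.
3 | # All rights reserved.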
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | #
7 | from setuptools import find_packages, setup
8 |
9 | setup(
10 |     name="ts_datasets",
11 |     version="0.1.0",
12 |     author="Aadyot Bhatnagar, Tian Lan, Chenghao Liu, Wenzhuo Yang",
13 |     author_email="abhatnagar@salesforce.com",
14 |     description="A library for easily loading time series anomaly detection & forecasting datasets",
15 |     long_description=open("README.md", "r", encoding="utf-8").read(),
16 |     long_description_content_type="text/markdown",
17 |     license="BSD-3-Clause",
18 |     packages=find_packages(include=["ts_datasets*"]),
19 |     install_requires=["cython", "numpy", "pandas", "requests", "tqdm", "wheel", "gdown"],
20 | )
21 |
-------------------------------------------------------------------------------- /ts_datasets/ts_datasets/__init__.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2023 salesforce.com, inc.
3 | # All rights reserved.
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | #
7 | """
8 | .. autosummary::
9 |     anomaly
10 |     forecast
11 | """
12 |
-------------------------------------------------------------------------------- /ts_datasets/ts_datasets/anomaly/__init__.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2023 salesforce.com, inc.
3 | # All rights reserved.
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | #
7 | """
8 | Datasets for time series anomaly detection (TSAD). All the time series in these
9 | datasets have anomaly labels.
10 | """
11 | from ts_datasets.anomaly.base import TSADBaseDataset
12 | from ts_datasets.anomaly.custom import CustomAnomalyDataset
13 | from ts_datasets.anomaly.iops_competition import IOpsCompetition
14 | from ts_datasets.anomaly.nab import NAB
15 | from ts_datasets.anomaly.synthetic import Synthetic
16 | from ts_datasets.anomaly.ucr import UCR
17 |
18 | from ts_datasets.anomaly.smd import SMD
19 | from ts_datasets.anomaly.smap import SMAP
20 | from ts_datasets.anomaly.msl import MSL
21 |
22 | __all__ = [
23 |     "get_dataset",
24 |     "TSADBaseDataset",
25 |     "CustomAnomalyDataset",
26 |     "IOpsCompetition",
27 |     "NAB",
28 |     "Synthetic",
29 |     "UCR",
30 |     "SMD",
31 |     "SMAP",
32 |     "MSL",
33 | ]
34 |
35 |
36 | def get_dataset(dataset_name: str, rootdir: str = None, **kwargs) -> TSADBaseDataset:
37 |     """
38 |     :param dataset_name: the name of the dataset to load, formatted as
39 |         ``<name>`` or ``<name>_<subset>``, e.g. ``IOpsCompetition``
40 |         or ``NAB_realAWSCloudwatch``
41 |     :param rootdir: the directory where the desired dataset is stored. Not
42 |         required if the package :py:mod:`ts_datasets` is installed in editable
43 |         mode, i.e. with flag ``-e``.
44 |     :param kwargs: keyword arguments for the data loader you are trying to load.
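        Example (illustrative): ``get_dataset("NAB_realAWSCloudwatch")`` is equivalent to
        constructing ``NAB(subset="realAWSCloudwatch")``.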
45 |     :return: the data loader for the desired dataset (and subset)
46 |     """
47 |     name_subset = dataset_name.split("_", maxsplit=1)
48 |     valid_datasets = set(__all__).difference({"TSADBaseDataset", "get_dataset"})
49 |     if name_subset[0] in valid_datasets:
50 |         cls = globals()[name_subset[0]]
51 |     else:
52 |         raise KeyError(
53 |             "Dataset should be formatted as <name> or "
54 |             "<name>_<subset>, where <name> is one of "
55 |             f"{valid_datasets}. Got {dataset_name} instead."
56 |         )
57 |     if not hasattr(cls, "valid_subsets") and len(name_subset) == 2:
58 |         raise ValueError(
59 |             f"Dataset {name_subset[0]} does not have any subsets, "
60 |             f"but attempted to load subset {name_subset[1]} by "
61 |             f"specifying dataset name {dataset_name}."
62 |         )
63 |
64 |     if len(name_subset) > 1:
65 |         kwargs.update(subset=name_subset[1])
66 |     return cls(rootdir=rootdir, **kwargs)
67 |
-------------------------------------------------------------------------------- /ts_datasets/ts_datasets/anomaly/base.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2023 salesforce.com, inc.
3 | # All rights reserved.
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | #
7 | import numpy as np
8 | import pandas as pd
9 |
10 | from ts_datasets.base import BaseDataset, _main_fns_docstr
11 |
12 | _intro_docstr = """
13 | Base dataset class for storing time series intended for anomaly detection.
14 | """
15 |
16 | _extra_note = """
17 |
18 | .. note::
19 |
20 |     For each time series, the ``metadata`` will always have the key ``anomaly``, which is a
21 |     ``pd.Series`` of ``bool`` indicating whether each timestamp is anomalous.
22 | """
23 |
24 |
25 | class TSADBaseDataset(BaseDataset):
26 |     __doc__ = _intro_docstr + _main_fns_docstr + _extra_note
27 |
28 |     @property
29 |     def max_lead_sec(self):
30 |         """
31 |         The maximum number of seconds an anomaly may be detected early, for
32 |         this dataset. ``None`` signifies no early detections allowed, or that
33 |         the user may override this value with something better suited for their
34 |         purposes.
35 |         """
36 |         return None
37 |
38 |     @property
39 |     def max_lag_sec(self):
40 |         """
41 |         The maximum number of seconds after the start of an anomaly, that we
42 |         consider detections to be accurate (and not ignored for being too late).
43 |         ``None`` signifies that any detection in the window is acceptable, or
44 |         that the user may override this value with something better suited for
45 |         their purposes.
46 | """ 47 | return None 48 | 49 | def describe(self): 50 | anom_bds = [] 51 | anom_locs = [] 52 | anom_in_trainval = [] 53 | for ts, md in self: 54 | boundaries = md.anomaly.iloc[1:] != md.anomaly.values[:-1] 55 | boundaries = boundaries[boundaries].index 56 | if len(boundaries) == 0: 57 | continue 58 | 59 | ts_len = ts.index[-1] - ts.index[0] 60 | if md.anomaly.iloc[0]: 61 | anom_bds.append((ts.index[0], boundaries[0])) 62 | anom_locs.append((boundaries[0] - ts.index[0]) / ts_len) 63 | anom_in_trainval.append(True) 64 | 65 | for t0, tf in zip(boundaries[:-1], boundaries[1:]): 66 | if md.anomaly[t0]: 67 | anom_bds.append((t0, tf)) 68 | anom_locs.append((tf - ts.index[0]) / ts_len) 69 | anom_in_trainval.append(bool(md.trainval[t0])) 70 | 71 | if md.anomaly[boundaries[-1]]: 72 | anom_bds.append((boundaries[-1], ts.index[-1])) 73 | anom_locs.append(1.0) 74 | anom_in_trainval.append(False) 75 | 76 | print("=" * 80) 77 | print(f"Time series in dataset have average length {int(np.mean([len(ts) for ts, md in self]))}.") 78 | print(f"Time series in dataset have {len(anom_bds) / len(self):.1f} anomalies on average.") 79 | print( 80 | f"{sum(anom_in_trainval) / len(anom_in_trainval) * 100:.1f}% of " 81 | f"anomalies are in the train/val split of their respective time " 82 | f"series." 83 | ) 84 | print(f"Anomalies in dataset have average length {pd.Timedelta(np.mean([(tf - t0) for t0, tf in anom_bds]))}.") 85 | print( 86 | f"Average anomaly occurs {np.mean(anom_locs) * 100:.1f}% " 87 | f"(+/- {np.std(anom_locs) * 100:.1f}%) of the way through " 88 | f"its respective time series." 89 | ) 90 | print("=" * 80) 91 | -------------------------------------------------------------------------------- /ts_datasets/ts_datasets/anomaly/custom.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import glob 8 | import logging 9 | import os 10 | 11 | import pandas as pd 12 | 13 | from ts_datasets.forecast.custom import CustomDataset 14 | from ts_datasets.anomaly.base import TSADBaseDataset 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class CustomAnomalyDataset(CustomDataset, TSADBaseDataset): 20 | """ 21 | Wrapper to load a custom dataset for anomaly detection. Please review the `tutorial <tutorials/CustomDataset>` 22 | to get started. 23 | """ 24 | 25 | def __init__( 26 | self, 27 | rootdir, 28 | test_frac=0.5, 29 | assume_no_anomaly=False, 30 | time_col=None, 31 | time_unit="s", 32 | data_cols=None, 33 | index_cols=None, 34 | ): 35 | """ 36 | :param rootdir: Filename of a single CSV, or a directory containing many CSVs. Each CSV must contain 1 37 | or more time series. 38 | :param test_frac: If we don't find a column "trainval" in the time series, this is the fraction of each 39 | time series which we use for testing. 40 | :param assume_no_anomaly: If we don't find a column "anomaly" in the time series, we assume there are no 41 | anomalies in the data if this value is ``True``, and we throw an exception if this value is ``False``. 42 | :param time_col: Name of the column used to index time. We use the first non-index, non-metadata column 43 | if none is given. 44 | :param data_cols: Name of the columns to fetch from the dataset. If ``None``, use all non-time, non-index columns. 
45 |         :param time_unit: If the time column is numerical, we assume it is a timestamp expressed in this unit.
46 |         :param index_cols: If a CSV file contains multiple time series, these are the columns used to index those
47 |             time series. For example, a CSV file may contain time series of sales for many (store, department) pairs.
48 |             In this case, ``index_cols`` may be ``["Store", "Dept"]``. The values of the index columns will be added
49 |             to the metadata of the data loader.
50 |         """
51 |         self.assume_no_anomaly = assume_no_anomaly
52 |         super().__init__(
53 |             rootdir=rootdir,
54 |             test_frac=test_frac,
55 |             time_col=time_col,
56 |             time_unit=time_unit,
57 |             data_cols=data_cols,
58 |             index_cols=index_cols,
59 |         )
60 |
61 |     @property
62 |     def metadata_cols(self):
63 |         return ["anomaly", "trainval"]
64 |
65 |     def check_ts_for_metadata(self, ts, col):
66 |         if col == "anomaly":
67 |             if col not in ts:
68 |                 if self.assume_no_anomaly:
69 |                     ts[col] = False
70 |                 else:
71 |                     raise ValueError(f"Time series {ts} does not have metadata column {col}.")
72 |             ts[col] = ts[col].astype(bool)
73 |         else:
74 |             ts = super().check_ts_for_metadata(ts, col)
75 |         return ts
76 |
-------------------------------------------------------------------------------- /ts_datasets/ts_datasets/anomaly/msl.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2023 salesforce.com, inc.
3 | # All rights reserved.
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | #
7 | import os
8 | import sys
9 | import logging
10 | from ts_datasets.anomaly.base import TSADBaseDataset
11 | from ts_datasets.anomaly.smd import download, combine_train_test_datasets
12 | from ts_datasets.anomaly.smap import preprocess, load_data
13 |
14 | _logger = logging.getLogger(__name__)
15 | _logger.setLevel(logging.DEBUG)
16 | _handler = logging.StreamHandler(sys.stdout)
17 | _handler.setLevel(logging.DEBUG)
18 | _logger.addHandler(_handler)
19 |
20 |
21 | class MSL(TSADBaseDataset):
22 |     """
23 |     Soil Moisture Active Passive (SMAP) satellite and Mars Science Laboratory (MSL) rover Datasets.
24 |     SMAP and MSL are two real-world public datasets expert-labeled by NASA.
25 |
26 |     - source: https://github.com/khundman/telemanom
27 |     """
28 |
29 |     url = "https://www.dropbox.com/s/uv9ojw353qwzqht/SMAP.tar.gz?dl=1"
30 |
31 |     def __init__(self, subset=None, rootdir=None):
32 |         super().__init__()
33 |
34 |         if rootdir is None:
35 |             fdir = os.path.dirname(os.path.abspath(__file__))
36 |             merlion_root = os.path.abspath(os.path.join(fdir, "..", "..", ".."))
37 |             rootdir = os.path.join(merlion_root, "data", "smap")
38 |
39 |         # Download the SMAP dataset (which also contains the MSL data) if it doesn't exist
40 |         download(_logger, rootdir, MSL.url, "SMAP")
41 |         preprocess(_logger, os.path.join(rootdir, "SMAP"), dataset="MSL")
42 |         # Load training/test datasets
43 |         df, metadata = combine_train_test_datasets(*load_data(os.path.join(rootdir, "SMAP"), "MSL"))
44 |         self.time_series.append(df)
45 |         self.metadata.append(metadata)
46 |
-------------------------------------------------------------------------------- /ts_datasets/ts_datasets/anomaly/smap.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2023 salesforce.com, inc.
3 | # All rights reserved.
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | #
7 | import os
8 | import sys
9 | import csv
10 | import ast
11 | import logging
12 | import pickle
13 | import numpy as np
14 | import pandas as pd
15 | from ts_datasets.anomaly.base import TSADBaseDataset
16 | from ts_datasets.anomaly.smd import download, combine_train_test_datasets
17 |
18 | _logger = logging.getLogger(__name__)
19 | _logger.setLevel(logging.DEBUG)
20 | _handler = logging.StreamHandler(sys.stdout)
21 | _handler.setLevel(logging.DEBUG)
22 | _logger.addHandler(_handler)
23 |
24 |
25 | class SMAP(TSADBaseDataset):
26 |     """
27 |     Soil Moisture Active Passive (SMAP) satellite and Mars Science Laboratory (MSL) rover Datasets.
28 |     SMAP and MSL are two real-world public datasets expert-labeled by NASA.
29 |
30 |     - source: https://github.com/khundman/telemanom
31 |     """
32 |
33 |     url = "https://www.dropbox.com/s/uv9ojw353qwzqht/SMAP.tar.gz?dl=1"
34 |
35 |     def __init__(self, subset=None, rootdir=None):
36 |         super().__init__()
37 |
38 |         if rootdir is None:
39 |             fdir = os.path.dirname(os.path.abspath(__file__))
40 |             merlion_root = os.path.abspath(os.path.join(fdir, "..", "..", ".."))
41 |             rootdir = os.path.join(merlion_root, "data", "smap")
42 |
43 |         # Download the SMAP dataset if it doesn't exist
44 |         download(_logger, rootdir, SMAP.url, "SMAP")
45 |         preprocess(_logger, os.path.join(rootdir, "SMAP"), dataset="SMAP")
46 |         # Load training/test datasets
47 |         df, metadata = combine_train_test_datasets(*load_data(os.path.join(rootdir, "SMAP"), "SMAP"))
48 |         self.time_series.append(df)
49 |         self.metadata.append(metadata)
50 |
51 |
52 | def preprocess(logger, data_folder, dataset):
53 |     if (
54 |         os.path.exists(os.path.join(data_folder, f"{dataset}_test_label.pkl"))
55 |         and os.path.exists(os.path.join(data_folder, f"{dataset}_train.pkl"))
56 |         and os.path.exists(os.path.join(data_folder, f"{dataset}_test.pkl"))
57 |     ):
58 |         return
59 |
60 |     logger.info(f"Preprocessing {dataset}")
61 |     with open(os.path.join(data_folder, "labeled_anomalies.csv"), "r") as f:
62 |         csv_reader = csv.reader(f, delimiter=",")
63 |         res = [row for row in csv_reader][1:]
64 |     res = sorted(res, key=lambda k: k[0])
65 |
66 |     labels = []
67 |     data_info = [row for row in res if row[1] == dataset and row[0] != "P-2"]
68 |     for row in data_info:
69 |         anomalies = ast.literal_eval(row[2])
70 |         length = int(row[-1])
71 |         label = np.zeros([length], dtype=bool)
72 |         for anomaly in anomalies:
73 |             label[anomaly[0] : anomaly[1] + 1] = True
74 |         labels.extend(label)
75 |     labels = np.asarray(labels)
76 |     with open(os.path.join(data_folder, f"{dataset}_test_label.pkl"), "wb") as f:
77 |         pickle.dump(labels, f)
78 |
79 |     for category in ["train", "test"]:
80 |         data = []
81 |         for row in data_info:
82 |             data.extend(np.load(os.path.join(data_folder, category, row[0] + ".npy")))
83 |         data = np.asarray(data)
84 |         with open(os.path.join(data_folder, f"{dataset}_{category}.pkl"), "wb") as f:
85 |             pickle.dump(data, f)
86 |
87 |
88 | def load_data(directory, dataset):
89 |     with open(os.path.join(directory, f"{dataset}_test.pkl"), "rb") as f:
90 |         test_data = pickle.load(f)
91 |     with open(os.path.join(directory, f"{dataset}_test_label.pkl"), "rb") as f:
92 |         test_labels = pickle.load(f)
93 |     with open(os.path.join(directory, f"{dataset}_train.pkl"), "rb") as f:
94 |         train_data = pickle.load(f)
95 |     train_df, test_df = pd.DataFrame(train_data), pd.DataFrame(test_data)
96 |     return train_df, test_df, test_labels.astype(int)
97 |
-------------------------------------------------------------------------------- /ts_datasets/ts_datasets/anomaly/synthetic.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2023 salesforce.com, inc.
3 | # All rights reserved.
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | #
7 | import glob
8 | import os
9 |
10 | import pandas as pd
11 |
12 | from ts_datasets.anomaly.base import TSADBaseDataset
13 |
14 |
15 | class Synthetic(TSADBaseDataset):
16 |     """
17 |     Wrapper to load a synthetically generated dataset.
18 |     The dataset was generated using three base time series, each of which
19 |     was separately injected with shocks, spikes, dips, level shifts and trend changes, making
20 |     a total of 18 time series (including the 3 base time series without anomalies).
21 |     Subsets are defined by the base time series used ("horizontal",
22 |     "seasonal", "upward_downward"), or the type of injected anomaly ("shock",
23 |     "spike", "dip", "level", "trend"). The "anomaly" subset refers to all time series with
24 |     injected anomalies (15) while "base" refers to all time series without them (3).
25 |     """
26 |
27 |     base_ts_subsets = ["horizontal", "seasonal", "upward_downward"]
28 |     anomaly_subsets = ["shock", "spike", "dip", "level", "trend"]
29 |     valid_subsets = ["anomaly", "all", "base"] + base_ts_subsets + anomaly_subsets
30 |
31 |     def __init__(self, subset="anomaly", rootdir=None):
32 |         super().__init__()
33 |
34 |         assert subset in self.valid_subsets, f"subset should be in {self.valid_subsets}, but got {subset}"
35 |         self.subset = subset
36 |
37 |         if rootdir is None:
38 |             fdir = os.path.dirname(os.path.abspath(__file__))
39 |             merlion_root = os.path.abspath(os.path.join(fdir, "..", "..", ".."))
40 |             rootdir = os.path.join(merlion_root, "data", "synthetic_anomaly")
41 |
42 |         csvs = sorted(glob.glob(f"{rootdir}/*.csv"))
43 |         if subset == "base":
44 |             csvs = [csv for csv in csvs if "anom" not in os.path.basename(csv)]
45 |         elif subset != "all":
46 |             csvs = [csv for csv in csvs if "anom" in os.path.basename(csv)]
47 |         if subset in self.base_ts_subsets + self.anomaly_subsets:
48 |             csvs = [csv for csv in csvs if subset in os.path.basename(csv)]
49 |
50 |         for csv in csvs:
51 |             df = pd.read_csv(csv)
52 |             df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s")
53 |             df = df.set_index("timestamp")
54 |
55 |             ts = df[df.columns[0:1]]
56 |             metadata = pd.DataFrame(
57 |                 {
58 |                     "anomaly": df["anomaly"].astype(bool) if df.shape[1] > 1 else [False] * len(df),
59 |                     "trainval": [j < len(df) * 0.5 for j in range(len(df))],
60 |                 },
61 |                 index=df.index,
62 |             )
63 |
64 |             self.time_series.append(ts)
65 |             self.metadata.append(metadata)
66 |
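# Illustrative usage (a sketch, not part of the original module): load only the series with
# injected dips and count the labeled anomalous timestamps in each one.
#
#     from ts_datasets.anomaly import Synthetic
#     for ts, metadata in Synthetic(subset="dip"):
#         print(ts.columns[0], int(metadata["anomaly"].sum()))
-------------------------------------------------------------------------------- /ts_datasets/ts_datasets/anomaly/ucr.py: --------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2023 salesforce.com, inc.
3 | # All rights reserved.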
4 | # SPDX-License-Identifier: BSD-3-Clause
5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | #
7 | import glob
8 | import os
9 | import logging
10 | import requests
11 | from pathlib import Path
12 | import sys
13 | import zipfile
14 |
15 | import numpy as np
16 | import pandas as pd
17 |
18 | from ts_datasets.anomaly.base import TSADBaseDataset
19 |
20 | logger = logging.getLogger(__name__)
21 | logger.setLevel(logging.DEBUG)
22 | handler = logging.StreamHandler(sys.stdout)
23 | handler.setLevel(logging.DEBUG)
24 | logger.addHandler(handler)
25 |
26 |
27 | class UCR(TSADBaseDataset):
28 |     """
29 |     Data loader for the Hexagon ML/UC Riverside Time Series Anomaly Archive.
30 |
31 |     See `here <https://compete.hexagon-ml.com/practice/competition/39/>`_ for details.
32 |
33 |     Hoang Anh Dau, Eamonn Keogh, Kaveh Kamgar, Chin-Chia Michael Yeh, Yan Zhu,
34 |     Shaghayegh Gharghabi, Chotirat Ann Ratanamahatana, Yanping Chen, Bing Hu,
35 |     Nurjahan Begum, Anthony Bagnall, Abdullah Mueen, Gustavo Batista, & Hexagon-ML (2019).
36 |     The UCR Time Series Classification Archive. URL https://www.cs.ucr.edu/~eamonn/time_series_data_2018/
37 |     """
38 |
39 |     def __init__(self, rootdir=None):
40 |         super().__init__()
41 |         if rootdir is None:
42 |             fdir = os.path.dirname(os.path.abspath(__file__))
43 |             merlion_root = os.path.abspath(os.path.join(fdir, "..", "..", ".."))
44 |             rootdir = os.path.join(merlion_root, "data", "ucr")
45 |
46 |         self.download(rootdir)
47 |         self.time_series = sorted(
48 |             glob.glob(
49 |                 os.path.join(
50 |                     rootdir, "UCR_TimeSeriesAnomalyDatasets2021", "FilesAreInHere", "UCR_Anomaly_FullData", "*.txt"
51 |                 )
52 |             )
53 |         )
54 |
55 |     def __getitem__(self, i):
56 |         fname = self.time_series[i]
57 |         split, anom_start, anom_end = [int(x) for x in fname[: -len(".txt")].split("_")[-3:]]
58 |         name = fname.split("_")[-4]
59 |         arr = np.loadtxt(fname)
60 |         # NB: the anomaly labels built below are padded by 100 steps on each side of the
61 |         # labeled window, so slightly early/late detections are still counted as anomalous.
62 |         index = pd.date_range(start=0, periods=len(arr), freq="1min")
63 |         df = pd.DataFrame({name: arr}, index=index)
64 |         return (
65 |             df,
66 |             pd.DataFrame(
67 |                 {
68 |                     "anomaly": [anom_start - 100 <= i <= anom_end + 100 for i in range(len(arr))],
69 |                     "trainval": [i < split for i in range(len(arr))],
70 |                 },
71 |                 index=index,
72 |             ),
73 |         )
74 |
75 |     def download(self, rootdir):
76 |         filename = "UCR_TimeSeriesAnomalyDatasets2021.zip"
77 |         url = f"https://www.cs.ucr.edu/~eamonn/time_series_data_2018/{filename}"
78 |
79 |         os.makedirs(rootdir, exist_ok=True)
80 |         compressed_file = os.path.join(rootdir, filename)
81 |
82 |         # Download the compressed dataset
83 |         if not os.path.exists(compressed_file):
84 |             logger.info("Downloading " + url)
85 |             with requests.get(url, stream=True) as r:
86 |                 with open(compressed_file, "wb") as f:
87 |                     for chunk in r.iter_content(chunk_size=16 * 1024**2):
88 |                         if chunk:  # filter out keep-alive new chunks
89 |                             f.write(chunk)
90 |                             f.flush()
91 |
92 |         # Uncompress the downloaded zip file
93 |         if not os.path.isfile(os.path.join(rootdir, "_SUCCESS")):
94 |             logger.info(f"Uncompressing {compressed_file}")
95 |             with zipfile.ZipFile(compressed_file, "r") as zip_ref:
96 |                 zip_ref.extractall(rootdir)
97 |             Path(os.path.join(rootdir, "_SUCCESS")).touch()
98 |
-------------------------------------------------------------------------------- /ts_datasets/ts_datasets/base.py: -------------------------------------------------------------------------------- 1
| # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import pandas as pd 8 | from typing import Tuple 9 | 10 | _intro_docstr = "Base dataset class for storing time series as ``pd.DataFrame`` s." 11 | 12 | _main_fns_docstr = """ 13 | Each dataset supports the following features: 14 | 15 | 1. ``__getitem__``: you may call ``ts, metadata = dataset[i]``. ``ts`` is a time-indexed ``pandas`` DataFrame, with 16 | each column representing a different variable (in the case of multivariate time series). ``metadata`` is a dict or 17 | ``pd.DataFrame`` with the same index as ``ts``, with different keys indicating different dataset-specific 18 | metadata (train/test split, anomaly labels, etc.) for each timestamp. 19 | 2. ``__len__``: Calling ``len(dataset)`` will return the number of time series in the dataset. 20 | 3. ``__iter__``: You may iterate over the ``pandas`` representations of the time series in the dataset with 21 | ``for ts, metadata in dataset: ...`` 22 | 23 | .. note:: 24 | 25 | For each time series, the ``metadata`` will always have the key ``trainval``, which is a 26 | ``pd.Series`` of ``bool`` indicating whether each timestamp of the time series should be 27 | training/validation (if ``True``) or testing (if ``False``). 28 | """ 29 | 30 | 31 | class BaseDataset: 32 | __doc__ = _intro_docstr + _main_fns_docstr 33 | 34 | time_series: list 35 | """ 36 | A list of all individual time series contained in the dataset. Iterating over 37 | the dataset will iterate over this list. Note that for some large datasets, 38 | ``time_series`` may be a list of filenames, which are read lazily either during 39 | iteration, or whenever ``__getitem__`` is invoked. 40 | """ 41 | 42 | metadata: list 43 | """ 44 | A list containing the metadata for all individual time series in the dataset. 45 | """ 46 | 47 | def __init__(self): 48 | self.subset = None 49 | self.time_series = [] 50 | self.metadata = [] 51 | 52 | def __getitem__(self, i) -> Tuple[pd.DataFrame, pd.DataFrame]: 53 | return self.time_series[i], self.metadata[i] 54 | 55 | def __len__(self): 56 | return len(self.time_series) 57 | 58 | def __iter__(self): 59 | return (self[i] for i in range(len(self))) 60 | 61 | def describe(self): 62 | for ts_df in self.time_series: 63 | print(f"length of the data: {len(ts_df)}") 64 | print(f"timestamp index name: {ts_df.index.name}") 65 | print(f"number of data columns: {len(ts_df.columns)}") 66 | print("data columns names (the first 20): ") 67 | print(ts_df.columns[:20]) 68 | print(f"number of null entries: {ts_df.isnull().sum()}") 69 | -------------------------------------------------------------------------------- /ts_datasets/ts_datasets/forecast/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | """ 8 | Datasets for time series forecasting. Really, these are just time series with 9 | no labels of any sort. 
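Use :func:`get_dataset` to obtain any of these data loaders by name, e.g. ``get_dataset("M4_Hourly")``.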
10 | """ 11 | from ts_datasets.base import BaseDataset 12 | from ts_datasets.forecast.custom import CustomDataset 13 | from ts_datasets.forecast.m4 import M4 14 | from ts_datasets.forecast.energy_power import EnergyPower 15 | from ts_datasets.forecast.seattle_trail import SeattleTrail 16 | from ts_datasets.forecast.solar_plant import SolarPlant 17 | 18 | __all__ = ["get_dataset", "CustomDataset", "M4", "EnergyPower", "SeattleTrail", "SolarPlant"] 19 | 20 | 21 | def get_dataset(dataset_name: str, rootdir: str = None, **kwargs) -> BaseDataset: 22 | """ 23 | :param dataset_name: the name of the dataset to load, formatted as 24 | ``<name>`` or ``<name>_<subset>``, e.g. ``EnergyPower`` or ``M4_Hourly`` 25 | :param rootdir: the directory where the desired dataset is stored. Not 26 | required if the package :py:mod:`ts_datasets` is installed in editable 27 | mode, i.e. with flag ``-e``. 28 | :param kwargs: keyword arguments for the data loader you are trying to load. 29 | :return: the data loader for the desired dataset (and subset) desired 30 | """ 31 | name_subset = dataset_name.split("_", maxsplit=1) 32 | valid_datasets = set(__all__).difference({"get_dataset"}) 33 | if name_subset[0] in valid_datasets: 34 | cls = globals()[name_subset[0]] 35 | else: 36 | raise KeyError( 37 | "Dataset should be formatted as <name> or " 38 | "<name>_<subset>, where <name> is one of " 39 | f"{valid_datasets}. Got {dataset_name} instead." 40 | ) 41 | if not hasattr(cls, "valid_subsets") and len(name_subset) == 2: 42 | raise ValueError( 43 | f"Dataset {name_subset[0]} does not have any subsets, " 44 | f"but attempted to load subset {name_subset[1]} by " 45 | f"specifying dataset name {dataset_name}." 46 | ) 47 | 48 | if len(name_subset) > 1: 49 | kwargs.update(subset=name_subset[1]) 50 | return cls(rootdir=rootdir, **kwargs) 51 | -------------------------------------------------------------------------------- /ts_datasets/ts_datasets/forecast/energy_power.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import glob 8 | import logging 9 | import os 10 | 11 | import pandas as pd 12 | 13 | from ts_datasets.base import BaseDataset 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class EnergyPower(BaseDataset): 19 | """ 20 | Wrapper to load the open source energy grid power usage dataset. 21 | 22 | - source: https://www.kaggle.com/robikscube/hourly-energy-consumption 23 | - contains one 10-variable time series 24 | """ 25 | 26 | def __init__(self, rootdir=None): 27 | """ 28 | :param rootdir: The root directory at which the dataset can be found. 29 | """ 30 | super().__init__() 31 | if rootdir is None: 32 | fdir = os.path.dirname(os.path.abspath(__file__)) 33 | merlion_root = os.path.abspath(os.path.join(fdir, "..", "..", "..")) 34 | rootdir = os.path.join(merlion_root, "data", "multivariate", "energy_power") 35 | 36 | assert ( 37 | "energy_power" in rootdir.split("/")[-1] 38 | ), "energy_power should be found as the last level of the directory for this dataset" 39 | 40 | dsetdirs = [rootdir] 41 | extension = "csv.gz" 42 | 43 | fnames = sum([sorted(glob.glob(f"{d}/*.{extension}")) for d in dsetdirs], []) 44 | assert len(fnames) == 1, f"rootdir {rootdir} does not contain dataset file." 
45 | 46 | start_timestamp = "2014-01-01 00:00:00" 47 | 48 | for i, fn in enumerate(sorted(fnames)): 49 | df = pd.read_csv(fn, index_col="Datetime", parse_dates=True) 50 | df = df[df.index >= start_timestamp] 51 | df.drop(["NI", "PJM_Load"], axis=1, inplace=True) 52 | df.index.rename("timestamp", inplace=True) 53 | assert isinstance(df.index, pd.DatetimeIndex) 54 | df.sort_index(inplace=True) 55 | 56 | self.time_series.append(df) 57 | self.metadata.append( 58 | { 59 | "trainval": pd.Series(df.index <= "2018-01-01 00:00:00", index=df.index), 60 | "start_timestamp": start_timestamp, 61 | } 62 | ) 63 | -------------------------------------------------------------------------------- /ts_datasets/ts_datasets/forecast/seattle_trail.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import glob 8 | import logging 9 | import os 10 | 11 | import pandas as pd 12 | 13 | from ts_datasets.base import BaseDataset 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class SeattleTrail(BaseDataset): 19 | """ 20 | Wrapper to load the open source Seattle Trail pedestrian/bike traffic 21 | dataset. 22 | 23 | - source: https://www.kaggle.com/city-of-seattle/seattle-burke-gilman-trail 24 | - contains one 5-variable time series 25 | """ 26 | 27 | def __init__(self, rootdir=None): 28 | """ 29 | :param rootdir: The root directory at which the dataset can be found. 30 | """ 31 | super().__init__() 32 | if rootdir is None: 33 | fdir = os.path.dirname(os.path.abspath(__file__)) 34 | merlion_root = os.path.abspath(os.path.join(fdir, "..", "..", "..")) 35 | rootdir = os.path.join(merlion_root, "data", "multivariate", "seattle_trail") 36 | 37 | assert ( 38 | "seattle_trail" in rootdir.split("/")[-1] 39 | ), "seattle_trail should be found as the last level of the directory for this dataset" 40 | 41 | dsetdirs = [rootdir] 42 | extension = "csv" 43 | 44 | fnames = sum([sorted(glob.glob(f"{d}/*.{extension}")) for d in dsetdirs], []) 45 | assert len(fnames) == 1, f"rootdir {rootdir} does not contain dataset file." 46 | for i, fn in enumerate(sorted(fnames)): 47 | df = pd.read_csv(fn) 48 | 49 | df["timestamp"] = pd.to_datetime(df["Date"]) 50 | df.set_index("timestamp", inplace=True) 51 | df.drop("Date", axis=1, inplace=True) 52 | assert isinstance(df.index, pd.DatetimeIndex) 53 | df.sort_index(inplace=True) 54 | 55 | self.time_series.append(df) 56 | self.metadata.append( 57 | {"trainval": pd.Series(df.index <= "2019-01-01 00:00:00", index=df.index), "quantile_clip": 300} 58 | ) 59 | -------------------------------------------------------------------------------- /ts_datasets/ts_datasets/forecast/solar_plant.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2023 salesforce.com, inc. 3 | # All rights reserved. 4 | # SPDX-License-Identifier: BSD-3-Clause 5 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | # 7 | import glob 8 | import logging 9 | import os 10 | import zipfile 11 | 12 | import pandas as pd 13 | 14 | from ts_datasets.base import BaseDataset 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class SolarPlant(BaseDataset): 20 | """ 21 | Wrapper to load the open source solar plant power dataset. 
22 |
23 |     - source: https://www.nrel.gov/grid/solar-power-data.html
24 |     - contains one 405-variable time series
25 |
26 |     .. note::
27 |
28 |         By default, the loader includes only the first 100 (of 405) variables; pass ``num_columns`` to change how many are returned.
29 |     """
30 |
31 |     def __init__(self, rootdir=None, num_columns=100):
32 |         """
33 |         :param rootdir: The root directory at which the dataset can be found.
34 |         :param num_columns: indicates how many univariate columns should be returned
35 |         """
36 |         super().__init__()
37 |         if rootdir is None:
38 |             fdir = os.path.dirname(os.path.abspath(__file__))
39 |             merlion_root = os.path.abspath(os.path.join(fdir, "..", "..", ".."))
40 |             rootdir = os.path.join(merlion_root, "data", "multivariate", "solar_plant")
41 |
42 |         assert (
43 |             "solar_plant" in rootdir.split("/")[-1]
44 |         ), "solar_plant should be found as the last level of the directory for this dataset"
45 |
46 |         # Get all filenames, extracting the zipfile if needed
47 |         fnames = glob.glob(f"{rootdir}/*.csv")
48 |         if len(fnames) == 0 and os.path.isfile(f"{rootdir}/merged.zip"):
49 |             with zipfile.ZipFile(f"{rootdir}/merged.zip", "r") as zip_ref:
50 |                 zip_ref.extractall(rootdir)
51 |             fnames = glob.glob(f"{rootdir}/*.csv")
52 |         assert len(fnames) == 1, f"rootdir {rootdir} does not contain dataset file."
53 |
54 |         for i, fn in enumerate(sorted(fnames)):
55 |
56 |             df = pd.read_csv(fn)
57 |
58 |             df["timestamp"] = pd.to_datetime(df["Datetime"])
59 |             df.set_index("timestamp", inplace=True)
60 |             df.drop(["LocalTime", "Datetime"], axis=1, inplace=True)
61 |             num_columns = min(num_columns, len(df.columns))
62 |             cols = [f"Power_{i}" for i in range(num_columns)]
63 |             df = df[cols]
64 |             assert isinstance(df.index, pd.DatetimeIndex)
65 |             df.sort_index(inplace=True)
66 |
67 |             self.time_series.append(df)
68 |             self.metadata.append(
69 |                 {
70 |                     "trainval": pd.Series(df.index <= "2006-10-01 00:00:00", index=df.index),
71 |                     "granularity": "30min",
72 |                     "aggregation": "Sum",
73 |                 }
74 |             )
75 |
--------------------------------------------------------------------------------