├── .dvc ├── .gitignore ├── config └── plots │ ├── confusion.json │ ├── confusion_normalized.json │ ├── default.json │ ├── linear.json │ ├── scatter.json │ └── smooth.json ├── .dvcignore ├── .gitignore ├── LICENSE ├── MLproject ├── README.md ├── conda.yaml ├── data ├── .gitignore └── raw │ ├── .gitignore │ ├── SWaT_Dataset_Attack_v0.csv.dvc │ └── SWaT_Dataset_Normal_v1.csv.dvc ├── dvc.lock ├── dvc.yaml ├── metrics.json ├── params.yaml ├── requirements.txt └── src ├── featurize.py ├── model.py ├── train.py └── validate.py /.dvc/.gitignore: -------------------------------------------------------------------------------- 1 | /config.local 2 | /tmp 3 | /cache 4 | -------------------------------------------------------------------------------- /.dvc/config: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/finloop/usad-torchlightning/4aba4ed1b202e6320cce4a6fd5528cd415f9e255/.dvc/config -------------------------------------------------------------------------------- /.dvc/plots/confusion.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "facet": { 8 | "field": "rev", 9 | "type": "nominal" 10 | }, 11 | "spec": { 12 | "transform": [ 13 | { 14 | "aggregate": [ 15 | { 16 | "op": "count", 17 | "as": "xy_count" 18 | } 19 | ], 20 | "groupby": [ 21 | "", 22 | "" 23 | ] 24 | }, 25 | { 26 | "impute": "xy_count", 27 | "groupby": [ 28 | "rev", 29 | "" 30 | ], 31 | "key": "", 32 | "value": 0 33 | }, 34 | { 35 | "impute": "xy_count", 36 | "groupby": [ 37 | "rev", 38 | "" 39 | ], 40 | "key": "", 41 | "value": 0 42 | }, 43 | { 44 | "joinaggregate": [ 45 | { 46 | "op": "max", 47 | "field": "xy_count", 48 | "as": "max_count" 49 | } 50 | ], 51 | "groupby": [] 52 | }, 53 | { 54 | "calculate": "datum.xy_count / datum.max_count", 55 | "as": "percent_of_max" 56 | } 57 | ], 58 | "encoding": { 59 | "x": { 60 | "field": "", 61 | "type": "nominal", 62 | "sort": "ascending", 63 | "title": "" 64 | }, 65 | "y": { 66 | "field": "", 67 | "type": "nominal", 68 | "sort": "ascending", 69 | "title": "" 70 | } 71 | }, 72 | "layer": [ 73 | { 74 | "mark": "rect", 75 | "width": 300, 76 | "height": 300, 77 | "encoding": { 78 | "color": { 79 | "field": "xy_count", 80 | "type": "quantitative", 81 | "title": "", 82 | "scale": { 83 | "domainMin": 0, 84 | "nice": true 85 | } 86 | } 87 | } 88 | }, 89 | { 90 | "mark": "text", 91 | "encoding": { 92 | "text": { 93 | "field": "xy_count", 94 | "type": "quantitative" 95 | }, 96 | "color": { 97 | "condition": { 98 | "test": "datum.percent_of_max > 0.5", 99 | "value": "white" 100 | }, 101 | "value": "black" 102 | } 103 | } 104 | } 105 | ] 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /.dvc/plots/confusion_normalized.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "facet": { 8 | "field": "rev", 9 | "type": "nominal" 10 | }, 11 | "spec": { 12 | "transform": [ 13 | { 14 | "aggregate": [ 15 | { 16 | "op": "count", 17 | "as": "xy_count" 18 | } 19 | ], 20 | "groupby": [ 21 | "", 22 | "" 23 | ] 24 | }, 25 | { 26 | "impute": "xy_count", 27 | "groupby": [ 28 | "rev", 29 | "" 30 | ], 31 | "key": "", 32 | "value": 0 33 | }, 34 | { 35 | "impute": "xy_count", 
36 | "groupby": [ 37 | "rev", 38 | "" 39 | ], 40 | "key": "", 41 | "value": 0 42 | }, 43 | { 44 | "joinaggregate": [ 45 | { 46 | "op": "sum", 47 | "field": "xy_count", 48 | "as": "sum_y" 49 | } 50 | ], 51 | "groupby": [ 52 | "" 53 | ] 54 | }, 55 | { 56 | "calculate": "datum.xy_count / datum.sum_y", 57 | "as": "percent_of_y" 58 | } 59 | ], 60 | "encoding": { 61 | "x": { 62 | "field": "", 63 | "type": "nominal", 64 | "sort": "ascending", 65 | "title": "" 66 | }, 67 | "y": { 68 | "field": "", 69 | "type": "nominal", 70 | "sort": "ascending", 71 | "title": "" 72 | } 73 | }, 74 | "layer": [ 75 | { 76 | "mark": "rect", 77 | "width": 300, 78 | "height": 300, 79 | "encoding": { 80 | "color": { 81 | "field": "percent_of_y", 82 | "type": "quantitative", 83 | "title": "", 84 | "scale": { 85 | "domain": [ 86 | 0, 87 | 1 88 | ] 89 | } 90 | } 91 | } 92 | }, 93 | { 94 | "mark": "text", 95 | "encoding": { 96 | "text": { 97 | "field": "percent_of_y", 98 | "type": "quantitative", 99 | "format": ".2f" 100 | }, 101 | "color": { 102 | "condition": { 103 | "test": "datum.percent_of_y > 0.5", 104 | "value": "white" 105 | }, 106 | "value": "black" 107 | } 108 | } 109 | } 110 | ] 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /.dvc/plots/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "width": 300, 8 | "height": 300, 9 | "mark": { 10 | "type": "line" 11 | }, 12 | "encoding": { 13 | "x": { 14 | "field": "", 15 | "type": "quantitative", 16 | "title": "" 17 | }, 18 | "y": { 19 | "field": "", 20 | "type": "quantitative", 21 | "title": "", 22 | "scale": { 23 | "zero": false 24 | } 25 | }, 26 | "color": { 27 | "field": "rev", 28 | "type": "nominal" 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /.dvc/plots/linear.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "width": 300, 8 | "height": 300, 9 | "layer": [ 10 | { 11 | "encoding": { 12 | "x": { 13 | "field": "", 14 | "type": "quantitative", 15 | "title": "" 16 | }, 17 | "y": { 18 | "field": "", 19 | "type": "quantitative", 20 | "title": "", 21 | "scale": { 22 | "zero": false 23 | } 24 | }, 25 | "color": { 26 | "field": "rev", 27 | "type": "nominal" 28 | } 29 | }, 30 | "layer": [ 31 | { 32 | "mark": "line" 33 | }, 34 | { 35 | "selection": { 36 | "label": { 37 | "type": "single", 38 | "nearest": true, 39 | "on": "mouseover", 40 | "encodings": [ 41 | "x" 42 | ], 43 | "empty": "none", 44 | "clear": "mouseout" 45 | } 46 | }, 47 | "mark": "point", 48 | "encoding": { 49 | "opacity": { 50 | "condition": { 51 | "selection": "label", 52 | "value": 1 53 | }, 54 | "value": 0 55 | } 56 | } 57 | } 58 | ] 59 | }, 60 | { 61 | "transform": [ 62 | { 63 | "filter": { 64 | "selection": "label" 65 | } 66 | } 67 | ], 68 | "layer": [ 69 | { 70 | "mark": { 71 | "type": "rule", 72 | "color": "gray" 73 | }, 74 | "encoding": { 75 | "x": { 76 | "field": "", 77 | "type": "quantitative" 78 | } 79 | } 80 | }, 81 | { 82 | "encoding": { 83 | "text": { 84 | "type": "quantitative", 85 | "field": "" 86 | }, 87 | "x": { 88 | "field": "", 89 | "type": "quantitative" 90 | }, 91 | "y": { 92 | "field": "", 93 | "type": "quantitative" 94 | } 95 | }, 96 | 
"layer": [ 97 | { 98 | "mark": { 99 | "type": "text", 100 | "align": "left", 101 | "dx": 5, 102 | "dy": -5 103 | }, 104 | "encoding": { 105 | "color": { 106 | "type": "nominal", 107 | "field": "rev" 108 | } 109 | } 110 | } 111 | ] 112 | } 113 | ] 114 | } 115 | ] 116 | } 117 | -------------------------------------------------------------------------------- /.dvc/plots/scatter.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "width": 300, 8 | "height": 300, 9 | "layer": [ 10 | { 11 | "encoding": { 12 | "x": { 13 | "field": "", 14 | "type": "quantitative", 15 | "title": "" 16 | }, 17 | "y": { 18 | "field": "", 19 | "type": "quantitative", 20 | "title": "", 21 | "scale": { 22 | "zero": false 23 | } 24 | }, 25 | "color": { 26 | "field": "rev", 27 | "type": "nominal" 28 | } 29 | }, 30 | "layer": [ 31 | { 32 | "mark": "point" 33 | }, 34 | { 35 | "selection": { 36 | "label": { 37 | "type": "single", 38 | "nearest": true, 39 | "on": "mouseover", 40 | "encodings": [ 41 | "x" 42 | ], 43 | "empty": "none", 44 | "clear": "mouseout" 45 | } 46 | }, 47 | "mark": "point", 48 | "encoding": { 49 | "opacity": { 50 | "condition": { 51 | "selection": "label", 52 | "value": 1 53 | }, 54 | "value": 0 55 | } 56 | } 57 | } 58 | ] 59 | }, 60 | { 61 | "transform": [ 62 | { 63 | "filter": { 64 | "selection": "label" 65 | } 66 | } 67 | ], 68 | "layer": [ 69 | { 70 | "encoding": { 71 | "text": { 72 | "type": "quantitative", 73 | "field": "" 74 | }, 75 | "x": { 76 | "field": "", 77 | "type": "quantitative" 78 | }, 79 | "y": { 80 | "field": "", 81 | "type": "quantitative" 82 | } 83 | }, 84 | "layer": [ 85 | { 86 | "mark": { 87 | "type": "text", 88 | "align": "left", 89 | "dx": 5, 90 | "dy": -5 91 | }, 92 | "encoding": { 93 | "color": { 94 | "type": "nominal", 95 | "field": "rev" 96 | } 97 | } 98 | } 99 | ] 100 | } 101 | ] 102 | } 103 | ] 104 | } 105 | -------------------------------------------------------------------------------- /.dvc/plots/smooth.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "mark": { 8 | "type": "line" 9 | }, 10 | "encoding": { 11 | "x": { 12 | "field": "", 13 | "type": "quantitative", 14 | "title": "" 15 | }, 16 | "y": { 17 | "field": "", 18 | "type": "quantitative", 19 | "title": "", 20 | "scale": { 21 | "zero": false 22 | } 23 | }, 24 | "color": { 25 | "field": "rev", 26 | "type": "nominal" 27 | } 28 | }, 29 | "transform": [ 30 | { 31 | "loess": "", 32 | "on": "", 33 | "groupby": [ 34 | "rev" 35 | ], 36 | "bandwidth": 0.3 37 | } 38 | ] 39 | } 40 | -------------------------------------------------------------------------------- /.dvcignore: -------------------------------------------------------------------------------- 1 | # Add patterns of files dvc should ignore, which could improve 2 | # the performance. 
Learn more at 3 | # https://dvc.org/doc/user-guide/dvcignore 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # mlflow 132 | mlruns 133 | 134 | # pytorch-lightning 135 | lightning_logs 136 | 137 | # pycharm 138 | .idea 139 | 140 | # dvc 141 | .dvc 142 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Piotr Krawiec 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MLproject: -------------------------------------------------------------------------------- 1 | name: usad 2 | 3 | conda_env: conda.yaml 4 | 5 | entry_points: 6 | main: 7 | command: "python3 src/featurize.py" 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # usad-torchlightning 2 | Implementation of USAD (UnSupervised Anomaly Detection on multivariate time 3 | series) in PyTorch Lightning. 4 | 5 | Original implementation by: Francesco Galati. 6 | Original code can be found at: [USAD](https://github.com/manigalati/usad). 7 | 8 | # Getting started 9 | To start, first download the data. 10 | ## Data 11 | The data can be downloaded from: 12 | - Normal data: [SWaT Dataset Normal](https://drive.google.com/open?id=1rVJ5ry5GG-ZZi5yI4x9lICB8VhErXwCw) 13 | - Attack data: [SWaT Dataset Attack](https://drive.google.com/open?id=1iDYc0OEmidN712fquOBRFjln90SbpaE7) 14 | 15 | After downloading, put both files in `data/raw`. 16 | 17 | ## Running the model 18 | ```commandline 19 | dvc exp run 20 | ``` 21 | 22 | ## Changing the parameters 23 | All the parameters (for example, the number of training epochs) can be found in `params.yaml`.
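A parameter can also be overridden for a single run from the command line via DVC's `--set-param` (`-S`) flag, without editing the file; for example (assuming a DVC 2.x CLI):
```commandline
dvc exp run --set-param train.epochs=50
```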
24 | 25 | ## Requirements 26 | - pytorch 1.9 27 | - dvc 28 | - pytorch-lightning 29 | - python 3.8 30 | 31 | # How to cite 32 | If you use this software, please cite the following paper: 33 | ``` 34 | Audibert, J., Michiardi, P., Guyard, F., Marti, S., Zuluaga, M. A. (2020). 35 | USAD: UnSupervised Anomaly Detection on multivariate time series. 36 | Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, August 23-27, 2020. 37 | ``` -------------------------------------------------------------------------------- /conda.yaml: -------------------------------------------------------------------------------- 1 | name: usad-torchlightning 2 | channels: 3 | - conda-forge 4 | - pytorch 5 | - nvidia 6 | - defaults 7 | dependencies: 8 | - numpy 9 | - python=3.8.12 10 | - scikit-learn 11 | - matplotlib 12 | - seaborn 13 | - pytorch 14 | - cudatoolkit=11.1 15 | - pytorch-lightning 16 | - dvc 17 | prefix: /home/pk/miniconda3/envs/usad-torchlightning 18 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | /featurize 2 | /predictions 3 | -------------------------------------------------------------------------------- /data/raw/.gitignore: -------------------------------------------------------------------------------- 1 | /SWaT_Dataset_Attack_v0.csv 2 | /SWaT_Dataset_Normal_v1.csv 3 | -------------------------------------------------------------------------------- /data/raw/SWaT_Dataset_Attack_v0.csv.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 5f45dffcbfb5735c7fb867f5c8124c72 3 | size: 133454848 4 | path: SWaT_Dataset_Attack_v0.csv 5 | -------------------------------------------------------------------------------- /data/raw/SWaT_Dataset_Normal_v1.csv.dvc: -------------------------------------------------------------------------------- 1 | outs: 2 | - md5: 8d5520a9eb17ebbb461df1137131f8dd 3 | size: 171724418 4 | path: SWaT_Dataset_Normal_v1.csv 5 | -------------------------------------------------------------------------------- /dvc.lock: -------------------------------------------------------------------------------- 1 | schema: '2.0' 2 | stages: 3 | featurize: 4 | cmd: python3 src/featurize.py data/raw data/featurize 5 | deps: 6 | - path: data/raw 7 | md5: 9d3d74e16f896a88df4f97f9755bb4b0.dir 8 | size: 305179520 9 | nfiles: 5 10 | - path: src/featurize.py 11 | md5: c94a3fff61a5c2f572b027ce9c6d07c0 12 | size: 2022 13 | params: 14 | params.yaml: 15 | featurize.max_row_limit: 1000000 16 | featurize.window_size: 8 17 | outs: 18 | - path: data/featurize 19 | md5: 96dc364478248eea5f36cdb343cb0863.dir 20 | size: 83232709 21 | nfiles: 1 22 | train: 23 | cmd: python3 src/train.py data/featurize data/predictions 24 | deps: 25 | - path: data/featurize 26 | md5: 96dc364478248eea5f36cdb343cb0863.dir 27 | size: 83232709 28 | nfiles: 1 29 | - path: src/model.py 30 | md5: 39e8ead6259cd85a65a3e9e73ce9b70f 31 | size: 3742 32 | - path: src/train.py 33 | md5: 974d89f91c2f614d382e671e72ab5292 34 | size: 2185 35 | params: 36 | params.yaml: 37 | featurize.window_size: 8 38 | train.batch_size: 10000 39 | train.epochs: 100 40 | train.hidden_size: 20 41 | outs: 42 | - path: data/predictions 43 | md5: 7bde82a8b14a9e6d4a39021c3437d72b.dir 44 | size: 1374793 45 | nfiles: 1 46 | validate: 47 | cmd: python3 src/validate.py data/featurize data/predictions metrics.json 48 | deps: 49 | - 
path: data/featurize 50 | md5: 96dc364478248eea5f36cdb343cb0863.dir 51 | size: 83232709 52 | nfiles: 1 53 | - path: data/predictions 54 | md5: 7bde82a8b14a9e6d4a39021c3437d72b.dir 55 | size: 1374793 56 | nfiles: 1 57 | - path: src/validate.py 58 | md5: 313ee87dc6cd76edf73d77ea95526f44 59 | size: 2147 60 | params: 61 | params.yaml: 62 | featurize.window_size: 8 63 | outs: 64 | - path: metrics.json 65 | md5: f68b5abe81d2548d6d369a202a4a6815 66 | size: 92 67 | -------------------------------------------------------------------------------- /dvc.yaml: -------------------------------------------------------------------------------- 1 | stages: 2 | featurize: 3 | cmd: python3 src/featurize.py data/raw data/featurize 4 | deps: 5 | - data/raw 6 | - src/featurize.py 7 | params: 8 | - featurize.max_row_limit 9 | - featurize.window_size 10 | outs: 11 | - data/featurize 12 | train: 13 | cmd: python3 src/train.py data/featurize data/predictions 14 | deps: 15 | - data/featurize 16 | - src/model.py 17 | - src/train.py 18 | params: 19 | - featurize.window_size 20 | - train.batch_size 21 | - train.epochs 22 | - train.hidden_size 23 | outs: 24 | - data/predictions 25 | validate: 26 | cmd: python3 src/validate.py data/featurize data/predictions metrics.json 27 | deps: 28 | - data/featurize 29 | - data/predictions 30 | - src/validate.py 31 | params: 32 | - featurize.window_size 33 | metrics: 34 | - metrics.json: 35 | cache: false 36 | -------------------------------------------------------------------------------- /metrics.json: -------------------------------------------------------------------------------- 1 | {"threshold": 0.0, "acc": 0.12194856316026947, "recall": 1.0, "f1": 0.2174} -------------------------------------------------------------------------------- /params.yaml: -------------------------------------------------------------------------------- 1 | featurize: 2 | max_row_limit: 1000000 3 | window_size: 8 4 | 5 | train: 6 | batch_size: 10000 7 | epochs: 100 8 | hidden_size: 20 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.21 2 | matplotlib 3 | seaborn 4 | jupyterlab 5 | -------------------------------------------------------------------------------- /src/featurize.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn import preprocessing 7 | import yaml 8 | 9 | 10 | def create_windows(data: np.ndarray, window_size):  # stack overlapping sliding windows of consecutive rows 11 | return data[np.arange(window_size) + np.arange( 12 | data.shape[0] - window_size).reshape(-1, 1)] 13 | 14 | 15 | def load_dataset(filename, nrows, sep, decimal): 16 | df = pd.read_csv(filename, nrows=nrows, decimal=decimal, sep=sep, low_memory=False) 17 | labels_ = np.array([float(label != 'Normal') for label in 18 | df["Normal/Attack"].values]) 19 | df = df.drop(["Timestamp", "Normal/Attack"], axis=1) 20 | 21 | for i in list(df): 22 | df[i] = df[i].apply(lambda x: str(x).replace(",", "."))  # normalize decimal commas to dots before the float cast 23 | 24 | return df.astype(float), labels_ 25 | 26 | 27 | if __name__ == "__main__": 28 | # Read YAML params 29 | params = yaml.safe_load(open('params.yaml'))['featurize'] 30 | max_row_limit = params["max_row_limit"] 31 | window_size = params["window_size"] 32 | 33 | # Read command line params 34 | if len(sys.argv) != 3: 35 | sys.stderr.write('Arguments error. 
Usage:\n') 36 | sys.stderr.write( 37 | '\tpython featurize.py data-dir-path features-dir-path\n' 38 | ) 39 | sys.exit(1) 40 | 41 | data_dir = sys.argv[1] 42 | out_dir = sys.argv[2] 43 | 44 | os.makedirs(out_dir, exist_ok=True) 45 | 46 | normal_csv = os.path.join(data_dir, "SWaT_Dataset_Normal_v1.csv") 47 | attack_csv = os.path.join(data_dir, "SWaT_Dataset_Attack_v0.csv") 48 | 49 | train_file = os.path.join(out_dir, "data.npz") 50 | 51 | normal, _ = load_dataset(normal_csv, nrows=max_row_limit, sep=",", decimal=",") 52 | attack, labels = load_dataset(attack_csv, nrows=max_row_limit, sep=";", decimal=",")  # ';'-separated file with ',' decimal marks 53 | 54 | sc = preprocessing.StandardScaler() 55 | 56 | normal = sc.fit_transform(normal.values) 57 | attack = sc.transform(attack.values) 58 | 59 | windows_normal = create_windows(normal, window_size).reshape(-1, normal.shape[1]*window_size) 60 | windows_attack = create_windows(attack, window_size).reshape(-1, attack.shape[1]*window_size) 61 | 62 | np.savez_compressed(train_file, train=windows_normal, test=windows_attack, labels=labels) 63 | -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.optim.adam import Adam 4 | from pytorch_lightning.core.lightning import LightningModule 5 | from collections import OrderedDict 6 | 7 | 8 | class Encoder(LightningModule): 9 | def __init__(self, input_size, latent_size, s1=2, s2=4): 10 | super().__init__() 11 | 12 | self.layer_1 = nn.Linear(input_size, input_size // s1) 13 | self.layer_2 = nn.Linear(input_size // s1, input_size // s2) 14 | self.layer_3 = nn.Linear(input_size // s2, latent_size) 15 | 16 | self.activation = nn.ReLU(True) 17 | 18 | def forward(self, x): 19 | out = self.layer_1(x) 20 | out = self.activation(out) 21 | out = self.layer_2(out) 22 | out = self.activation(out) 23 | out = self.layer_3(out) 24 | z = self.activation(out) 25 | return z 26 | 27 | 28 | class Decoder(LightningModule): 29 | def __init__(self, latent_size, output_size, s1=2, s2=4): 30 | super().__init__() 31 | 32 | self.layer_1 = nn.Linear(latent_size, output_size // s2) 33 | self.layer_2 = nn.Linear(output_size // s2, output_size // s1) 34 | self.layer_3 = nn.Linear(output_size // s1, output_size) 35 | 36 | self.relu = nn.ReLU(True) 37 | self.sigmoid = nn.Sigmoid()  # note: unused; the final activation below is ReLU 38 | 39 | def forward(self, x): 40 | out = self.layer_1(x) 41 | out = self.relu(out) 42 | out = self.layer_2(out) 43 | out = self.relu(out) 44 | out = self.layer_3(out) 45 | w = self.relu(out) 46 | return w 47 | 48 | 49 | class USADModel(LightningModule): 50 | def __init__(self, window_size, z_size, learning_rate=1e-3): 51 | super().__init__() 52 | 53 | self.encoder = Encoder(window_size, z_size) 54 | self.decoder_1 = Decoder(z_size, window_size) 55 | self.decoder_2 = Decoder(z_size, window_size) 56 | self.learning_rate = learning_rate 57 | 58 | def forward(self, x, alpha=.5, beta=.5):  # anomaly score: weighted reconstruction errors of the two decoders 59 | w1 = self.decoder_1(self.encoder(x)) 60 | w2 = self.decoder_2(self.encoder(w1)) 61 | 62 | return alpha * torch.mean((x - w1)**2, axis=1) + \ 63 | beta * torch.mean((x - w2)**2, axis=1) 64 | 65 | def configure_optimizers(self): 66 | optimizer_1 = Adam(list(self.encoder.parameters()) + list( 67 | self.decoder_1.parameters()), lr=self.learning_rate) 68 | optimizer_2 = Adam(list(self.encoder.parameters()) + list( 69 | self.decoder_2.parameters()), lr=self.learning_rate) 70 | 71 | return optimizer_1, optimizer_2 72 | 73 | def training_step(self, 
train_batch, batch_idx, optimizer_idx): 74 | n = self.trainer.current_epoch + 1  # epoch-dependent weight from the USAD paper 75 | 76 | z = self.encoder(train_batch) 77 | w1 = self.decoder_1(z) 78 | 79 | w22 = self.decoder_2(self.encoder(w1)) 80 | 81 | # Train AE1 82 | if optimizer_idx == 0: 83 | loss1 = 1 / n * torch.mean((train_batch - w1) ** 2) + \ 84 | (1 - 1 / n) * torch.mean((train_batch - w22) ** 2) 85 | output = OrderedDict({"loss": loss1}) 86 | return output 87 | 88 | if optimizer_idx == 1:  # Train AE2 (adversarial phase) 89 | w2 = self.decoder_2(z) 90 | loss2 = 1 / n * torch.mean((train_batch - w2) ** 2) - \ 91 | (1 - 1 / n) * torch.mean((train_batch - w22) ** 2) 92 | output = OrderedDict({"loss": loss2}) 93 | return output 94 | 95 | def validation_step(self, test_batch, batch_idx): 96 | n = self.trainer.current_epoch + 1 97 | z = self.encoder(test_batch) 98 | w1 = self.decoder_1(z) 99 | 100 | w22 = self.decoder_2(self.encoder(w1)) 101 | 102 | w2 = self.decoder_2(z) 103 | loss2 = 1 / n * torch.mean((test_batch - w2) ** 2) - \ 104 | (1 - 1 / n) * torch.mean((test_batch - w22) ** 2) 105 | output = OrderedDict({"val_loss": loss2}) 106 | return output 107 | 108 | def validation_epoch_end(self, validation_step_outputs): 109 | temp = [] 110 | for output in validation_step_outputs: 111 | temp += [output["val_loss"].item()] 112 | self.log("val_loss", torch.mean(torch.tensor(temp)))  # values returned here are ignored by Lightning, so log instead -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import yaml 3 | from model import USADModel 4 | from torch.utils.data import DataLoader, Dataset 5 | import sys 6 | import os 7 | import torch 8 | from pytorch_lightning import Trainer 9 | 10 | # Dataset wrapping one array ("train" or "test") from the .npz written by featurize.py 11 | class NpzDataset(Dataset): 12 | def __init__(self, path, key="data"): 13 | self.path = path 14 | self.data = np.load(path)[key] 15 | 16 | def __getitem__(self, index): 17 | return torch.from_numpy(self.data[index]).float() 18 | 19 | def __len__(self): 20 | return len(self.data) 21 | 22 | 23 | if __name__ == "__main__": 24 | # Read YAML params 25 | params = yaml.safe_load(open('params.yaml')) 26 | 27 | WINDOW_SIZE = params['featurize']['window_size'] 28 | BATCH_SIZE = params['train']["batch_size"] 29 | EPOCHS = params['train']["epochs"] 30 | HIDDEN_SIZE = params['train']["hidden_size"] 31 | 32 | if len(sys.argv) != 3: 33 | sys.stderr.write('Arguments error. 
Usage:\n') 34 | sys.stderr.write( 35 | '\tpython train.py features-dir-path predict-dir-path\n' 36 | ) 37 | sys.exit(1) 38 | 39 | data_dir = sys.argv[1] 40 | predict_dir = sys.argv[2] 41 | 42 | os.makedirs(predict_dir, exist_ok=True) 43 | 44 | data_file = os.path.join(data_dir, "data.npz") 45 | 46 | test = NpzDataset(data_file, "test") 47 | train = NpzDataset(data_file, "train") 48 | 49 | test_loader = DataLoader(test, batch_size=BATCH_SIZE, num_workers=3) 50 | train_loader = DataLoader(train, batch_size=BATCH_SIZE, num_workers=3) 51 | 52 | NMETRICS = test[0].size()[0] // WINDOW_SIZE  # number of sensor channels (features per timestep) 53 | 54 | model = USADModel(window_size=WINDOW_SIZE * NMETRICS, z_size=WINDOW_SIZE * HIDDEN_SIZE) 55 | 56 | trainer = Trainer(gpus=1, max_epochs=EPOCHS) 57 | 58 | trainer.fit(model, train_loader, train_loader)  # the training set doubles as the validation set 59 | 60 | y_pred = trainer.predict(model, test_loader) 61 | 62 | y_pred = np.concatenate([torch.stack(y_pred[:-1]).flatten().detach().cpu().numpy(), 63 | y_pred[-1].flatten().detach().cpu().numpy()]) 64 | 65 | np.savez_compressed(os.path.join(predict_dir, "y_pred.npz"), y_pred=y_pred) -------------------------------------------------------------------------------- /src/validate.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import sys 3 | import os 4 | import numpy as np 5 | import json 6 | from sklearn.metrics import f1_score, recall_score, accuracy_score 7 | 8 | if __name__ == "__main__": 9 | params = yaml.safe_load(open('params.yaml')) 10 | 11 | if len(sys.argv) != 4: 12 | sys.stderr.write('Arguments error. 
Usage:\n') 13 | sys.stderr.write( 14 | '\tpython validate.py featurize-dir-path predict-dir-path metrics-file\n' 15 | ) 16 | sys.exit(1) 17 | 18 | featurize_dir = sys.argv[1] 19 | predict_dir = sys.argv[2] 20 | metrics_file = sys.argv[3] 21 | 22 | y_pred = np.load(os.path.join(predict_dir, "y_pred.npz"))["y_pred"] 23 | labels = np.load(os.path.join(featurize_dir, "data.npz"))["labels"] 24 | 25 | WINDOW_SIZE = params["featurize"]["window_size"] 26 | windows_labels = [] 27 | for i in range(len(labels) - WINDOW_SIZE): 28 | windows_labels.append(list(np.int_(labels[i:i + WINDOW_SIZE]))) 29 | 30 | y_test = [1.0 if (np.sum(window) > 0) else 0.0 for window in windows_labels]  # a window is anomalous if any timestep is 31 | 32 | 33 | thresholds = np.arange(0.0, np.max(y_pred), np.max(y_pred)/50) 34 | fscore = np.zeros(shape=(len(thresholds))) 35 | #rscore = np.zeros(shape=(len(thresholds))) 36 | 37 | # Sweep candidate thresholds and score each with F1 38 | for index, elem in enumerate(thresholds): 39 | # Binarize the anomaly scores at this threshold 40 | y_pred_label = (y_pred > elem).astype('int') 41 | # Calculate the f-score 42 | fscore[index] = f1_score(y_test, y_pred_label) 43 | #rscore[index] = recall_score(y_test, y_pred_label) 44 | 45 | index = np.argmax(fscore) 46 | thresholdOpt = round(thresholds[index], ndigits=4) 47 | fscoreOpt = round(fscore[index], ndigits=4) 48 | y_pred_label = (y_pred > thresholds[index]).astype('int') 49 | acc = accuracy_score(y_test, y_pred_label) 50 | recall = recall_score(y_test, y_pred_label) 51 | 52 | # Save scores 53 | with open(metrics_file, 'w') as f: 54 | json.dump({'threshold': thresholdOpt, "acc": acc, "recall": recall, "f1": fscoreOpt}, f) 55 | --------------------------------------------------------------------------------
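A note on the threshold search in `src/validate.py`: the script scans a fixed 50-step grid and keeps the threshold with the highest F1. The same optimum can be found over every distinct anomaly score with `sklearn.metrics.precision_recall_curve`; below is a minimal sketch of that alternative (`best_f1_threshold` is a hypothetical helper, not part of this repository):

```python
import numpy as np
from sklearn.metrics import precision_recall_curve


def best_f1_threshold(y_true, scores):
    """Return (threshold, f1) maximizing F1 over all distinct score thresholds."""
    precision, recall, thresholds = precision_recall_curve(y_true, scores)
    # precision/recall have one more entry than thresholds; drop the final
    # point, which corresponds to predicting no positives at all.
    f1 = 2 * precision[:-1] * recall[:-1] / np.clip(precision[:-1] + recall[:-1], 1e-12, None)
    best = int(np.argmax(f1))
    return float(thresholds[best]), float(f1[best])
```

Called as `best_f1_threshold(y_test, y_pred)` with the arrays built in `validate.py`, this would replace the `thresholds`/`fscore` loop while checking every candidate threshold rather than 50 evenly spaced ones.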