├── .dockerignore ├── .gitattributes ├── .github └── workflows │ ├── build.yml │ ├── deployment.yml │ └── train.yml ├── .gitignore ├── .idea ├── .gitignore ├── deployment.xml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── ml-ids.iml ├── modules.xml └── vcs.xml ├── .pylintrc ├── Makefile ├── README.md ├── data ├── README.md └── Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv ├── environment-notebook.yaml ├── environment.yaml ├── ml_ids ├── __init__.py ├── conf.py ├── data │ ├── __init__.py │ ├── dataset.py │ ├── metadata.py │ └── split_dataset.py ├── keras │ ├── __init__.py │ ├── callbacks.py │ ├── evaluation.py │ ├── metrics.py │ ├── model_selection.py │ └── prediction.py ├── libs │ └── dfencoder │ │ └── dataframe.py ├── model_selection.py ├── models │ ├── __init__.py │ └── gradient_boost │ │ ├── __init__.py │ │ ├── mlflow_wrapper.py │ │ └── train.py ├── prediction.py ├── tf_utils.py ├── transform │ ├── __init__.py │ ├── preprocessing.py │ └── sampling.py └── visualization.py ├── models └── gradient_boost │ ├── envs │ ├── local │ │ └── train.py │ └── sagemaker │ │ ├── configs │ │ ├── deploy.json │ │ ├── train-cpu.json │ │ └── train-gpu.json │ │ ├── container │ │ ├── Dockerfile │ │ └── train.py │ │ └── scripts │ │ ├── build_image.sh │ │ ├── deploy.py │ │ ├── push_image_to_ecr.sh │ │ ├── train.py │ │ └── undeploy.py │ ├── project │ ├── MLproject │ ├── conda.yaml │ └── train.py │ ├── training_params.json │ └── training_params_quick_run.json ├── notebooks ├── 01_data-cleanup │ └── data_cleanup.ipynb ├── 02_exploratory-data-analysis │ └── exploratory_data_analysis.ipynb ├── 03_ml-prototype │ ├── ml-prototype.ipynb │ └── models │ │ └── gradient_boost_model.cbm ├── 04_ml-prototype-spark │ ├── ml-prototype-spark.ipynb │ └── models │ │ ├── gb-model │ │ ├── bestModel │ │ │ ├── data │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ ├── .part-00000-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00001-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00002-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00003-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00004-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00005-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00007-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00008-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00009-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00010-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00011-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00013-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00014-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00015-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00016-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00017-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00019-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00020-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00021-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00022-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ 
│ │ │ ├── .part-00023-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── _SUCCESS │ │ │ │ ├── part-00000-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00001-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00002-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00003-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00004-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00005-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00007-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00008-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00009-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00010-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00011-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00013-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00014-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00015-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00016-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00017-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00019-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00020-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00021-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00022-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ └── part-00023-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ ├── metadata │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ ├── .part-00000.crc │ │ │ │ ├── _SUCCESS │ │ │ │ └── part-00000 │ │ │ └── treesMetadata │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ ├── .part-00000-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00001-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00002-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00003-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00004-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00005-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00006-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00007-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00008-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00009-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00010-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00011-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00012-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00013-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00014-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00015-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00016-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00017-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── 
.part-00018-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00019-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── _SUCCESS │ │ │ │ ├── part-00000-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00001-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00002-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00003-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00004-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00005-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00006-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00007-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00008-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00009-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00010-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00011-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00012-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00013-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00014-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00015-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00016-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00017-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00018-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ └── part-00019-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ ├── estimator │ │ │ └── metadata │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ ├── .part-00000.crc │ │ │ │ ├── _SUCCESS │ │ │ │ └── part-00000 │ │ ├── evaluator │ │ │ └── metadata │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ ├── .part-00000.crc │ │ │ │ ├── _SUCCESS │ │ │ │ └── part-00000 │ │ └── metadata │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ └── pipeline-model │ │ ├── metadata │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000.crc │ │ ├── _SUCCESS │ │ └── part-00000 │ │ └── stages │ │ ├── 0_ValueCleaner_57f061a9e393 │ │ └── metadata │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ ├── 1_Imputer_3f8cf4b571a8 │ │ ├── data │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet │ │ └── metadata │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ ├── 2_OneHotEncoderEstimator_f1dc6e50f52e │ │ ├── data │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000-c909fe56-90d1-4202-a5f4-69907defba9a-c000.snappy.parquet.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000-c909fe56-90d1-4202-a5f4-69907defba9a-c000.snappy.parquet │ │ └── metadata │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ ├── 3_VectorAssembler_ef6b7bf933ee │ │ └── metadata │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ └── 4_BinaryLabelMaker_3b174e5e0c29 │ │ └── metadata │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000.crc │ │ ├── _SUCCESS │ │ └── part-00000 ├── 05_anomaly_detection │ ├── dl-anomaly-detection.ipynb │ ├── img │ │ ├── denoising_autoencoder.png │ │ 
├── stacked_autoencoder.png │ │ └── undercomplete_autoencoder.png │ ├── models │ │ ├── denoising_autoencoder_model.h5 │ │ ├── simple_autoencoder_model.h5 │ │ └── stacked_autoencoder_model.h5 │ └── notebook_utils.py ├── 06_dl_classifier │ ├── dl-classifier.ipynb │ ├── models │ │ ├── c0cb0656-558f-4311-b138-9b91ab4d1fe6.h5 │ │ ├── model_class_weight.h5 │ │ ├── model_no_class_weights.h5 │ │ └── opt_model.h5 │ └── notebook_utils.py └── 07_binary_classifier_comparison │ ├── binary-classifier-comparison.ipynb │ ├── models │ └── gb_835066e8-2427-48ca-a521-67195008cb91.catboost │ └── notebook_utils.py ├── project-proposal.pdf ├── setup.cfg ├── setup.py ├── tests ├── data │ └── test_dataset.py ├── transform │ └── test_preprocessing.py └── validation_data │ └── validation.csv └── upload.py /.dockerignore: -------------------------------------------------------------------------------- 1 | build/ 2 | data/ 3 | notebooks/ 4 | tests/ 5 | dataset/ -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.csv filter=lfs diff=lfs merge=lfs -text 2 | tests/validation_data/*.csv -filter=lfs -diff=lfs -merge=lfs -text 3 | *.catboost filter=lfs diff=lfs merge=lfs -text 4 | *.h5 filter=lfs diff=lfs merge=lfs -text 5 | *.cbm filter=lfs diff=lfs merge=lfs -text 6 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v1 11 | 12 | - name: Install dependencies 13 | run: | 14 | conda env create --file environment.yaml 15 | source /usr/share/miniconda/etc/profile.d/conda.sh 16 | conda activate ml-ids 17 | pip install -e . 18 | 19 | - name: Static Type Check 20 | run: | 21 | source /usr/share/miniconda/etc/profile.d/conda.sh 22 | conda activate ml-ids 23 | make typecheck 24 | 25 | - name: Code Quality Check 26 | run: | 27 | source /usr/share/miniconda/etc/profile.d/conda.sh 28 | conda activate ml-ids 29 | make lint-errors 30 | 31 | - name: Test with pytest 32 | run: | 33 | source /usr/share/miniconda/etc/profile.d/conda.sh 34 | conda activate ml-ids 35 | make test 36 | -------------------------------------------------------------------------------- /.github/workflows/deployment.yml: -------------------------------------------------------------------------------- 1 | name: Deploy Model on AWS Sagemaker 2 | 3 | on: 4 | deployment 5 | 6 | jobs: 7 | deploy: 8 | name: Deploy 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v1 14 | 15 | - name: Set Tag in Environment 16 | id: set-aws-tag 17 | run: | 18 | if [ -z "$GITHUB_REF" ] 19 | then 20 | echo "No Tag given. Workflow may only be run on tagged commits." 
21 | exit 1 22 | fi 23 | echo "::set-output name=awstag::$(echo ${GITHUB_REF:10} | sed 's/[^a-zA-Z0-9]/-/g')" 24 | 25 | - name: Set up Python 3.7 26 | uses: actions/setup-python@v1 27 | with: 28 | python-version: 3.7 29 | 30 | - name: Install Python dependencies 31 | run: | 32 | python -m pip install --upgrade pip 33 | pip install click==7.0 34 | pip install boto3==1.10.28 35 | pip install mlflow==1.4.0 36 | 37 | - name: Configure AWS credentials 38 | uses: aws-actions/configure-aws-credentials@v1 39 | with: 40 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 41 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 42 | aws-region: eu-west-1 43 | 44 | - name: Login to Amazon ECR 45 | id: login-ecr 46 | uses: aws-actions/amazon-ecr-login@v1 47 | 48 | - name: Deploy model on AWS Sagemaker 49 | id: deploy-model 50 | env: 51 | AWS_TAG: ${{ steps.set-aws-tag.outputs.awstag }} 52 | run: | 53 | make sagemaker_deploy JOB_ID="ml-ids-sagemaker-$AWS_TAG" 54 | -------------------------------------------------------------------------------- /.github/workflows/train.yml: -------------------------------------------------------------------------------- 1 | name: Train Model on AWS Sagemaker 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'm*' 7 | 8 | jobs: 9 | train: 10 | name: Train 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v1 16 | 17 | - name: Set Tag in Environment 18 | id: set-aws-tag 19 | run: | 20 | if [ -z "$GITHUB_REF" ] 21 | then 22 | echo "No Tag given. Workflow may only be run on tagged commits." 23 | exit 1 24 | fi 25 | echo "::set-output name=awstag::$(echo ${GITHUB_REF:10} | sed 's/[^a-zA-Z0-9]/-/g')" 26 | 27 | - name: Set up Python 3.7 28 | uses: actions/setup-python@v1 29 | with: 30 | python-version: 3.7 31 | 32 | - name: Install Python dependencies 33 | run: | 34 | python -m pip install --upgrade pip 35 | pip install click==7.0 36 | pip install pandas==0.25.2 37 | pip install sagemaker==1.44.3 38 | 39 | - name: Configure AWS credentials 40 | uses: aws-actions/configure-aws-credentials@v1 41 | with: 42 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 43 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 44 | aws-region: eu-west-1 45 | 46 | - name: Login to Amazon ECR 47 | id: login-ecr 48 | uses: aws-actions/amazon-ecr-login@v1 49 | 50 | - name: Build, tag, and push image to Amazon ECR 51 | id: build-image 52 | env: 53 | ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} 54 | ECR_REPOSITORY: ml-ids-train-sagemaker 55 | IMAGE_TAG: ${{ github.sha }} 56 | AWS_TAG: ${{ steps.set-aws-tag.outputs.awstag }} 57 | run: | 58 | docker build -f models/gradient_boost/envs/sagemaker/container/Dockerfile -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG .
59 | docker tag $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG $ECR_REGISTRY/$ECR_REPOSITORY:$AWS_TAG 60 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 61 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$AWS_TAG 62 | echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" 63 | 64 | - name: Train the packaged model on AWS Sagemaker 65 | id: train-model 66 | env: 67 | IMAGE_NAME: ${{ steps.build-image.outputs.image }} 68 | AWS_TAG: ${{ steps.set-aws-tag.outputs.awstag }} 69 | run: | 70 | make sagemaker_train_aws \ 71 | SAGEMAKER_TRAIN_CONFIG_PATH=models/gradient_boost/envs/sagemaker/configs/train-gpu.json \ 72 | SAGEMAKER_IMAGE_NAME=$IMAGE_NAME \ 73 | TRAIN_PARAM_PATH=models/gradient_boost/training_params.json \ 74 | JOB_ID="ml-ids-sagemaker-$AWS_TAG" 75 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | dataset/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 
93 | #Pipfile.lock 94 | 95 | # celery beat schedule file 96 | celerybeat-schedule 97 | 98 | # SageMath parsed files 99 | *.sage.py 100 | 101 | # Environments 102 | .env 103 | .venv 104 | env/ 105 | venv/ 106 | ENV/ 107 | env.bak/ 108 | venv.bak/ 109 | 110 | # Spyder project settings 111 | .spyderproject 112 | .spyproject 113 | 114 | # Rope project settings 115 | .ropeproject 116 | 117 | # mkdocs documentation 118 | /site 119 | 120 | # mypy 121 | .mypy_cache/ 122 | .dmypy.json 123 | dmypy.json 124 | 125 | # Pyre type checker 126 | .pyre/ 127 | 128 | # Catboost 129 | catboost_info 130 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Default ignored files 3 | /workspace.xml -------------------------------------------------------------------------------- /.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 15 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/ml-ids.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 13 | 14 | 17 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SAGEMAKER_TRAIN_CONFIG_PATH=models/gradient_boost/envs/sagemaker/configs/train-gpu.json 2 | SAGEMAKER_DEPLOY_CONFIG_PATH=models/gradient_boost/envs/sagemaker/configs/deploy.json 3 | TRAIN_PARAM_PATH=models/gradient_boost/training_params.json 4 | TRAIN_PATH=dataset/train.h5 5 | VAL_PATH=dataset/val.h5 6 | TEST_PATH=dataset/test.h5 7 | 8 | clean: 9 | -rm -r -f build 10 | mkdir build 11 | 12 | test: 13 | python -m pytest tests 14 | 15 | lint: 16 | pylint ml_ids 17 | 18 | lint-errors: 19 | pylint ml_ids -E 20 | 21 | typecheck: 22 | mypy ml_ids 23 | 24 | split_dataset: 25 | mkdir -p dataset 26 | python ./ml_ids/data/split_dataset.py \ 27 | --dataset-path $(DATASET_PATH) \ 28 | --output-path dataset \ 29 | --random-seed 42 30 | 31 | train_local: 32 | python ./models/gradient_boost/envs/local/train.py \ 33 | --train-path $(TRAIN_PATH) \ 34 | --val-path $(VAL_PATH) \ 35 | --test-path $(TEST_PATH) \ 36 | --output-path build/models/gradient_boost \ 37 | --param-path $(TRAIN_PARAM_PATH) 38 | 39 | sagemaker_build_image: 40 | ./models/gradient_boost/envs/sagemaker/scripts/build_image.sh ml-ids-train-sagemaker $(TAG) 41 | 42 | sagemaker_push_image: 43 | 
./models/gradient_boost/envs/sagemaker/scripts/push_image_to_ecr.sh ml-ids-train-sagemaker $(TAG) | grep -Po '(?<=^image-name=).*' > sagemaker-image-name.txt 44 | 45 | sagemaker_train_local: 46 | python ./models/gradient_boost/envs/sagemaker/scripts/train.py \ 47 | --config-path $(SAGEMAKER_TRAIN_CONFIG_PATH) \ 48 | --param-path $(TRAIN_PARAM_PATH) \ 49 | --mode LOCAL \ 50 | --image-name "ml-ids-train-sagemaker:$(TAG)" \ 51 | --job-id "ml-ids-sagemaker-job" 52 | 53 | sagemaker_train_aws: 54 | python ./models/gradient_boost/envs/sagemaker/scripts/train.py \ 55 | --config-path $(SAGEMAKER_TRAIN_CONFIG_PATH) \ 56 | --param-path $(TRAIN_PARAM_PATH) \ 57 | --mode AWS \ 58 | --image-name $(SAGEMAKER_IMAGE_NAME) \ 59 | --job-id $(JOB_ID) 60 | 61 | sagemaker_deploy: 62 | python ./models/gradient_boost/envs/sagemaker/scripts/deploy.py \ 63 | --config-path $(SAGEMAKER_DEPLOY_CONFIG_PATH) \ 64 | --job-id $(JOB_ID) 65 | 66 | sagemaker_undeploy: 67 | python ./models/gradient_boost/envs/sagemaker/scripts/undeploy.py \ 68 | --config-path $(SAGEMAKER_DEPLOY_CONFIG_PATH) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A machine learning based approach towards building an Intrusion Detection System 2 | 3 | ## Problem Description 4 | With the rising number of network-enabled devices connected to the internet, such as mobile phones, IoT appliances or vehicles, concern about the security implications of using these devices is growing. The increase in the number and types of networked devices inevitably leads to a wider attack surface, while the impact of successful attacks becomes increasingly severe as more critical responsibilities are assumed by these devices. 5 | 6 | To identify and counter network attacks, it is common to employ a combination of multiple systems that prevent attacks from happening or detect and stop ongoing attacks if they cannot be prevented initially. 7 | These systems usually comprise an intrusion prevention system, such as a firewall, as the first layer of security, with intrusion detection systems representing the second layer. 8 | Should the intrusion prevention system be unable to prevent a network attack, it is the task of the detection system to identify the malicious network traffic, stop the ongoing attack and keep the recorded network traffic data for later analysis. This data can subsequently be used to update the prevention system so that the specific network attack can be detected in the future. The need for intrusion detection systems is rising, as absolute prevention of attacks is not possible due to the rapid emergence of new attack types. 9 | 10 | Even though intrusion detection systems are an essential part of network security, many detection systems deployed today have a significant weakness: they rely on signature-based attack classification patterns, which detect the most common known attack patterns but are unable to detect novel attack types. 11 | To overcome this limitation, research in intrusion detection systems is focusing on more dynamic approaches based on machine learning and anomaly detection methods. In these systems, normal network behaviour is learned by processing previously recorded benign data packets, which allows the system to identify new attack types by analyzing network traffic for anomalous data flows.
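As an illustration of this idea, the following minimal sketch flags flows whose autoencoder reconstruction error exceeds a threshold calibrated on benign traffic only (the model, feature matrix and percentile cut-off are hypothetical; the notebooks in this repository implement the same idea):

```python
import numpy as np

def anomaly_scores(autoencoder, X):
    """Per-flow reconstruction error: anomalous flows reconstruct poorly."""
    reconstruction = autoencoder.predict(X)
    return np.mean(np.power(X - reconstruction, 2), axis=1)

# Hypothetical usage: calibrate a threshold on benign flows, then score unseen traffic.
# threshold = np.percentile(anomaly_scores(autoencoder, X_benign), 99)
# is_attack = anomaly_scores(autoencoder, X_new) > threshold
```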
12 | 13 | This project aims to implement a classifier capable of identifying network traffic as either benign or malicious based on machine learning and deep learning methodologies. 14 | 15 | ## Data 16 | The data used to train the classifier is taken from the [CSE-CIC-IDS2018](https://www.unb.ca/cic/datasets/ids-2018.html) dataset provided by the Canadian Institute for Cybersecurity. It was created by capturing all network traffic during ten days of operation inside a controlled network environment on AWS where realistic background traffic and different attack scenarios were conducted. 17 | As a result, the dataset contains benign network traffic as well as captures of the most common network attacks. 18 | The dataset comprises the raw network captures in pcap format as well as csv files created using [CICFlowMeter-V3](https://www.unb.ca/cic/research/applications.html#CICFlowMeter), containing 80 statistical features of the individual network flows combined with their corresponding labels. 19 | A network flow is defined as an aggregation of interrelated network packets identified by the following properties: 20 | * Source IP 21 | * Destination IP 22 | * Source port 23 | * Destination port 24 | * Protocol 25 | 26 | The dataset contains approximately 16 million individual network flows and covers the following attack scenarios: 27 | * Brute Force 28 | * DoS 29 | * DDoS 30 | * Heartbleed 31 | * Web Attack 32 | * Infiltration 33 | * Botnet 34 | 35 | ## Approach 36 | The goal of this project is to create a classifier capable of categorising network flows as either benign or malicious. 37 | The problem is understood as a supervised learning problem using the labels provided in the dataset, which identify the network flows as either benign or malicious. Different approaches to classifying the data will be evaluated, formulating the problem either as a binary classification or as a multiclass classification problem; in the latter case, the individual attack classes provided in the dataset are differentiated. A relevant subset of the features provided in the dataset will be used as predictors to classify individual network flows. 38 | Machine learning methods such as k-nearest neighbours, random forests or SVMs will be applied to the problem and evaluated first in order to assess the feasibility of using traditional machine learning approaches. 39 | Subsequently, deep learning models such as convolutional neural networks, autoencoders or recurrent neural networks will be employed to create a competing classifier, as recent research has shown that deep learning methods are a promising approach in the field of anomaly detection. 40 | The results of both approaches will be compared to select the best-performing classifier. 41 | 42 | ## Deliverables 43 | The classifier will be deployed and served via a REST API in conjunction with a simple web application providing a user interface to utilize the API. 44 | 45 | The REST API will provide the following functionality: 46 | * an endpoint to submit network capture files in pcap format. Individual network flows are extracted from the capture files and analysed for malicious network traffic. 47 | * (optional) an endpoint to stream continuous network traffic captures which are analysed in near real-time, combined with 48 | * (optional) an endpoint to register a web-socket in order to get notified upon detection of malicious network traffic.
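To make the planned API concrete, a client interaction could look like the sketch below (the endpoint path and response fields are hypothetical, since the API is only outlined at this stage):

```python
import requests

# Submit a pcap capture file for analysis (hypothetical endpoint).
with open('capture.pcap', 'rb') as capture:
    response = requests.post('https://ml-ids.example.com/api/captures',
                             files={'file': capture})

# Each network flow extracted from the capture is classified individually.
for flow in response.json()['flows']:
    print(flow['flow_id'], flow['prediction'])
```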
49 | 50 | To further showcase the project, a testbed could be created against which various attack scenarios can be performed. This testbed would be connected to the streaming API for near real-time detection of malicious network traffic. 51 | 52 | ## Computational resources 53 | The requirements regarding the computational resources to train the classifiers are given below: 54 | 55 | | Category | Resource | 56 | | ------------- | ------------- | 57 | | CPU | Intel Core i7 processor | 58 | | RAM | 32 GB | 59 | | GPU | 1 GPU, 8 GB RAM | 60 | | HDD | 100 GB | 61 | 62 | 63 | ## Classifier 64 | 65 | The machine learning estimator created in this project follows a supervised approach and is trained using the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) algorithm. Employing the [CatBoost](https://catboost.ai/) library, a binary classifier is created, capable of classifying network flows as either benign or malicious. The chosen parameters of the classifier and its performance metrics can be examined in the following [notebook](https://github.com/cstub/ml-ids/blob/master/notebooks/07_binary_classifier_comparison/binary-classifier-comparison.ipynb). 66 | 67 | ## Deployment Architecture 68 | 69 | The deployment architecture of the complete ML-IDS system is explained in detail in the [system architecture](https://docs.google.com/document/d/1s_EBMTid4gdrsQU_xOCAYK1BzxkhhnYl6wHFSZo_9Tw/edit?usp=sharing). 70 | 71 | ## Model Training and Deployment 72 | 73 | The model can be trained and deployed either locally or via [Amazon SageMaker](https://aws.amazon.com/sagemaker/). 74 | In each case the [MLflow](https://www.mlflow.org/docs/latest/index.html) framework is utilized to train the model and create the model artifacts. 75 | 76 | ### Installation 77 | 78 | To install the necessary dependencies, check out the project and create a new Anaconda environment from the `environment.yaml` file. 79 | 80 | ``` 81 | conda env create -f environment.yaml 82 | ``` 83 | 84 | Afterwards, activate the environment and install the project resources. 85 | 86 | ``` 87 | conda activate ml-ids 88 | 89 | pip install -e . 90 | ``` 91 | 92 | ### Dataset Creation 93 | 94 | To create the dataset for training use the following command: 95 | 96 | ``` 97 | make split_dataset \ 98 | DATASET_PATH={path-to-source-dataset} 99 | ``` 100 | 101 | This command reads the source dataset and splits it into separate train/validation/test sets with a sample ratio of 80%/10%/10%. The specified source dataset should be a folder containing multiple `.csv` files. 102 | You can use the [CIC-IDS-2018 dataset](https://www.unb.ca/cic/datasets/ids-2018.html) provided via [Google Drive](https://drive.google.com/open?id=1HrTPh0YRSZ4T9DLa_c47lubheKUcPl0r) for this purpose. 103 | Once the command completes, a new folder `dataset` is created containing the split datasets in `.h5` format.
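Under the hood, this target runs `ml_ids/data/split_dataset.py`, which loads the csv files and performs a stratified split on the attack category. The equivalent Python steps are roughly the following sketch (the dataset path is a placeholder):

```python
from ml_ids.data.dataset import load_dataset
from ml_ids.model_selection import train_val_test_split

# Load all .csv files from the source folder into a single DataFrame.
dataset = load_dataset(dataset_path='path/to/source-dataset', transform_data=False)

# Stratify by attack category so train/val/test share the label distribution.
train, val, test = train_val_test_split(dataset,
                                        val_size=0.1,
                                        test_size=0.1,
                                        stratify_col='label_cat',
                                        random_state=42)
```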
104 | 105 | ### Local Mode 106 | 107 | To train the model in local mode, using the default parameters and dataset locations created by `split_dataset`, use the following command: 108 | 109 | ``` 110 | make train_local 111 | ``` 112 | 113 | If the datasets are stored in a different location or you want to specify different training parameters, you can optionally supply the dataset locations and a training parameter file: 114 | 115 | ``` 116 | make train_local \ 117 | TRAIN_PATH={path-to-train-dataset} \ 118 | VAL_PATH={path-to-val-dataset} \ 119 | TEST_PATH={path-to-test-dataset} \ 120 | TRAIN_PARAM_PATH={path-to-param-file} 121 | ``` 122 | 123 | Upon completion of the training process, the model artifacts can be found in the `build/models/gradient_boost` directory. 124 | 125 | To deploy the model locally the MLflow CLI can be used. 126 | 127 | ``` 128 | mlflow models serve -m build/models/gradient_boost -p 5000 129 | ``` 130 | 131 | The model can also be deployed as a Docker container using the following commands: 132 | 133 | ``` 134 | mlflow models build-docker -m build/models/gradient_boost -n ml-ids-classifier:1.0 135 | 136 | docker run -p 5001:8080 ml-ids-classifier:1.0 137 | ``` 138 | 139 | ### Amazon SageMaker 140 | 141 | To train the model on Amazon SageMaker the following command sequence is used: 142 | 143 | ``` 144 | # build a new docker container for model training 145 | make sagemaker_build_image \ 146 | TAG=1.0 147 | 148 | # upload the container to AWS ECR 149 | make sagemaker_push_image \ 150 | TAG=1.0 151 | 152 | # execute the training container on Amazon SageMaker 153 | make sagemaker_train_aws \ 154 | SAGEMAKER_IMAGE_NAME={ecr-image-name}:1.0 \ 155 | JOB_ID=ml-ids-job-0001 156 | ``` 157 | 158 | This command requires a valid AWS account with the appropriate permissions to be configured locally via the [AWS CLI](https://aws.amazon.com/cli/). Furthermore, [AWS ECR](https://aws.amazon.com/ecr/) and Amazon SageMaker must be configured for the account. 159 | 160 | Using this repository, the manual invocation of the aforementioned commands is not necessary as training on Amazon SageMaker is supported via a [GitHub workflow](https://github.com/cstub/ml-ids/blob/master/.github/workflows/train.yml) that is triggered upon creation of a new tag of the form `m*` (e.g. `m1.0`). 161 | 162 | To deploy a trained model on Amazon SageMaker a [GitHub Deployment request](https://developer.github.com/v3/repos/deployments/) using the GitHub API must be issued, specifying the tag of the model. 163 | 164 | ``` 165 | { 166 | "ref": "refs/tags/m1.0", 167 | "payload": {}, 168 | "description": "Deploy request for model version m1.0", 169 | "auto_merge": false 170 | } 171 | ``` 172 | 173 | This deployment request triggers a [GitHub workflow](https://github.com/cstub/ml-ids/blob/master/.github/workflows/deployment.yml), deploying the model to SageMaker. 174 | After successful deployment, the model is accessible via the SageMaker HTTP API. 175 | 176 | ## Using the Classifier 177 | 178 | The classifier deployed on Amazon SageMaker is not directly available publicly, but can be accessed using the [ML-IDS REST API](https://github.com/cstub/ml-ids-api).
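For AWS accounts with direct access to the SageMaker endpoint, a prediction can alternatively be requested via `boto3` (a sketch; the endpoint name is hypothetical and the payload follows the `pandas-split` format shown in the next section):

```python
import boto3

client = boto3.client('sagemaker-runtime', region_name='eu-west-1')

# 'ml-ids-endpoint' is a hypothetical endpoint name.
response = client.invoke_endpoint(
    EndpointName='ml-ids-endpoint',
    ContentType='application/json; format=pandas-split',
    Body='{"columns": ["dst_port", "..."], "data": [[80, "..."]]}')

print(response['Body'].read().decode('utf-8'))
```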
179 | 180 | ### REST API 181 | 182 | To invoke the REST API, the following command can be used to submit a prediction request for a given network flow: 183 | 184 | ``` 185 | curl -X POST \ 186 | http://ml-ids-cluster-lb-1096011980.eu-west-1.elb.amazonaws.com/api/predictions \ 187 | -H 'Accept: */*' \ 188 | -H 'Content-Type: application/json; format=pandas-split' \ 189 | -H 'Host: ml-ids-cluster-lb-1096011980.eu-west-1.elb.amazonaws.com' \ 190 | -H 'cache-control: no-cache' \ 191 | -d '{"columns":["dst_port","protocol","timestamp","flow_duration","tot_fwd_pkts","tot_bwd_pkts","totlen_fwd_pkts","totlen_bwd_pkts","fwd_pkt_len_max","fwd_pkt_len_min","fwd_pkt_len_mean","fwd_pkt_len_std","bwd_pkt_len_max","bwd_pkt_len_min","bwd_pkt_len_mean","bwd_pkt_len_std","flow_byts_s","flow_pkts_s","flow_iat_mean","flow_iat_std","flow_iat_max","flow_iat_min","fwd_iat_tot","fwd_iat_mean","fwd_iat_std","fwd_iat_max","fwd_iat_min","bwd_iat_tot","bwd_iat_mean","bwd_iat_std","bwd_iat_max","bwd_iat_min","fwd_psh_flags","bwd_psh_flags","fwd_urg_flags","bwd_urg_flags","fwd_header_len","bwd_header_len","fwd_pkts_s","bwd_pkts_s","pkt_len_min","pkt_len_max","pkt_len_mean","pkt_len_std","pkt_len_var","fin_flag_cnt","syn_flag_cnt","rst_flag_cnt","psh_flag_cnt","ack_flag_cnt","urg_flag_cnt","cwe_flag_count","ece_flag_cnt","down_up_ratio","pkt_size_avg","fwd_seg_size_avg","bwd_seg_size_avg","fwd_byts_b_avg","fwd_pkts_b_avg","fwd_blk_rate_avg","bwd_byts_b_avg","bwd_pkts_b_avg","bwd_blk_rate_avg","subflow_fwd_pkts","subflow_fwd_byts","subflow_bwd_pkts","subflow_bwd_byts","init_fwd_win_byts","init_bwd_win_byts","fwd_act_data_pkts","fwd_seg_size_min","active_mean","active_std","active_max","active_min","idle_mean","idle_std","idle_max","idle_min"],"data":[[80,17,"21\\/02\\/2018 10:15:06",119759145,75837,0,2426784,0,32,32,32.0,0.0,0,0,0.0,0.0,20263.87212,633.2460039,1579.1859130859,31767.046875,920247,1,120000000,1579.1859130859,31767.046875,920247,1,0,0.0,0.0,0,0,0,0,0,0,606696,0,633.2460327148,0.0,32,32,32.0,0.0,0.0,0,0,0,0,0,0,0,0,0,32.0004234314,32.0,0.0,0,0,0,0,0,0,75837,2426784,0,0,-1,-1,75836,8,0.0,0.0,0,0,0.0,0.0,0,0]]}' 192 | ``` 193 | 194 | ### ML-IDS API Clients 195 | 196 | For convenience, the Python clients implemented in the [ML-IDS API Clients project](https://github.com/cstub/ml-ids-api-client) can be used to submit new prediction requests to the API and receive real-time notifications on detection of malicious network flows. 197 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | ## Data 2 | 3 | The data used to train the classifiers is taken from the [CSE-CIC-IDS2018](https://www.unb.ca/cic/datasets/ids-2018.html) dataset provided by the Canadian Institute for Cybersecurity. 4 | It was created by capturing all network traffic during ten days of operation inside a controlled network environment on AWS where realistic background traffic and different attack scenarios were conducted. 5 | 6 | The dataset consists of raw network captures in pcap format as well as processed csv files created using [CICFlowMeter-V3](https://www.unb.ca/cic/research/applications.html#CICFlowMeter), containing 80 statistical features of the individual network flows combined with their corresponding labels. 7 | 8 | Due to size limitations, the data provided in this repository represents only a small portion of the dataset, in the form of processed network flows.
The full dataset consisting of the raw network captures and the processed csv files can be retrieved from AWS S3. 9 | 10 | ## Download 11 | 12 | A prerequisite to downloading the full dataset is the installation of the [AWS CLI](https://aws.amazon.com/cli/). 13 | 14 | To download the processed csv files containing the analyzed network flows (~7GB), run the following command: 15 | ```bash 16 | aws s3 sync --no-sign-request --region <your-region> "s3://cse-cic-ids2018/Processed Traffic Data for ML Algorithms/" <destination-dir> 17 | ``` 18 | To download the raw network captures in pcap format (~477GB) run: 19 | ```bash 20 | aws s3 sync --no-sign-request --region <your-region> "s3://cse-cic-ids2018/Original Network Traffic and Log data/" <destination-dir> 21 | ``` 22 | To download the full dataset containing the raw network captures and processed csv files (~484GB) use the following command: 23 | ```bash 24 | aws s3 sync --no-sign-request --region <your-region> "s3://cse-cic-ids2018/" <destination-dir> 25 | ``` 26 | 27 | ## Preprocessed Dataset 28 | 29 | The preprocessed dataset used for model training and evaluation can be found at [Google Drive](https://drive.google.com/drive/folders/1AWhRsVShJ_KvYKrV0VlnM1odtJ4Tp-uC?usp=sharing). 30 | -------------------------------------------------------------------------------- /data/Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:acff8bc61376ee031d80878ee6099e0b1a87a1bd711d8068298421418c9f8147 3 | size 358223333 4 | -------------------------------------------------------------------------------- /environment-notebook.yaml: -------------------------------------------------------------------------------- 1 | name: ml-ids-notebooks 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - catboost=0.18.1=py37_0 8 | - click=7.0=py37_0 9 | - cloudpickle=1.2.2=py_0 10 | - eli5=0.10.1=py37_1 11 | - findspark=1.3.0=py_1 12 | - imbalanced-learn=0.5.0=py_0 13 | - jupyter=1.0.0=py_2 14 | - matplotlib=3.1.1=py37_1 15 | - numpy=1.17.2=py37h95a1406_0 16 | - pandas=0.25.2=py37hb3f55d8_0 17 | - pip=19.2.3=py37_0 18 | - pyspark=2.4.4=py_0 19 | - pytest=5.2.1=py37_0 20 | - pytest-runner=5.1=py_0 21 | - python=3.7.3=h33d41f4_1 22 | - python-dateutil<2.8.1 23 | - requests<2.21.0 24 | - scikit-learn=0.21.3=py37hcdab131_0 25 | - scikit-plot=0.3.7=py_1 26 | - scipy=1.3.1=py37h921218d_2 27 | - seaborn=0.9.0=py_1 28 | - setuptools=41.6.0=py37_1 29 | - shap=0.31.0=py37hb3f55d8_0 30 | - pip: 31 | - h5py==2.10.0 32 | - hyperopt==0.2.2 33 | - keras==2.3.1 34 | - keras-applications==1.0.8 35 | - keras-preprocessing==1.1.0 36 | - tables==3.6.1 37 | - tensorboard==2.0.0 38 | - tensorflow-estimator==2.0.0 39 | - tensorflow-gpu==2.0.0 40 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: ml-ids 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - catboost=0.18.1=py37_0 8 | - click=7.0=py37_0 9 | - cloudpickle=1.2.2=py_0 10 | - imbalanced-learn=0.5.0=py_0 11 | - matplotlib=3.1.1=py37_1 12 | - mypy=0.750 13 | - numpy=1.17.2=py37h95a1406_0 14 | - pandas=0.25.2=py37hb3f55d8_0 15 | - pip=19.2.3=py37_0 16 | - pylint=2.4.4 17 | - pytest=5.2.1=py37_0 18 | - pytest-runner=5.1=py_0 19 | - python=3.7.3=h33d41f4_1 20 | - python-dateutil<2.8.1 21 | - requests<2.21.0 22 | - scikit-learn=0.21.3=py37hcdab131_0 23 | - scipy=1.3.1=py37h921218d_2 24 | -
seaborn=0.9.0=py_1 25 | - setuptools=41.6.0=py37_1 26 | - pip: 27 | - mlflow==1.4 28 | - sagemaker==1.44.3 29 | - h5py==2.10.0 30 | - hyperopt==0.2.2 31 | - keras==2.3.1 32 | - keras-applications==1.0.8 33 | - keras-preprocessing==1.1.0 34 | - tables==3.6.1 35 | - tensorflow-estimator==2.0.0 36 | - tensorflow-gpu==2.0.0 37 | -------------------------------------------------------------------------------- /ml_ids/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/ml_ids/__init__.py -------------------------------------------------------------------------------- /ml_ids/conf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Global configuration variables. 3 | """ 4 | import os 5 | 6 | ROOT_DIR = os.sep.join(os.path.dirname(os.path.abspath(__file__)).split(os.sep)[:-1]) 7 | 8 | TEST_DIR = os.path.join(ROOT_DIR, 'tests') 9 | 10 | TEST_DATA_DIR = os.path.join(TEST_DIR, 'validation_data') 11 | -------------------------------------------------------------------------------- /ml_ids/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/ml_ids/data/__init__.py -------------------------------------------------------------------------------- /ml_ids/data/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities to manipulate the CIC-IDS-2018 dataset. 3 | """ 4 | from typing import List 5 | import os 6 | import glob 7 | import numpy as np 8 | import pandas as pd 9 | import ml_ids.data.metadata as md 10 | 11 | 12 | def remove_inf_values(df: pd.DataFrame) -> pd.DataFrame: 13 | """ 14 | Replaces values of type `np.inf` and `-np.inf` in a DataFrame with `null` values. 15 | 16 | :param df: Input DataFrame. 17 | :return: The DataFrame without `np.inf` and `-np.inf` values. 18 | """ 19 | inf_columns = [c for c in df.columns if df[df[c] == np.inf][c].count() > 0] 20 | for col in inf_columns: 21 | df[col].replace([np.inf, -np.inf], np.nan, inplace=True) 22 | return df 23 | 24 | 25 | def remove_negative_values(df: pd.DataFrame, ignore_cols: List[str] = None) -> pd.DataFrame: 26 | """ 27 | Replaces negative values in a DataFrame with `null` values. 28 | 29 | :param df: Input DataFrame. 30 | :param ignore_cols: Columns to ignore. Negative values in these columns will be preserved. 31 | :return: The DataFrame without negative values. 32 | """ 33 | if ignore_cols is None: 34 | ignore_cols = [] 35 | 36 | numeric_cols = df.select_dtypes(include=[np.number]).columns.drop(ignore_cols).values 37 | 38 | columns = [c for c in numeric_cols if df[df[c] < 0][c].count() > 0] 39 | for col in columns: 40 | mask = df[col] < 0 41 | df.loc[mask, col] = np.nan 42 | return df 43 | 44 | 45 | def add_label_category_column(df: pd.DataFrame) -> pd.DataFrame: 46 | """ 47 | Adds the column `label_cat` to the DataFrame specifying the category of the label. 48 | 49 | :param df: Input DataFrame. 50 | :return: The DataFrame containing a new column `label_cat`.
51 | """ 52 | df[md.COLUMN_LABEL_CAT] = df.label.apply(lambda l: md.LABEL_CAT_MAPPING[l]) 53 | return df 54 | 55 | 56 | def add_label_is_attack_columns(df: pd.DataFrame) -> pd.DataFrame: 57 | """ 58 | Adds the column `label_is_attack` to the DataFrame containing a binary indicator specifying if a row is of category 59 | `benign = 0` or `attack = 1`. 60 | 61 | :param df: Input DataFrame. 62 | :return: The DataFrame containing a new column `label_is_attack`. 63 | """ 64 | df[md.COLUMN_LABEL_IS_ATTACK] = df.label.apply(lambda l: 0 if l == md.LABEL_BENIGN else 1) 65 | return df 66 | 67 | 68 | def load_dataset_generic(load_df_fn, 69 | dataset_path: str, 70 | use_cols: List[str] = None, 71 | omit_cols: List[str] = None, 72 | preserve_neg_value_cols: list = None, 73 | transform_data: bool = True) -> pd.DataFrame: 74 | """ 75 | Loads the dataset from the given path using the supplied function. 76 | All invalid values (`np.inf`, `-np.inf`, negative) are removed and replaced with `null` for easy imputation. 77 | Negative values of columns specified in `preserve_neg_value_cols` will be preserved. 78 | 79 | :param load_df_fn: Function used to load the dataset. 80 | :param dataset_path: Path of the base directory containing all files of the dataset. 81 | :param use_cols: Columns to load. 82 | :param omit_cols: Columns to omit. 83 | :param nrows: Number of rows to load per file. 84 | :param transform_data: Indicates if data should be manipulated (removal of invalid and negative values). 85 | :param preserve_neg_value_cols: Columns in which negative values are preserved. 86 | :return: The dataset as a DataFrame. 87 | """ 88 | cols = None 89 | if use_cols: 90 | cols = use_cols 91 | if omit_cols: 92 | cols = [c for c in md.COLUMN_DTYPES.keys() if c not in omit_cols] 93 | 94 | df = load_df_fn(dataset_path, cols) 95 | 96 | if transform_data: 97 | df = remove_inf_values(df) 98 | df = remove_negative_values(df, preserve_neg_value_cols) 99 | 100 | if md.COLUMN_LABEL in df.columns: 101 | df = add_label_category_column(df) 102 | df = add_label_is_attack_columns(df) 103 | 104 | return df 105 | 106 | 107 | def load_dataset(dataset_path: str, 108 | use_cols: List[str] = None, 109 | omit_cols: List[str] = None, 110 | nrows: int = None, 111 | transform_data: bool = True, 112 | preserve_neg_value_cols: list = None) -> pd.DataFrame: 113 | """ 114 | Loads the dataset in CSV format from the given path. 115 | All invalid values (`np.inf`, `-np.inf`, negative) are removed and replaced with `null` for easy imputation. 116 | Negative values of columns specified in `preserve_neg_value_cols` will be preserved. 117 | 118 | :param dataset_path: Path of the base directory containing all files of the dataset. 119 | :param use_cols: Columns to load. 120 | :param omit_cols: Columns to omit. 121 | :param nrows: Number of rows to load per file. 122 | :param transform_data: Indicates if data should be manipulated (removal of invalid and negative values). 123 | :param preserve_neg_value_cols: Columns in which negative values are preserved. 124 | :return: The dataset as a DataFrame. 
125 | """ 126 | 127 | def load_csv(path, cols): 128 | files = glob.glob(os.path.join(path, '*.csv')) 129 | return pd.concat([pd.read_csv(f, dtype=md.COLUMN_DTYPES, usecols=cols, nrows=nrows) for f in files]) 130 | 131 | return load_dataset_generic(load_df_fn=load_csv, 132 | dataset_path=dataset_path, 133 | use_cols=use_cols, 134 | omit_cols=omit_cols, 135 | preserve_neg_value_cols=preserve_neg_value_cols, 136 | transform_data=transform_data) 137 | 138 | 139 | def load_dataset_hdf(dataset_path: str, 140 | use_cols: List[str] = None, 141 | omit_cols: List[str] = None, 142 | preserve_neg_value_cols: list = None, 143 | transform_data: bool = True, 144 | key: str = None) -> pd.DataFrame: 145 | """ 146 | Loads the dataset stored as a HDF file from the given path. 147 | All invalid values (`np.inf`, `-np.inf`, negative) are removed and replaced with `null` for easy imputation. 148 | Negative values of columns specified in `preserve_neg_value_cols` will be preserved. 149 | 150 | :param dataset_path: Path of the base directory containing all files of the dataset. 151 | :param use_cols: Columns to load. 152 | :param omit_cols: Columns to omit. 153 | :param preserve_neg_value_cols: Columns in which negative values are preserved. 154 | :param transform_data: Indicates if data should be manipulated (removal of invalid and negative values). 155 | :param key: Group identifier in the HDF store. 156 | :return: The dataset as a DataFrame. 157 | """ 158 | 159 | def load_hdf(path, cols): 160 | return pd.read_hdf(path, key=key, columns=cols) 161 | 162 | return load_dataset_generic(load_df_fn=load_hdf, 163 | dataset_path=dataset_path, 164 | use_cols=use_cols, 165 | omit_cols=omit_cols, 166 | preserve_neg_value_cols=preserve_neg_value_cols, 167 | transform_data=transform_data) 168 | -------------------------------------------------------------------------------- /ml_ids/data/metadata.py: -------------------------------------------------------------------------------- 1 | """ 2 | Metadata of the CIC-IDS-2018 dataset. 
3 | """ 4 | COLUMN_DTYPES = { 5 | 'dst_port': 'uint32', 6 | 'protocol': 'uint8', 7 | 'timestamp': 'object', 8 | 'flow_duration': 'int64', 9 | 'tot_fwd_pkts': 'uint32', 10 | 'tot_bwd_pkts': 'uint32', 11 | 'totlen_fwd_pkts': 'uint32', 12 | 'totlen_bwd_pkts': 'uint32', 13 | 'fwd_pkt_len_max': 'uint16', 14 | 'fwd_pkt_len_min': 'uint16', 15 | 'fwd_pkt_len_mean': 'float32', 16 | 'fwd_pkt_len_std': 'float32', 17 | 'bwd_pkt_len_max': 'uint16', 18 | 'bwd_pkt_len_min': 'uint16', 19 | 'bwd_pkt_len_mean': 'float32', 20 | 'bwd_pkt_len_std': 'float32', 21 | 'flow_byts_s': 'float64', 22 | 'flow_pkts_s': 'float64', 23 | 'flow_iat_mean': 'float32', 24 | 'flow_iat_std': 'float32', 25 | 'flow_iat_max': 'int64', 26 | 'flow_iat_min': 'int64', 27 | 'fwd_iat_tot': 'int64', 28 | 'fwd_iat_mean': 'float32', 29 | 'fwd_iat_std': 'float32', 30 | 'fwd_iat_max': 'int64', 31 | 'fwd_iat_min': 'int64', 32 | 'bwd_iat_tot': 'uint32', 33 | 'bwd_iat_mean': 'float32', 34 | 'bwd_iat_std': 'float32', 35 | 'bwd_iat_max': 'uint32', 36 | 'bwd_iat_min': 'uint32', 37 | 'fwd_psh_flags': 'uint8', 38 | 'bwd_psh_flags': 'uint8', 39 | 'fwd_urg_flags': 'uint8', 40 | 'bwd_urg_flags': 'uint8', 41 | 'fwd_header_len': 'uint32', 42 | 'bwd_header_len': 'uint32', 43 | 'fwd_pkts_s': 'float32', 44 | 'bwd_pkts_s': 'float32', 45 | 'pkt_len_min': 'uint16', 46 | 'pkt_len_max': 'uint16', 47 | 'pkt_len_mean': 'float32', 48 | 'pkt_len_std': 'float32', 49 | 'pkt_len_var': 'float32', 50 | 'fin_flag_cnt': 'uint8', 51 | 'syn_flag_cnt': 'uint8', 52 | 'rst_flag_cnt': 'uint8', 53 | 'psh_flag_cnt': 'uint8', 54 | 'ack_flag_cnt': 'uint8', 55 | 'urg_flag_cnt': 'uint8', 56 | 'cwe_flag_count': 'uint8', 57 | 'ece_flag_cnt': 'uint8', 58 | 'down_up_ratio': 'uint16', 59 | 'pkt_size_avg': 'float32', 60 | 'fwd_seg_size_avg': 'float32', 61 | 'bwd_seg_size_avg': 'float32', 62 | 'fwd_byts_b_avg': 'uint8', 63 | 'fwd_pkts_b_avg': 'uint8', 64 | 'fwd_blk_rate_avg': 'uint8', 65 | 'bwd_byts_b_avg': 'uint8', 66 | 'bwd_pkts_b_avg': 'uint8', 67 | 'bwd_blk_rate_avg': 'uint8', 68 | 'subflow_fwd_pkts': 'uint32', 69 | 'subflow_fwd_byts': 'uint32', 70 | 'subflow_bwd_pkts': 'uint32', 71 | 'subflow_bwd_byts': 'uint32', 72 | 'init_fwd_win_byts': 'int32', 73 | 'init_bwd_win_byts': 'int32', 74 | 'fwd_act_data_pkts': 'uint32', 75 | 'fwd_seg_size_min': 'uint8', 76 | 'active_mean': 'float32', 77 | 'active_std': 'float32', 78 | 'active_max': 'uint32', 79 | 'active_min': 'uint32', 80 | 'idle_mean': 'float32', 81 | 'idle_std': 'float32', 82 | 'idle_max': 'uint64', 83 | 'idle_min': 'uint64', 84 | 'label': 'category' 85 | } 86 | 87 | LABEL_BENIGN = 'Benign' 88 | 89 | LABEL_CAT_MAPPING = { 90 | 'Benign': 0, 91 | 'Bot': 1, 92 | 'Brute Force -Web': 2, 93 | 'Brute Force -XSS': 3, 94 | 'DoS attacks-GoldenEye': 4, 95 | 'DoS attacks-Hulk': 5, 96 | 'DoS attacks-SlowHTTPTest': 6, 97 | 'DoS attacks-Slowloris': 7, 98 | 'DDOS attack-HOIC': 8, 99 | 'DDOS attack-LOIC-UDP': 9, 100 | 'DDoS attacks-LOIC-HTTP': 10, 101 | 'FTP-BruteForce': 11, 102 | 'Infilteration': 12, 103 | 'SQL Injection': 13, 104 | 'SSH-Bruteforce': 14, 105 | 'DDOS LOIT': 15, 106 | 'Heartbleed': 16, 107 | 'PortScan': 17 108 | } 109 | 110 | FEATURES_NO_VARIANCE = [ 111 | "bwd_blk_rate_avg", 112 | "bwd_byts_b_avg", 113 | "bwd_pkts_b_avg", 114 | "bwd_psh_flags", 115 | "bwd_urg_flags", 116 | "fwd_blk_rate_avg", 117 | "fwd_byts_b_avg", 118 | "fwd_pkts_b_avg" 119 | ] 120 | 121 | FEATURES_TO_IGNORE = [ 122 | 'timestamp', 123 | 'dst_port', 124 | 'protocol' 125 | ] 126 | 127 | FEATURES_PRESERVE_NEG_COLUMNS = [ 128 | 'init_fwd_win_byts', 129 | 
'init_bwd_win_byts' 130 | ] 131 | 132 | COLUMN_LABEL = 'label' 133 | COLUMN_LABEL_CAT = 'label_cat' 134 | COLUMN_LABEL_IS_ATTACK = 'label_is_attack' 135 | -------------------------------------------------------------------------------- /ml_ids/data/split_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | CLI to split a single dataset into train/val/test sub-datasets. 3 | """ 4 | import os 5 | import sys 6 | import logging 7 | import click 8 | import pandas as pd 9 | import ml_ids.data.metadata as md 10 | from ml_ids.data.dataset import load_dataset 11 | from ml_ids.model_selection import train_val_test_split 12 | 13 | logging.basicConfig( 14 | format='[%(asctime)s|%(module)s.py|%(levelname)s] %(message)s', 15 | datefmt='%H:%M:%S', 16 | level=logging.INFO, 17 | stream=sys.stdout 18 | ) 19 | 20 | 21 | @click.command() 22 | @click.option('--dataset-path', type=click.Path(exists=True), required=True, 23 | help='Path to the input dataset in .csv format. Can be a folder containing multiple files.') 24 | @click.option('--output-path', type=click.Path(exists=True), required=True, 25 | help='Path to store the output datasets.') 26 | @click.option('--val-size', type=click.FloatRange(0, 1), default=0.1, 27 | help='Fraction of the data used for the validation set.') 28 | @click.option('--test-size', type=click.FloatRange(0, 1), default=0.1, 29 | help='Fraction of the data used for the test set.') 30 | @click.option('--nrows', type=int, 31 | help='Number of rows to load per input file.') 32 | @click.option('--random-seed', type=int, 33 | help='Random seed.') 34 | def split_dataset(dataset_path, output_path, val_size, test_size, nrows, random_seed): 35 | """ 36 | Runs the CLI. 37 | """ 38 | logging.info('Loading dataset from "%s"...', dataset_path) 39 | 40 | dataset = load_dataset(dataset_path=dataset_path, transform_data=False, nrows=nrows) 41 | 42 | train, val, test = train_val_test_split(dataset, 43 | val_size=val_size, 44 | test_size=test_size, 45 | stratify_col=md.COLUMN_LABEL_CAT, 46 | random_state=random_seed) 47 | 48 | train = remove_extra_labels(train) 49 | val = remove_extra_labels(val) 50 | test = remove_extra_labels(test) 51 | 52 | save_dataset(train, output_path, 'train') 53 | save_dataset(val, output_path, 'val') 54 | save_dataset(test, output_path, 'test') 55 | logging.info('Processing complete.') 56 | 57 | 58 | def remove_extra_labels(dataset: pd.DataFrame): 59 | """ 60 | Removes unused target labels. 61 | :param dataset: Input dataset as Pandas DataFrame. 62 | :return: Dataset without unused target labels. 63 | """ 64 | return dataset.drop(columns=[md.COLUMN_LABEL_CAT, md.COLUMN_LABEL_IS_ATTACK]) 65 | 66 | 67 | def save_dataset(dataset: pd.DataFrame, path: str, ds_type: str): 68 | """ 69 | Stores the given dataset in HDF format at the specified path. 70 | 71 | :param dataset: Dataset as Pandas DataFrame. 72 | :param path: Target path to store the dataset. 73 | :param ds_type: Dataset type.
74 | :return: None 75 | """ 76 | file_path = os.path.join(path, '{}.h5'.format(ds_type)) 77 | 78 | logging.info('Storing dataset "%s" of size %d to "%s"', ds_type, len(dataset), file_path) 79 | 80 | dataset.to_hdf(file_path, 'ids_data', format='t', complevel=5, complib='zlib') 81 | 82 | 83 | if __name__ == '__main__': 84 | # pylint: disable=no-value-for-parameter 85 | split_dataset() 86 | -------------------------------------------------------------------------------- /ml_ids/keras/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/ml_ids/keras/__init__.py -------------------------------------------------------------------------------- /ml_ids/keras/callbacks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Custom callbacks for Keras models. 3 | """ 4 | # pylint: disable=import-error 5 | from tensorflow import keras 6 | from tensorflow.keras import callbacks 7 | 8 | K = keras.backend 9 | 10 | 11 | class OneCycleScheduler(callbacks.Callback): 12 | """ 13 | Keras callback implementing a one-cycle learning-rate scheduler. 14 | Provided by https://github.com/ageron/handson-ml2/blob/master/11_training_deep_neural_networks.ipynb. 15 | """ 16 | def __init__(self, iterations, max_rate, start_rate=None, 17 | last_iterations=None, last_rate=None): 18 | self.iterations = iterations 19 | self.max_rate = max_rate 20 | self.start_rate = start_rate or max_rate / 10 21 | self.last_iterations = last_iterations or iterations // 10 + 1 22 | self.half_iteration = (iterations - self.last_iterations) // 2 23 | self.last_rate = last_rate or self.start_rate / 1000 24 | self.iteration = 0 25 | 26 | def _interpolate(self, iter1, iter2, rate1, rate2): 27 | return ((rate2 - rate1) * (iter2 - self.iteration) 28 | / (iter2 - iter1) + rate1) 29 | 30 | def on_batch_begin(self, batch, logs): 31 | if self.iteration < self.half_iteration: 32 | rate = self._interpolate(0, self.half_iteration, self.start_rate, self.max_rate) 33 | elif self.iteration < 2 * self.half_iteration: 34 | rate = self._interpolate(self.half_iteration, 2 * self.half_iteration, 35 | self.max_rate, self.start_rate) 36 | else: 37 | rate = self._interpolate(2 * self.half_iteration, self.iterations, 38 | self.start_rate, self.last_rate) 39 | rate = max(rate, self.last_rate) 40 | self.iteration += 1 41 | K.set_value(self.model.optimizer.lr, rate) 42 | -------------------------------------------------------------------------------- /ml_ids/keras/evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions to evaluate Keras models. 3 | """ 4 | PREDICT_BATCH_SIZE = 16384 5 | 6 | 7 | def evaluate_model(model, X_train, y_train, X_val, y_val, metric_title): 8 | """ 9 | Prints the performance metrics of a Keras model by invoking the `evaluate` function of the model on the training 10 | and validation dataset. 11 | 12 | :param model: Keras model. 13 | :param X_train: Predictor variables of the training dataset. 14 | :param y_train: Target labels of the training dataset. 15 | :param X_val: Predictor variables of the validation dataset. 16 | :param y_val: Target labels of the validation dataset. 17 | :param metric_title: Title of the metrics. 
18 |     :return: None
19 |     """
20 |     print('Evaluation:')
21 |     print('===========')
22 |     print('       {}'.format(metric_title))
23 |     print('Train: {}'.format(model.evaluate(X_train, y_train, batch_size=PREDICT_BATCH_SIZE, verbose=0)))
24 |     print('Val:   {}'.format(model.evaluate(X_val, y_val, batch_size=PREDICT_BATCH_SIZE, verbose=0)))
25 | 
--------------------------------------------------------------------------------
/ml_ids/keras/metrics.py:
--------------------------------------------------------------------------------
1 | """
2 | Utilities to create custom metrics for Keras models.
3 | """
4 | # pylint: disable=import-error
5 | import gc
6 | import numpy as np
7 | from tensorflow import keras
8 | from tensorflow.keras import callbacks
9 | from sklearn.metrics import average_precision_score
10 | 
11 | K = keras.backend
12 | 
13 | 
14 | class AveragePrecisionScoreMetric(callbacks.Callback):
15 |     """
16 |     Keras callback calculating the average precision score for a given validation dataset using the
17 |     `average_precision_score` metric from Scikit-learn.
18 |     """
19 |     def __init__(self, X_val, y_val, batch_size=4096):
20 |         super(AveragePrecisionScoreMetric, self).__init__()
21 |         self.X_val = X_val
22 |         self.y_val = y_val
23 |         self.batch_size = batch_size
24 | 
25 |     def get_precision_score(self):
26 |         """
27 |         Calculates the average precision score using scikit-learn.
28 |         """
29 |         preds = self.model.predict(self.X_val, batch_size=self.batch_size)
30 |         # reduces memory consumption caused by a memory leak in `model.predict()` of Tensorflow 2
31 |         # https://github.com/tensorflow/tensorflow/issues/33009
32 |         gc.collect()
33 |         mse = np.mean(np.power(self.X_val - preds, 2), axis=1)
34 |         return average_precision_score(self.y_val, mse)
35 | 
36 |     def on_epoch_end(self, epoch, logs):
37 |         """
38 |         Invoked after each training epoch.
39 |         """
40 |         auprc = self.get_precision_score()
41 |         logs['val_auprc'] = auprc
42 |         print(' - val_auprc: {0:.4f}'.format(auprc))
--------------------------------------------------------------------------------
/ml_ids/keras/model_selection.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility functions for model selection of Keras models.
3 | """
4 | import gc
5 | from typing import Tuple
6 | import numpy as np
7 | from sklearn.model_selection import StratifiedKFold
8 | from tensorflow import keras
9 | 
10 | 
11 | def cross_val_train(fit_fn,
12 |                     X: np.ndarray,
13 |                     y: np.ndarray,
14 |                     target_transform_fn=lambda x: x,
15 |                     target_stratify_fn=lambda x: x,
16 |                     n_splits: int = 3,
17 |                     fit_args: dict = None,
18 |                     random_state: int = None) -> Tuple[np.ndarray, np.ndarray, list]:
19 |     """
20 |     Performs stratified cross-validation for a Keras model using the provided fit function.
21 | 
22 |     :param fit_fn: The function used to fit a model with a given split of the train and test set. Must return a fitted
23 |         Keras model with its history.
24 |     :param X: Predictor variables.
25 |     :param y: Labels.
26 |     :param target_transform_fn: Function to transform the target labels (e.g. one-hot encoding).
27 |     :param target_stratify_fn: Function to extract the target label to stratify by.
28 |     :param n_splits: Number of cross-validation splits.
29 |     :param fit_args: Arguments to pass to the fit function.
30 |     :param random_state: Random state.
31 |     :return: A triple containing the cross-validation predictions, the true values and a list of history-objects.
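
    Example (illustrative sketch; `build_model` is a hypothetical factory
    returning a compiled Keras model and is not part of this module):

        def fit_fn(X_train, y_train, X_val, y_val, args, is_first_fold):
            model = build_model()
            hist = model.fit(X_train, y_train, validation_data=(X_val, y_val), **args)
            return model, hist

        predictions, y_true, histories = cross_val_train(fit_fn, X, y, n_splits=5,
                                                         fit_args={'epochs': 3})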
32 | """ 33 | if fit_args is None: 34 | fit_args = {} 35 | 36 | kfold = StratifiedKFold(n_splits=n_splits, random_state=random_state) 37 | 38 | cv_predictions = None 39 | cv_y_true = None 40 | hists = [] 41 | fold = 1 42 | 43 | for train_index, val_index in kfold.split(X, target_stratify_fn(y)): 44 | print('\nFold {}/{}:'.format(fold, n_splits)) 45 | print('==========') 46 | 47 | X_train, X_val = X[train_index], X[val_index] 48 | y_train, y_val = y[train_index], y[val_index] 49 | 50 | y_train_ = target_transform_fn(y_train) 51 | y_val_ = target_transform_fn(y_val) 52 | 53 | keras.backend.clear_session() 54 | gc.collect() 55 | 56 | model, hist = fit_fn(X_train, y_train_, X_val, y_val_, fit_args, (fold == 1)) 57 | 58 | if isinstance(hist, list): 59 | hists.extend(hist) 60 | else: 61 | hists.append(hist) 62 | 63 | if cv_predictions is not None: 64 | cv_predictions = np.append(cv_predictions, model.predict(X_val), axis=0) 65 | else: 66 | cv_predictions = model.predict(X_val) 67 | 68 | if cv_y_true is not None: 69 | cv_y_true = np.append(cv_y_true, y_val, axis=0) 70 | else: 71 | cv_y_true = y_val 72 | 73 | fold = fold + 1 74 | 75 | return cv_predictions, cv_y_true, hists 76 | -------------------------------------------------------------------------------- /ml_ids/keras/prediction.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions to create predictions using Keras models. 3 | """ 4 | PREDICT_BATCH_SIZE = 16384 5 | 6 | 7 | def predict(model, X, decision_boundary=0.5): 8 | """ 9 | Performs predictions for a binary classification task given a Keras model and a decision boundary. 10 | If the probability of a sample belonging to the positive class exceeds the decision boundary the positive label 11 | is assigned to the sample, otherwise the negative label is used. 12 | 13 | :param model: Keras model. 14 | :param X: Dataset containing samples. 15 | :param decision_boundary: Decision boundary used to assign predictions to the positive class. 16 | :return: numpy array containing the binary predictions as one of the values {0, 1}. 17 | """ 18 | pred = model.predict(X, batch_size=PREDICT_BATCH_SIZE) 19 | return (pred >= decision_boundary).astype('int').reshape(-1) 20 | 21 | 22 | def predict_proba(model, X): 23 | """ 24 | Performs predictions for a binary classification task given a Keras model. 25 | This function returns the class probability of the positive class. 26 | 27 | :param model: Keras model. 28 | :param X: Dataset containing samples. 29 | :return: numpy array containing the class probabilities of the positive class. 30 | """ 31 | return model.predict(X, batch_size=PREDICT_BATCH_SIZE).reshape(-1) 32 | -------------------------------------------------------------------------------- /ml_ids/libs/dfencoder/dataframe.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, Michael Klear. 2 | # All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # 11 | # * Redistributions in binary form must reproduce the above 12 | # copyright notice, this list of conditions and the following 13 | # disclaimer in the documentation and/or other materials provided 14 | # with the distribution. 
15 | #
16 | # * Neither the name of the dfencoder Developers nor the names of any
17 | #   contributors may be used to endorse or promote products derived
18 | #   from this software without specific prior written permission.
19 | #
20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | 
32 | import pandas as pd
33 | import numpy as np
34 | 
35 | 
36 | class EncoderDataFrame(pd.DataFrame):
37 |     def __init__(self, *args, **kwargs):
38 |         super(EncoderDataFrame, self).__init__(*args, **kwargs)
39 | 
40 |     def swap(self, likelihood=.15):
41 |         """
42 |         Performs random swapping of data.
43 |         Each value has a likelihood of *argument likelihood*
44 |         of being randomly replaced with a value from a different
45 |         row.
46 |         Returns a copy of the dataframe with equal size.
47 |         """
48 | 
49 |         # select values to swap
50 |         tot_rows = self.__len__()
51 |         n_rows = int(round(tot_rows * likelihood))
52 |         n_cols = len(self.columns)
53 | 
54 |         def gen_indices():
55 |             column = np.repeat(np.arange(n_cols).reshape(1, -1), repeats=n_rows, axis=0)
56 |             row = np.random.randint(0, tot_rows, size=(n_rows, n_cols))
57 |             return row, column
58 | 
59 |         row, column = gen_indices()
60 |         new_mat = self.values
61 |         to_place = new_mat[row, column]
62 | 
63 |         row, column = gen_indices()
64 |         new_mat[row, column] = to_place
65 | 
66 |         dtypes = {col: typ for col, typ in zip(self.columns, self.dtypes)}
67 |         result = EncoderDataFrame(columns=self.columns, data=new_mat)
68 |         result = result.astype(dtypes, copy=False)
69 | 
70 |         return result
--------------------------------------------------------------------------------
/ml_ids/model_selection.py:
--------------------------------------------------------------------------------
1 | """
2 | Utilities for machine learning model selection.
3 | """
4 | from typing import Tuple, List
5 | import numpy as np
6 | import pandas as pd
7 | from sklearn.model_selection import train_test_split
8 | from sklearn.metrics import precision_recall_curve
9 | 
10 | 
11 | def train_val_test_split(df: pd.DataFrame,
12 |                          val_size: float = 0.1,
13 |                          test_size: float = 0.1,
14 |                          stratify_col: str = None,
15 |                          random_state: int = None) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
16 |     """
17 |     Splits the given DataFrame into three parts used for:
18 |     - training
19 |     - validation
20 |     - test
21 | 
22 |     :param df: Input DataFrame.
23 |     :param val_size: Size of validation set.
24 |     :param test_size: Size of test set.
25 |     :param stratify_col: Column to stratify.
26 |     :param random_state: Random state.
27 |     :return: A triple containing (`train`, `val`, `test`) sets.
28 |     """
29 |     assert (val_size + test_size) < 1, 'Sum of validation and test size must be < 1.'
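
    # Two-stage split: first a holdout of relative size (val_size + test_size)
    # is separated from the training data, then the holdout is split again so
    # that test_size / (val_size + test_size) of it becomes the test set.
    # E.g. val_size=0.1 and test_size=0.1 yield a 0.2 holdout, half of which
    # becomes the test set.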
30 | 
31 |     df_stratify = df[stratify_col] if stratify_col else None
32 |     df_train, df_hold = train_test_split(df,
33 |                                          test_size=(val_size + test_size),
34 |                                          stratify=df_stratify,
35 |                                          random_state=random_state)
36 | 
37 |     df_hold_stratify = df_hold[stratify_col] if stratify_col else None
38 |     df_val, df_test = train_test_split(df_hold,
39 |                                        test_size=test_size / (val_size + test_size),
40 |                                        stratify=df_hold_stratify,
41 |                                        random_state=random_state)
42 | 
43 |     return df_train, df_val, df_test
44 | 
45 | 
46 | def split_x_y(df: pd.DataFrame, y_cols: List[str] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
47 |     """
48 |     Splits the given DataFrame into a DataFrame `X` containing the predictor variables and a DataFrame `y`
49 |     containing the labels.
50 | 
51 |     :param df: Input DataFrame.
52 |     :param y_cols: Columns to use in the labels DataFrame `y`.
53 |     :return: A tuple containing the DataFrames (`X`, `y`).
54 |     """
55 |     if y_cols is None:
56 |         y_cols = ['label', 'label_cat', 'label_is_attack']
57 |     return df.drop(columns=y_cols), df[y_cols]
58 | 
59 | 
60 | def best_precision_for_target_recall(y_true, y_pred_score, target_recall):
61 |     """
62 |     Determines the decision boundary for the best precision given a specified target recall by using
63 |     the precision-recall curve.
64 | 
65 |     :param y_true: True labels.
66 |     :param y_pred_score: Predicted scores of the positive class (e.g. probabilities).
67 |     :param target_recall: Target recall.
68 |     :return: Decision boundary.
69 |     """
70 |     _, recalls, thresholds = precision_recall_curve(y_true, y_pred_score)
71 |     return thresholds[np.argmin(recalls >= target_recall)]
--------------------------------------------------------------------------------
/ml_ids/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/ml_ids/models/__init__.py
--------------------------------------------------------------------------------
/ml_ids/models/gradient_boost/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/ml_ids/models/gradient_boost/__init__.py
--------------------------------------------------------------------------------
/ml_ids/models/gradient_boost/mlflow_wrapper.py:
--------------------------------------------------------------------------------
1 | """
2 | Wrapper to enable usage of a CatBoost estimator with MLflow.
3 | """
4 | import pickle
5 | import mlflow.pyfunc
6 | from catboost import CatBoostClassifier
7 | from ml_ids.data.dataset import remove_negative_values, remove_inf_values
8 | 
9 | 
10 | class CatBoostWrapper(mlflow.pyfunc.PythonModel):
11 |     """
12 |     MLflow wrapper for CatBoost estimators.
13 |     """
14 | 
15 |     def load_context(self, context):
16 |         # pylint: disable=attribute-defined-outside-init
17 |         with open(context.artifacts['pipeline'], 'rb') as f:
18 |             self.pipeline = pickle.load(f)
19 | 
20 |         with open(context.artifacts['col_config'], 'rb') as f:
21 |             column_config = pickle.load(f)
22 | 
23 |         self.clf = CatBoostClassifier()
24 |         self.clf.load_model(context.artifacts['cbm_model'])
25 |         self.col_names = column_config['col_names']
26 |         self.preserve_cols = column_config['preserve_neg_vals']
27 | 
28 |     def preprocess(self, data):
29 |         """
30 |         Applies the pre-processing pipeline to the features given in the input dataset.
31 | 
32 |         :param data: Input dataset.
33 |         :return: Transformed dataset.
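
        Example (illustrative sketch; the run URI is a placeholder and not
        taken from this project's configuration):

            model = mlflow.pyfunc.load_model('runs:/<run_id>/model')
            predictions = model.predict(input_df)  # invokes preprocess() internally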
34 | """ 35 | data = data[self.col_names] 36 | data = remove_inf_values(data) 37 | data = remove_negative_values(data, ignore_cols=self.preserve_cols) 38 | return self.pipeline.transform(data) 39 | 40 | def predict(self, context, model_input): 41 | X = self.preprocess(model_input) 42 | return self.clf.predict(X) 43 | -------------------------------------------------------------------------------- /ml_ids/models/gradient_boost/train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities to train a machine learning estimator based on the Gradient Boosting algorithm using the CatBoost library. 3 | """ 4 | import logging 5 | from collections import namedtuple 6 | import pandas as pd 7 | from catboost import CatBoostClassifier, Pool 8 | from sklearn.preprocessing import FunctionTransformer 9 | 10 | from ml_ids.transform.preprocessing import create_pipeline 11 | from ml_ids.transform.sampling import upsample_minority_classes 12 | from ml_ids.model_selection import split_x_y 13 | 14 | LOGGER = logging.getLogger(__name__) 15 | 16 | GradientBoostHyperParams = namedtuple('GradientBoostHyperParams', 17 | ['nr_iterations', 'tree_depth', 'l2_reg', 'border_count', 'random_strength', 18 | 'task_type']) 19 | 20 | 21 | def fit_pipeline(train_dataset): 22 | """ 23 | Creates and fits the scikit-learn pre-processing pipeline. 24 | 25 | :param train_dataset: Training dataset. 26 | :return: Tuple of (fitted scikit-learn pipeline, column names). 27 | """ 28 | cols_to_impute = train_dataset.columns[train_dataset.isna().any()].tolist() 29 | 30 | X_train, _ = split_x_y(train_dataset) 31 | 32 | pipeline, get_col_names = create_pipeline(X_train, 33 | imputer_strategy='median', 34 | imputer_cols=cols_to_impute, 35 | scaler=FunctionTransformer, 36 | scaler_args={'validate': False}) 37 | pipeline.fit(X_train) 38 | return pipeline, get_col_names() 39 | 40 | 41 | def preprocess_val_dataset(pipeline, val_dataset): 42 | """ 43 | Pre-processes the validation dataset. 44 | 45 | :param pipeline: Scikit-learn pipeline. 46 | :param val_dataset: Validation dataset. 47 | :return: Tuple of (transformed features, labels) 48 | """ 49 | X_val, y_val = split_x_y(val_dataset) 50 | X_val = pipeline.transform(X_val) 51 | 52 | return X_val, y_val.label_is_attack 53 | 54 | 55 | def preprocess_train_dataset(pipeline, train_dataset, nr_attack_samples, random_state): 56 | """ 57 | Pre-processes the training dataset. 58 | 59 | :param pipeline: Scikit-learn pipeline. 60 | :param train_dataset: Training dataset. 61 | :param nr_attack_samples: Minimum number of attack samples per category. If the actual number of samples in the 62 | dataset is lower than this number the SMOTE algorithm will be used to upsample this category to have the requested 63 | number of samples. 64 | :return: Tuple of (transformed features, labels) 65 | """ 66 | X_train, y_train = split_x_y(train_dataset) 67 | X_train = pipeline.transform(X_train) 68 | 69 | X_train, y_train = upsample_minority_classes(X_train, y_train, 70 | min_samples=nr_attack_samples, 71 | random_state=random_state) 72 | 73 | return X_train, (y_train != 0).astype('int') 74 | 75 | 76 | def calculate_class_weights(y_train): 77 | """ 78 | Calculates the class weights of the unique classes in the training labels. 79 | 80 | :param y_train: Training labels. 81 | :return: Array of class weights. 
82 | """ 83 | minority_class_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1]) 84 | return [1, minority_class_weight] 85 | 86 | 87 | def train_gb_classifier(train_pool, 88 | val_pool, 89 | class_weights, 90 | nr_iterations, 91 | tree_depth, 92 | l2_reg, 93 | border_count, 94 | random_strength, 95 | task_type, 96 | random_state=None): 97 | """ 98 | Trains an estimator based on the Gradient Boosting algorithm using the CatBoost library. 99 | 100 | :param train_pool: Training dataset. 101 | :param val_pool: Validation dataset. 102 | :param class_weights: Class weights of the target labels. 103 | :param nr_iterations: The maximum number of trees that can be built when solving machine learning problems. 104 | :param tree_depth: Depth of a single tree. 105 | :param l2_reg: Coefficient at the L2 regularization term of the cost function. 106 | :param border_count: The number of splits for numerical features. 107 | :param random_strength: The amount of randomness to use for scoring splits when the tree structure is selected. 108 | :param task_type: The processing unit type to use for training (CPU | GPU). 109 | :param random_state: State to initialize the random number generator. 110 | :return: Trained CatBoost classifier. 111 | """ 112 | clf = CatBoostClassifier(loss_function='Logloss', 113 | iterations=nr_iterations, 114 | depth=tree_depth, 115 | l2_leaf_reg=l2_reg, 116 | border_count=border_count, 117 | random_strength=random_strength, 118 | task_type=task_type, 119 | class_weights=class_weights, 120 | verbose=1, 121 | random_seed=random_state) 122 | 123 | clf.fit(train_pool, eval_set=val_pool) 124 | return clf 125 | 126 | 127 | def train_model(train_dataset: pd.DataFrame, 128 | val_dataset: pd.DataFrame, 129 | hyper_params: GradientBoostHyperParams, 130 | nr_attack_samples: int, 131 | random_seed: int = None): 132 | """ 133 | Trains an estimator based on the Gradient Boosting algorithm using the CatBoost library. 134 | 135 | :param train_dataset: Training dataset. 136 | :param val_dataset: Validation dataset. 137 | :param hyper_params: Hyper-parameters applied to the Gradient Boosting algorithm. 138 | :param nr_attack_samples: Minimum number of attack samples per category. If the actual number of samples in the 139 | dataset is lower than this number the SMOTE algorithm will be used to upsample this category to have the requested 140 | number of samples. 141 | :param random_seed: Seed to initialize the random number generator. 
142 | :return: Tuple of (CatBoost classifier, pre-processing pipeline, column names) 143 | """ 144 | LOGGER.info('Training model with parameters [samples-per-attack-category=%s, hyperparams=%s]', 145 | nr_attack_samples, 146 | hyper_params) 147 | 148 | pipeline, col_names = fit_pipeline(train_dataset) 149 | 150 | X_train, y_train = preprocess_train_dataset(pipeline, train_dataset, nr_attack_samples, random_seed) 151 | train_pool = Pool(X_train, y_train) 152 | 153 | if val_dataset is not None: 154 | X_val, y_val = preprocess_val_dataset(pipeline, val_dataset) 155 | val_pool = Pool(X_val, y_val) 156 | else: 157 | val_pool = None 158 | 159 | clf = train_gb_classifier(train_pool=train_pool, 160 | val_pool=val_pool, 161 | class_weights=calculate_class_weights(y_train), 162 | nr_iterations=hyper_params.nr_iterations, 163 | tree_depth=hyper_params.tree_depth, 164 | l2_reg=hyper_params.l2_reg, 165 | border_count=hyper_params.border_count, 166 | random_strength=hyper_params.random_strength, 167 | task_type=hyper_params.task_type, 168 | random_state=random_seed) 169 | 170 | return clf, pipeline, col_names 171 | -------------------------------------------------------------------------------- /ml_ids/prediction.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities to create predictions given a Scikit-learn estimator and a dataset containing input features. 3 | """ 4 | 5 | 6 | def predict_proba_positive(clf, X): 7 | """ 8 | Performs predictions for a binary classification task given a scikit-learn model. 9 | This function returns the class probability of the positive class. 10 | 11 | :param clf: Scikit-learn estimator. 12 | :param X: Dataset containing the samples. 13 | :return: numpy array containing the class probabilities of the positive class. 14 | """ 15 | return clf.predict_proba(X)[:, 1].reshape(-1) 16 | 17 | 18 | def predict_decision_boundary(clf, X, decision_boundary=0.5): 19 | """ 20 | Performs predictions for a binary classification task given a scikit-learn model and a decision boundary. 21 | If the probability of a sample belonging to the positive class exceeds the decision boundary the positive label 22 | is assigned to the sample, otherwise the negative label is used. 23 | 24 | :param clf: Scikit-learn estimator. 25 | :param X: Dataset containing samples. 26 | :param decision_boundary: Decision boundary used to assign predictions to the positive class. 27 | :return: numpy array containing the binary predictions as one of the values {0, 1}. 28 | """ 29 | pred = predict_proba_positive(clf, X) 30 | return (pred >= decision_boundary).astype('int') 31 | -------------------------------------------------------------------------------- /ml_ids/tf_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for TensorFlow. 
3 | """ 4 | import tensorflow as tf 5 | 6 | 7 | def enable_gpu_memory_growth(): 8 | """ 9 | Enables the experimental setting `allow_memory_growth` for GPU devices 10 | 11 | :return: None 12 | """ 13 | physical_devices = tf.config.experimental.list_physical_devices('GPU') 14 | tf.config.experimental.set_memory_growth(physical_devices[0], True) 15 | -------------------------------------------------------------------------------- /ml_ids/transform/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/ml_ids/transform/__init__.py -------------------------------------------------------------------------------- /ml_ids/transform/preprocessing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for data pre-processing. 3 | """ 4 | from typing import List 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.pipeline import Pipeline 8 | from sklearn.compose import ColumnTransformer 9 | from sklearn.impute import SimpleImputer 10 | from sklearn.preprocessing import StandardScaler, OneHotEncoder 11 | from sklearn.exceptions import NotFittedError 12 | from sklearn.base import BaseEstimator 13 | 14 | 15 | def remove_outliers(df: pd.DataFrame, zscore: int = 3) -> pd.DataFrame: 16 | """ 17 | Removes all rows from the given DataFrame containing outliers in any of the columns. 18 | 19 | :param df: Input DataFrame. 20 | :param zscore: z-score to use when calculating outliers. 21 | :return: The DataFrame with all outliers removed. 22 | """ 23 | scores = (df - df.mean()) / df.std(ddof=0).values 24 | return df[(np.abs(scores) < zscore).all(axis=1)] 25 | 26 | 27 | def create_pipeline(df: pd.DataFrame, 28 | imputer_strategy: str = 'mean', 29 | imputer_cols: List[str] = None, 30 | scaler: BaseEstimator = StandardScaler, 31 | scaler_args: dict = None, 32 | cat_cols: List[str] = None, 33 | copy: bool = True): 34 | """ 35 | Creates a pipeline performing the following steps: 36 | - value imputation 37 | - value scaling 38 | - one-hot-encoding of categorical values. 39 | 40 | :param df: Input DataFrame. 41 | :param imputer_strategy: Imputer strategy applied to missing values. 42 | Allowed values are ['mean', 'median', 'most_frequent', 'constant']. 43 | :param imputer_cols: Columns to impute. If no columns are specified all columns will be imputed. 44 | :param scaler: Scikit-learn scaler to be applied to all values. 45 | :param scaler_args: Additional arguments forwarded to the specified scaler. 46 | :param cat_cols: Categorical columns to be one-hot-encoded. 47 | :param copy: If True, a copy of the input will be created. 48 | :return: A tuple containing the pipeline and a function returning the columns names after the pipeline has been 49 | fitted. 
50 | """ 51 | 52 | def create_get_feature_names(p, imp, scl, cat): 53 | def get_feature_names(): 54 | if not hasattr(p, 'transformers_'): 55 | raise AssertionError('Pipeline is not yet fitted.') 56 | 57 | try: 58 | cat_names = p.transformers_[2][1].get_feature_names(cat) 59 | except NotFittedError: 60 | cat_names = [] 61 | return np.append(imp, np.append(scl, cat_names)) 62 | 63 | return get_feature_names 64 | 65 | if scaler_args is None: 66 | scaler_args = {} 67 | 68 | cat_features = cat_cols if cat_cols else [] 69 | num_features = [c for c in df.select_dtypes(include=[np.number]).columns.values if c not in cat_features] 70 | imp_features: List[str] = [] 71 | 72 | if imputer_strategy is not None: 73 | imp_features = imputer_cols if imputer_cols else num_features 74 | 75 | scale_features = [f for f in num_features if f not in imp_features] 76 | 77 | imp_pipeline = Pipeline([ 78 | ('imputer', SimpleImputer(missing_values=np.nan, strategy=imputer_strategy, copy=copy)), 79 | ('imp_scaler', scaler(**scaler_args)) 80 | ]) 81 | 82 | pipeline = ColumnTransformer([ 83 | ('imp', imp_pipeline, imp_features), 84 | ('scl', scaler(**scaler_args), scale_features), 85 | ('one_hot', OneHotEncoder(categories='auto'), cat_features) 86 | ]) 87 | 88 | return pipeline, create_get_feature_names(pipeline, imp_features, scale_features, cat_features) 89 | -------------------------------------------------------------------------------- /ml_ids/transform/sampling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities to modify the amount of samples of specific categories in a datasets. 3 | """ 4 | import numpy as np 5 | import pandas as pd 6 | from imblearn.over_sampling import SMOTE, SMOTENC 7 | from typing import Tuple, List 8 | 9 | 10 | def upsample_minority_classes(X: np.ndarray, 11 | y: pd.DataFrame, 12 | min_samples: int, 13 | random_state: int = None, 14 | cat_cols: List[int] = None, 15 | n_jobs: int = 24) -> Tuple[np.ndarray, np.ndarray]: 16 | """ 17 | Synthetic up-sampling of minority classes using `imblearn.over_sampling.SMOTE`. 18 | 19 | :param X: Predictor variables. 20 | :param y: Labels. 21 | :param min_samples: Minimum samples of each class. 22 | :param random_state: Random state. 23 | :param cat_cols: Column indices of categorical features. 24 | :param n_jobs: Number of threads to use. 25 | :return: A tuple containing the up-sampled X and y values. 26 | """ 27 | counts = y.label_cat.value_counts() 28 | sample_dict = {} 29 | 30 | for i in np.unique(y.label_cat): 31 | sample_dict[i] = max(counts[i], min_samples) 32 | 33 | if cat_cols: 34 | smote = SMOTENC(sampling_strategy=sample_dict, 35 | categorical_features=cat_cols, 36 | n_jobs=n_jobs, 37 | random_state=random_state) 38 | else: 39 | smote = SMOTE(sampling_strategy=sample_dict, n_jobs=n_jobs, random_state=random_state) 40 | 41 | x_s, y_s = smote.fit_resample(X, y.label_cat) 42 | return x_s, y_s 43 | 44 | 45 | def create_sample_dict(df: pd.DataFrame, 46 | default_nr_samples: int, 47 | samples_per_label: dict = None) -> dict: 48 | """ 49 | Creates a dictionary containing the number of samples per label. 50 | 51 | :param df: Input DataFrame. 52 | :param default_nr_samples: Default number of samples per label. 53 | :param samples_per_label: Number of samples for specific labels. 54 | :return: Dictionary containing the number of samples per label. 
55 | """ 56 | if samples_per_label is None: 57 | samples_per_label = {} 58 | 59 | sample_dict = df.label_cat.value_counts().to_dict() 60 | 61 | for label in sample_dict.keys(): 62 | requested_samples = samples_per_label[label] if label in samples_per_label else default_nr_samples 63 | existing_samples = sample_dict[label] if label in sample_dict else 0 64 | sample_dict[label] = min(requested_samples, existing_samples) 65 | 66 | return sample_dict 67 | 68 | 69 | def downsample(df: pd.DataFrame, 70 | default_nr_samples: int, 71 | samples_per_label: dict = None, 72 | random_state: int = None) -> pd.DataFrame: 73 | """ 74 | Downsamples the given DataFrame to contain at most `default_nr_samples` per instance of label. 75 | 76 | :param df: Input DataFrame. 77 | :param default_nr_samples: Default number of samples per label. 78 | :param samples_per_label: Number of samples for specific labels. 79 | :param random_state: Random state. 80 | :return: The downsampled DataFrame. 81 | """ 82 | if samples_per_label is None: 83 | samples_per_label = {} 84 | 85 | sample_dict = create_sample_dict(df, default_nr_samples, samples_per_label) 86 | return pd.concat([df[df.label_cat == l].sample(n=n, random_state=random_state) for l, n in sample_dict.items()]) 87 | -------------------------------------------------------------------------------- /ml_ids/visualization.py: -------------------------------------------------------------------------------- 1 | """ 2 | Visualization utilities for IPython Notebooks. 3 | """ 4 | # pylint: disable=import-error 5 | import numpy as np 6 | import pandas as pd 7 | import seaborn as sns 8 | import matplotlib.pyplot as plt 9 | from matplotlib.ticker import MaxNLocator 10 | from sklearn.metrics import confusion_matrix, classification_report, average_precision_score, precision_recall_curve 11 | from IPython.display import display 12 | 13 | 14 | def plot_hist(hist, 15 | metrics=None, 16 | y_lim=None, 17 | size=(8, 5), 18 | ax=None): 19 | """ 20 | Plot a Keras history object. 21 | 22 | :param hist: The Keras history. 23 | :param metrics: A list of histories to plot. 24 | :param y_lim: Limits the y-axis. 25 | :param size: Size of the plot. 26 | :param ax: Axis to apply the plot. 27 | """ 28 | if metrics is None: 29 | metrics = ['loss', 'val_loss'] 30 | 31 | fig_size = size if not ax else None 32 | 33 | df = pd.DataFrame(hist.history)[metrics] 34 | df.plot(figsize=fig_size, ax=ax) 35 | 36 | gca = ax if ax else plt.gca() 37 | gca.xaxis.set_major_locator(MaxNLocator(integer=True)) 38 | 39 | if y_lim: 40 | gca.set_ylim(y_lim) 41 | 42 | if ax: 43 | ax.grid(True) 44 | else: 45 | plt.grid(True) 46 | plt.show() 47 | 48 | 49 | def plot_confusion_matrix(y_true, 50 | y_pred, 51 | classes=None, 52 | size=(10, 10), 53 | normalize=False, 54 | title=None, 55 | print_raw=False, 56 | cmap=plt.cm.Blues): 57 | """ 58 | This function prints and plots the confusion matrix. 59 | Normalization can be applied by setting `normalize=True`. 60 | 61 | :param y_true: True labels. 62 | :param y_pred: Predicted labels. 63 | :param classes: List of class names. 64 | :param size: Size of the plot. 65 | :param normalize: If True values of the confusion matrix will be normalized. 66 | :param title: Title of the plot. 67 | :param print_raw: If True the raw confusion matrix is printed. 
68 | :param cmap: Color map 69 | """ 70 | if not title: 71 | if normalize: 72 | title = 'Normalized confusion matrix' 73 | else: 74 | title = 'Confusion matrix, without normalization' 75 | 76 | # Compute confusion matrix 77 | cm = confusion_matrix(y_true, y_pred) 78 | 79 | if normalize: 80 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 81 | 82 | if print_raw: 83 | print(cm) 84 | 85 | fig, ax = plt.subplots(figsize=size) 86 | im = ax.matshow(cm, interpolation='nearest', cmap=cmap) 87 | ax.figure.colorbar(im, ax=ax) 88 | ax.set(title=title, 89 | ylabel='True label', 90 | xlabel='Predicted label') 91 | 92 | if classes is not None: 93 | x_labels = classes 94 | y_labels = classes 95 | 96 | ax.set(xticks=np.arange(cm.shape[1]), 97 | yticks=np.arange(cm.shape[0]), 98 | xticklabels=x_labels, 99 | yticklabels=y_labels) 100 | 101 | plt.margins(2) 102 | ax.tick_params(axis="x", bottom=True, labelbottom=True, top=False, labeltop=False, rotation=45) 103 | 104 | # Rotate the tick labels and set their alignment. 105 | plt.setp(ax.get_xticklabels(), rotation=45, ha="right", 106 | rotation_mode="anchor") 107 | 108 | # Loop over data dimensions and create text annotations. 109 | fmt = '.2f' if normalize else 'd' 110 | thresh = cm.max() / 2. 111 | for i in range(cm.shape[0]): 112 | for j in range(cm.shape[1]): 113 | ax.text(j, i, format(cm[i, j], fmt), 114 | ha="center", va="center", 115 | color="white" if cm[i, j] > thresh else "black") 116 | fig.tight_layout() 117 | return ax 118 | 119 | 120 | def identity(x): 121 | """ 122 | Identity function. 123 | """ 124 | return x 125 | 126 | 127 | def plot_threshold(pred_train, pred_val, threshold, size=(15, 5), transform=identity): 128 | """ 129 | Plots the reconstruction errors of training and test samples and displays the classification threshold. 130 | 131 | :param pred_train: Predictions of training samples. 132 | :param pred_val: Predictions of validation samples. 133 | :param threshold: Classification threshold. 134 | :param size: Size of the plot. 135 | :param transform: Value transformation. 136 | """ 137 | _, ax = plt.subplots(figsize=size) 138 | sns.distplot(transform(pred_train.rec_error.values), hist=False, ax=ax, label='Train Benign') 139 | sns.distplot(transform(pred_val[pred_val.y_true == 0].rec_error.values), hist=False, ax=ax, 140 | label='Validation Benign') 141 | sns.distplot(transform(pred_val[pred_val.y_true == 1].rec_error.values), hist=False, ax=ax, 142 | label='Validation Attack') 143 | ax.axvline(transform(threshold), color='red', linestyle='--') 144 | ax.legend() 145 | 146 | 147 | def get_misclassifications(y, y_true, pred): 148 | """ 149 | Calculates the misclassification rate for each label. 150 | 151 | :param y: Pandas DataFrame containing the target labels. 152 | :param y_true: True labels. 153 | :param pred: Predicted labels. 154 | :return: Pandas DataFrame containing the misclassification per label. 
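
    Example: if the dataset contains 10 'Bot' flows of which 2 are predicted
    incorrectly, the corresponding row contains (misclassified=2, total=10,
    percent_misclassified=0.2); rows are sorted by percent_misclassified in
    descending order.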
155 | """ 156 | misclassifications = y[y_true != pred] 157 | 158 | mc_df = pd.merge(pd.DataFrame({'misclassified': misclassifications.label.value_counts()}), 159 | pd.DataFrame({'total': y.label.value_counts()}), 160 | how='left', left_index=True, right_index=True) 161 | mc_df['percent_misclassified'] = mc_df.apply(lambda x: x[0] / x[1], axis=1) 162 | return mc_df.sort_values('percent_misclassified', ascending=False) 163 | 164 | 165 | def print_binary_performance(y, y_true, pred, print_misclassifications=True, digits=3): 166 | """ 167 | Prints the performance of a binary classifier using 168 | - the classification report, 169 | - the confusion matrix and 170 | - the misclassification report. 171 | 172 | :param y: Pandas DataFrame containing the target labels (binary, categories). 173 | :param y_true: True labels. 174 | :param pred: Predicted labels. 175 | :param print_misclassifications: Binary indicator instructing that the misclassification report should be printed. 176 | :param digits: Number of digits used to print the classification report. 177 | :return: None 178 | """ 179 | print('Classification Report:') 180 | print('======================') 181 | print(classification_report(y_true, pred, digits=digits)) 182 | 183 | print('Confusion Matrix:') 184 | print('=================') 185 | plot_confusion_matrix(y_true, pred, np.array(['Benign', 'Attack']), size=(5, 5)) 186 | plt.show() 187 | 188 | if print_misclassifications: 189 | print('Misclassifications by attack category:') 190 | print('======================================') 191 | mc_df = get_misclassifications(y, y_true, pred) 192 | display(mc_df) 193 | 194 | 195 | def plot_pr_curve(y_true, y_score, size=(8, 5), average='weighted'): 196 | """ 197 | Plots the precision-recall curve for a single estimator. 198 | 199 | :param y_true: True labels. 200 | :param y_score: Predicted probabilities. 201 | :param size: Size of the plot. 202 | :param average: Average parameter used for the calculation of the average precision score. 203 | :return: None 204 | """ 205 | precisions, recalls, _ = precision_recall_curve(y_true, y_score) 206 | pr_auc = average_precision_score(y_true, y_score, average=average) 207 | 208 | plt.figure(figsize=size) 209 | plt.plot(recalls, precisions, label='auc={}'.format(pr_auc)) 210 | plt.title('Precision / Recall Curve') 211 | plt.xlabel('Recall') 212 | plt.ylabel('Precision') 213 | plt.legend(loc='lower left') 214 | plt.show() 215 | 216 | print('Average PR Score {}'.format(pr_auc)) 217 | 218 | 219 | def plot_pr_curves(y_true, y_score_dict, size=(8, 5), average='weighted'): 220 | """ 221 | Plots the precision-recall curve for a multiple estimators. 222 | 223 | :param y_true: True labels. 224 | :param y_score_dict: Dictionary containing the estimator name as keys and the predicted label probabilities 225 | as values. 226 | :param size: Size of the plot. 227 | :param average: Average parameter used for the calculation of the average precision score. 
228 |     :return: None
229 |     """
230 |     plt.figure(figsize=size)
231 | 
232 |     for name, y_score in y_score_dict.items():
233 |         precisions, recalls, _ = precision_recall_curve(y_true, y_score)
234 |         pr_auc = average_precision_score(y_true, y_score, average=average)
235 |         plt.plot(recalls, precisions, label='{} (AUC={})'.format(name, pr_auc))
236 | 
237 |     plt.title('Precision / Recall Curve')
238 |     plt.xlabel('Recall')
239 |     plt.ylabel('Precision')
240 |     plt.legend(loc='lower left')
241 |     plt.show()
242 | 
243 | 
244 | def plot_pr_threshold_curves(y_true, y_pred_score, size=(20, 8)):
245 |     """
246 |     Plots the precision-recall values for different probability thresholds.
247 | 
248 |     :param y_true: True labels.
249 |     :param y_pred_score: Predicted probabilities.
250 |     :param size: Size of the plot.
251 |     :return: None
252 |     """
253 |     precisions, recalls, thresholds = precision_recall_curve(y_true, y_pred_score)
254 | 
255 |     # plot precision / recall for different thresholds
256 |     plt.figure(figsize=size)
257 |     plt.plot(thresholds, precisions[:-1], label="Precision")
258 |     plt.plot(thresholds, recalls[:-1], label="Recall")
259 |     plt.title('Precision / Recall of different thresholds')
260 |     plt.xlabel('Threshold')
261 |     plt.ylabel('Precision / Recall')
262 |     plt.legend(loc='lower right')
263 |     plt.show()
264 | 
--------------------------------------------------------------------------------
/models/gradient_boost/envs/local/train.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import shutil
4 | import click
5 | import mlflow
6 | 
7 | 
8 | def merge(dict1, dict2):
9 |     """
10 |     Merges two dictionaries by creating copies of the dictionaries.
11 |     :param dict1: First dictionary to merge
12 |     :param dict2: Second dictionary to merge
13 |     :return: Merged dictionary
14 |     """
15 |     d = dict(dict1)
16 |     d.update(dict2)
17 |     return d
18 | 
19 | 
20 | @click.command()
21 | @click.option('--train-path', type=click.Path(exists=True), required=True,
22 |               help='Path to the train dataset in .h5 format.')
23 | @click.option('--val-path', type=click.Path(exists=True), required=True,
24 |               help='Path to the validation dataset in .h5 format.')
25 | @click.option('--test-path', type=click.Path(exists=True), required=True,
26 |               help='Path to the test dataset in .h5 format.')
27 | @click.option('--output-path', type=click.Path(), required=True,
28 |               help='Path to store the output.')
29 | @click.option('--param-path', type=click.Path(exists=True), required=True,
30 |               help='Path to the training parameters.')
31 | def train(train_path, val_path, test_path, output_path, param_path):
32 |     with open(param_path, 'r') as f:
33 |         params = json.load(f)
34 | 
35 |     shutil.rmtree(output_path, ignore_errors=True)
36 |     os.makedirs(output_path, exist_ok=True)
37 | 
38 |     run_params = merge(params, {
39 |         'train_path': train_path,
40 |         'val_path': val_path,
41 |         'test_path': test_path,
42 |         'output_path': output_path,
43 |         'artifact_path': output_path,
44 |     })
45 | 
46 |     mlflow.run('models/gradient_boost/project',
47 |                parameters=run_params)
48 | 
49 | 
50 | if __name__ == '__main__':
51 |     train()
--------------------------------------------------------------------------------
/models/gradient_boost/envs/sagemaker/configs/deploy.json:
--------------------------------------------------------------------------------
1 | {
2 |   "deploy": {
3 |     "app_name": "ml-ids-classifier",
4 |     "instance_type": "ml.t2.medium",
5 |     "instance_count": 1,
6 |     "region": "eu-west-1"
7 |   },
8 |   "role":
"arn:aws:iam::763816190631:role/service-role/AmazonSageMaker-ExecutionRole-20191125T215860", 9 | "model_bucket": "s3://sagemaker-eu-west-1-763816190631", 10 | "model_artifact": "model.tar.gz", 11 | "model_name": "ml-ids-gb_mlflow_pyfunc" 12 | } -------------------------------------------------------------------------------- /models/gradient_boost/envs/sagemaker/configs/train-cpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "instance_type": "ml.m5.large", 4 | "instance_count": 1, 5 | "task_type": "CPU" 6 | }, 7 | "role": "arn:aws:iam::763816190631:role/service-role/AmazonSageMaker-ExecutionRole-20191125T215860", 8 | "data": { 9 | "train": "s3://ml-ids-2018-sm/training", 10 | "val": "s3://ml-ids-2018-sm/validation", 11 | "test": "s3://ml-ids-2018-sm/testing" 12 | }, 13 | "model_bucket": "s3://sagemaker-eu-west-1-763816190631" 14 | } -------------------------------------------------------------------------------- /models/gradient_boost/envs/sagemaker/configs/train-gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "instance_type": "ml.p2.xlarge", 4 | "instance_count": 1, 5 | "task_type": "GPU" 6 | }, 7 | "role": "arn:aws:iam::763816190631:role/service-role/AmazonSageMaker-ExecutionRole-20191125T215860", 8 | "data": { 9 | "train": "s3://ml-ids-2018-full/training", 10 | "val": "s3://ml-ids-2018-full/validation", 11 | "test": "s3://ml-ids-2018-full/testing" 12 | }, 13 | "model_bucket": "s3://sagemaker-eu-west-1-763816190631" 14 | } -------------------------------------------------------------------------------- /models/gradient_boost/envs/sagemaker/container/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-base 2 | 3 | # Install Miniconda 3 4 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 5 | ENV PATH /opt/conda/bin:$PATH 6 | 7 | RUN apt-get update --fix-missing && \ 8 | apt-get install -y wget bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 git mercurial subversion && \ 9 | apt-get clean 10 | 11 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh -O ~/miniconda.sh && \ 12 | /bin/bash ~/miniconda.sh -b -p /opt/conda && \ 13 | rm ~/miniconda.sh && \ 14 | /opt/conda/bin/conda clean -tipsy && \ 15 | ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ 16 | echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ 17 | echo "conda activate base" >> ~/.bashrc && \ 18 | find /opt/conda/ -follow -type f -name '*.a' -delete && \ 19 | find /opt/conda/ -follow -type f -name '*.js.map' -delete && \ 20 | /opt/conda/bin/conda clean -afy 21 | 22 | # Copy Conda environment file 23 | COPY models/gradient_boost/project/conda.yaml /opt/ml/code/conda.yaml 24 | 25 | # Install Conda environment 26 | RUN conda env create -f /opt/ml/code/conda.yaml 27 | 28 | # Copy project files 29 | ADD ml_ids /opt/ml/code/ml_ids 30 | ADD models/gradient_boost/project /opt/ml/code/models/gradient_boost/project 31 | COPY setup.cfg /opt/ml/code/setup.cfg 32 | COPY setup.py /opt/ml/code/setup.py 33 | 34 | # Activate conda env 35 | RUN echo "source activate ml-ids-gradient-boost-catboost" > ~/.bashrc 36 | ENV PATH /opt/conda/envs/ml-ids-gradient-boost-catboost/bin:$PATH 37 | 38 | # Copy train script and make it executable 39 | COPY models/gradient_boost/envs/sagemaker/container/train.py /opt/ml/code/train 40 | RUN chmod +x /opt/ml/code/train 41 | ENV PATH="/opt/ml/code:${PATH}" 42 | 43 | WORKDIR /opt/ml/code -------------------------------------------------------------------------------- /models/gradient_boost/envs/sagemaker/container/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import json 6 | import traceback 7 | import uuid 8 | import mlflow 9 | 10 | prefix = '/opt/ml/' 11 | 12 | output_path = os.path.join(prefix, 'output') 13 | model_path = os.path.join(prefix, 'model') 14 | param_path = os.path.join(prefix, 'input/config/hyperparameters.json') 15 | 16 | input_path = prefix + 'input/data' 17 | training_path = os.path.join(input_path, 'training') 18 | validation_path = os.path.join(input_path, 'validation') 19 | testing_path = os.path.join(input_path, 'testing') 20 | 21 | mlflow_project_uri = os.path.join(prefix, 'code/models/gradient_boost/project') 22 | mlflow_out_path = os.path.join('/tmp', str(uuid.uuid4())) 23 | 24 | 25 | def merge(dict1, dict2): 26 | d = dict(dict1) 27 | d.update(dict2) 28 | return d 29 | 30 | 31 | if __name__ == '__main__': 32 | print('Starting the training') 33 | 34 | try: 35 | with open(param_path, 'r') as tc: 36 | training_params = json.load(tc) 37 | 38 | training_file_path = os.path.join(training_path, 'train.h5') 39 | validation_file_path = os.path.join(validation_path, 'val.h5') 40 | testing_file_path = os.path.join(testing_path, 'test.h5') 41 | 42 | mlflow_params = merge(training_params, { 43 | 'train_path': training_file_path, 44 | 'val_path': validation_file_path, 45 | 'test_path': testing_file_path, 46 | 'output_path': mlflow_out_path, 47 | 'artifact_path': model_path 48 | }) 49 | 50 | os.makedirs(mlflow_out_path, exist_ok=True) 51 | 52 | mlflow.run(mlflow_project_uri, parameters=mlflow_params, use_conda=False) 53 | print('Training complete.') 54 | 55 | sys.exit(0) 56 | except Exception as e: 57 | # Write out an error file. This will be returned as the failureReason in the 58 | # DescribeTrainingJob result. 59 | trc = traceback.format_exc() 60 | with open(os.path.join(output_path, 'failure'), 'w') as s: 61 | s.write('Exception during training: ' + str(e) + '\n' + trc) 62 | # Printing this causes the exception to be in the training job logs, as well. 63 | print('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr) 64 | # A non-zero exit code causes the training job to be marked as Failed. 
65 |         sys.exit(255)
66 | 
--------------------------------------------------------------------------------
/models/gradient_boost/envs/sagemaker/scripts/build_image.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | image_name=$1
4 | image_version=$2
5 | 
6 | if [ "$image_name" == "" ]
7 | then
8 |     echo "Usage: $0 <image-name> <image-version>"
9 |     exit 1
10 | fi
11 | 
12 | if [ "$image_version" == "" ]
13 | then
14 |     echo "Usage: $0 <image-name> <image-version>"
15 |     exit 1
16 | fi
17 | 
18 | fullname="${image_name}:${image_version}"
19 | 
20 | echo "Building image '${fullname}'"
21 | 
22 | docker build -f models/gradient_boost/envs/sagemaker/container/Dockerfile -t ${fullname} .
--------------------------------------------------------------------------------
/models/gradient_boost/envs/sagemaker/scripts/deploy.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import re
4 | import tarfile
5 | import boto3
6 | import click
7 | from mlflow import sagemaker
8 | 
9 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | def unpack(file):
14 |     """
15 |     Unpacks compressed files of format `tar` and `tar.gz`.
16 |     :param file: Filename.
17 |     :return: None
18 |     """
19 |     if file.endswith("tar.gz"):
20 |         tar = tarfile.open(file, "r:gz")
21 |         tar.extractall()
22 |         tar.close()
23 |     elif file.endswith("tar"):
24 |         tar = tarfile.open(file, "r:")
25 |         tar.extractall()
26 |         tar.close()
27 | 
28 | 
29 | @click.command()
30 | @click.option('--config-path', type=click.Path(exists=True), required=True,
31 |               help='Path to the config.')
32 | @click.option('--job-id', type=str, required=True,
33 |               help='Unique ID of the training job. Model is retrieved from a subdirectory with this name.')
34 | def deploy(config_path, job_id):
35 |     with open(config_path, 'r') as f:
36 |         config = json.load(f)
37 | 
38 |     app_name = config['deploy']['app_name']
39 |     instance_type = config['deploy']['instance_type']
40 |     instance_count = config['deploy']['instance_count']
41 |     region = config['deploy']['region']
42 |     role = config['role']
43 |     model_name = config['model_name']
44 |     model_bucket = re.sub('s3://', '', config['model_bucket'])
45 |     model_artifact = config['model_artifact']
46 |     model_path = '{}/output/{}'.format(job_id, model_artifact)
47 | 
48 |     logger.info('Deploying model with parameters '
49 |                 '[app-name="{}", instance-type="{}", instance-count={}, region="{}", model-path="{}"]'
50 |                 .format(app_name, instance_type, instance_count, region, model_path))
51 | 
52 |     s3 = boto3.client('s3')
53 |     s3.download_file(model_bucket, model_path, model_artifact)
54 | 
55 |     unpack(model_artifact)
56 | 
57 |     sagemaker.deploy(app_name=app_name,
58 |                      model_uri=model_name,
59 |                      execution_role_arn=role,
60 |                      region_name=region,
61 |                      mode='replace',
62 |                      instance_type=instance_type,
63 |                      instance_count=instance_count)
64 | 
65 | 
66 | if __name__ == '__main__':
67 |     deploy()
--------------------------------------------------------------------------------
/models/gradient_boost/envs/sagemaker/scripts/push_image_to_ecr.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | image_name=$1
4 | image_version=$2
5 | 
6 | if [ "$image_name" == "" ]
7 | then
8 |     echo "Usage: $0 <image-name> <image-version>"
9 |     exit 1
10 | fi
11 | 
12 | if [ "$image_version" == "" ]
13 | then
14 |     echo "Usage: $0 <image-name> <image-version>"
15 |     exit 1
16 | fi
17 | 
18 | # Get the account number associated with the current IAM credentials
19 | account=$(aws sts get-caller-identity --query Account --output text)
20 | 
21 | if [ $? -ne 0 ]
22 | then
23 |     exit 255
24 | fi
25 | 
26 | # Get the region defined in the current configuration (default to eu-west-1 if none defined)
27 | region=$(aws configure get region)
28 | region=${region:-eu-west-1}
29 | 
30 | fullname="${account}.dkr.ecr.${region}.amazonaws.com/${image_name}:${image_version}"
31 | 
32 | # If the repository doesn't exist in ECR, create it.
33 | 
34 | aws ecr describe-repositories --repository-names "${image_name}" > /dev/null 2>&1
35 | 
36 | if [ $? -ne 0 ]
37 | then
38 |     aws ecr create-repository --repository-name "${image_name}" > /dev/null
39 | fi
40 | 
41 | # Get the login command from ECR and execute it directly
42 | $(aws ecr get-login --region ${region} --no-include-email)
43 | 
44 | # Build the docker image locally with the image name and then push it to ECR
45 | # with the full name.
46 | 
47 | docker tag "${image_name}:${image_version}" ${fullname}
48 | docker push ${fullname}
49 | 
50 | echo "image-name=${fullname}"
--------------------------------------------------------------------------------
/models/gradient_boost/envs/sagemaker/scripts/train.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import click
4 | from sagemaker.estimator import Estimator
5 | 
6 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
7 | logger = logging.getLogger(__name__)
8 | 
9 | 
10 | def create_performance_metric_regex(id):
11 |     """
12 |     Creates the regex for a single performance metric.
13 |     Expected log format: "metric_name: 0.12345".
14 |     :param id: Metric identifier.
15 | :return: Regex 16 | """ 17 | return rf'{id}:\s*([\d.]*)' 18 | 19 | 20 | def create_metric_def(name, regex): 21 | """ 22 | Creates a metric definition for a single metric. 23 | :param name: Metric name. 24 | :param regex: Metric regex. 25 | :return: Metric definition as a `dict`. 26 | """ 27 | return {'Name': name, 'Regex': regex} 28 | 29 | 30 | def get_metric_definitions(): 31 | """ 32 | Creates the definitions for all metrics to monitor. 33 | :return: Metric definitions as a `list`. 34 | """ 35 | return [create_metric_def('train:loss', create_performance_metric_regex('learn')), 36 | create_metric_def('val:loss', create_performance_metric_regex('test')), 37 | create_metric_def('val:loss:best', r'bestTest\s=\s([\d.]*)'), 38 | create_metric_def('test:pr_auc', create_performance_metric_regex('pr_auc')), 39 | create_metric_def('test:precision', create_performance_metric_regex('precision')), 40 | create_metric_def('test:recall', create_performance_metric_regex('recall')), 41 | create_metric_def('test:f1', create_performance_metric_regex('f1'))] 42 | 43 | 44 | @click.command() 45 | @click.option('--config-path', type=click.Path(exists=True), required=True, 46 | help='Path to the config.') 47 | @click.option('--param-path', type=click.Path(exists=True), required=True, 48 | help='Path to the training parameters.') 49 | @click.option('--image-name', type=str, required=True, 50 | help='Name of the training image') 51 | @click.option('--mode', type=click.Choice(['LOCAL', 'AWS'], case_sensitive=False), default='LOCAL', 52 | help='Training mode.') 53 | @click.option('--job-id', type=str, required=True, 54 | help='Unique ID of the training job. Model outputs will be stored in a subdirectory with this name.') 55 | def train(config_path, param_path, image_name, mode, job_id): 56 | with open(config_path, 'r') as f: 57 | config = json.load(f) 58 | 59 | with open(param_path, 'r') as f: 60 | params = json.load(f) 61 | 62 | if mode == 'LOCAL': 63 | train_instance_type = 'local' 64 | params['task_type'] = 'CPU' 65 | else: 66 | train_instance_type = config['train']['instance_type'] 67 | params['task_type'] = config['train']['task_type'] 68 | 69 | train_instance_count = config['train']['instance_count'] 70 | role = config['role'] 71 | model_bucket = config['model_bucket'] 72 | 73 | logger.info('Start training with parameters ' 74 | '[job-id="{}", image="{}", mode="{}", instance_type="{}", instance_count={}, params={}]' 75 | .format(job_id, image_name, mode, train_instance_type, train_instance_count, params)) 76 | 77 | estimator = Estimator(image_name=image_name, 78 | role=role, 79 | train_instance_count=train_instance_count, 80 | train_instance_type=train_instance_type, 81 | hyperparameters=params, 82 | output_path=model_bucket, 83 | metric_definitions=get_metric_definitions(), 84 | train_max_run=(2 * 60 * 60)) 85 | 86 | estimator.fit(job_name=job_id, 87 | inputs={ 88 | 'training': config['data']['train'], 89 | 'validation': config['data']['val'], 90 | 'testing': config['data']['test'] 91 | }) 92 | 93 | 94 | if __name__ == '__main__': 95 | train() 96 | -------------------------------------------------------------------------------- /models/gradient_boost/envs/sagemaker/scripts/undeploy.py: -------------------------------------------------------------------------------- 1 | import click 2 | import json 3 | from mlflow import sagemaker 4 | 5 | 6 | @click.command() 7 | @click.option('--config-path', type=click.Path(exists=True), required=True, 8 | help='Path to the config.') 9 | def undeploy(config_path): 10 | 
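    """
    Deletes the SageMaker deployment of the application `app_name` in region
    `region`, both read from the given config file, via `mlflow.sagemaker.delete`.
    """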
    with open(config_path, 'r') as f:
        config = json.load(f)

    app_name = config['deploy']['app_name']
    region = config['deploy']['region']

    sagemaker.delete(app_name=app_name, region_name=region)


if __name__ == '__main__':
    undeploy()

--------------------------------------------------------------------------------
/models/gradient_boost/project/MLproject:
--------------------------------------------------------------------------------
name: gradient_boost_model

conda_env: conda.yaml

entry_points:
  main:
    parameters:
      train_path: path
      val_path: path
      test_path: path
      output_path: path
      artifact_path: path
      use_val_set: {type: bool, default: True}
      nr_iterations: {type: int, default: 1000}
      tree_depth: {type: int, default: 6}
      l2_reg: {type: float, default: 3.0}
      border_count: {type: int, default: 254}
      random_strength: {type: int, default: 1}
      task_type: {type: str, default: 'GPU'}
      nr_samples_attack_category: {type: int, default: 1000}
      random_seed: {type: int, default: -1}
    command: "pip install -e ../../../. &&
              python train.py --train-path {train_path}
                              --val-path {val_path}
                              --test-path {test_path}
                              --output-path {output_path}
                              --artifact-path {artifact_path}
                              --use-val-set {use_val_set}
                              --random-seed {random_seed}
                              --nr-iterations {nr_iterations}
                              --tree-depth {tree_depth}
                              --l2-reg {l2_reg}
                              --border-count {border_count}
                              --random-strength {random_strength}
                              --task-type {task_type}
                              --nr-samples-attack-category {nr_samples_attack_category}"
--------------------------------------------------------------------------------
/models/gradient_boost/project/conda.yaml:
--------------------------------------------------------------------------------
name: ml-ids-gradient-boost-catboost
channels:
  - anaconda
  - conda-forge
  - defaults
dependencies:
  - python=3.7
  - pip=19.2.3=py37_0
  - pandas=0.25.2=py37hb3f55d8_0
  - catboost=0.18.1=py37_0
  - imbalanced-learn=0.5.0=py_0
  - scikit-learn=0.21.3=py37hcdab131_0
  - scipy=1.3.1=py37h921218d_2
  - click=7.0=py37_0
  - cloudpickle=1.2.2=py_0
  - pip:
      - tables==3.6.1
      - keras==2.2.4
      - mlflow==1.4
--------------------------------------------------------------------------------
/models/gradient_boost/project/train.py:
--------------------------------------------------------------------------------
import click
import logging
import mlflow
import mlflow.pyfunc
import pickle
import os
import shutil
from catboost import Pool
from ml_ids.data.dataset import load_dataset_hdf
from ml_ids.data.metadata import FEATURES_NO_VARIANCE, FEATURES_TO_IGNORE, FEATURES_PRESERVE_NEG_COLUMNS
from ml_ids.prediction import predict_proba_positive
from ml_ids.model_selection import split_x_y
from ml_ids.models.gradient_boost.train import train_model, GradientBoostHyperParams
from ml_ids.models.gradient_boost.mlflow_wrapper import CatBoostWrapper
from sklearn.metrics import average_precision_score, precision_score, recall_score, f1_score

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def load_dataset(path):
    """
    Loads a single dataset in `hdf` format.
    :param path: Dataset path.
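        Columns listed in FEATURES_NO_VARIANCE and FEATURES_TO_IGNORE are
        dropped on load; negative values are preserved only for the columns
        in FEATURES_PRESERVE_NEG_COLUMNS.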
    :return: Pandas DataFrame.
    """
    return load_dataset_hdf(dataset_path=path,
                            omit_cols=FEATURES_NO_VARIANCE + FEATURES_TO_IGNORE,
                            preserve_neg_value_cols=FEATURES_PRESERVE_NEG_COLUMNS)


def load_train_val_test_dataset(train_path, val_path, test_path):
    """
    Loads the train, validation and test datasets.
    :param train_path: Path to the train dataset.
    :param val_path: Path to the validation dataset.
    :param test_path: Path to the test dataset.
    :return: A `Tuple(train, val, test)` of Pandas DataFrames.
    """
    return load_dataset(train_path), load_dataset(val_path), load_dataset(test_path)


def measure_performance(clf, pipeline, dataset):
    """
    Measures performance metrics on the given dataset.
    :param clf: Classifier to test.
    :param pipeline: Preprocessing pipeline.
    :param dataset: Dataset.
    :return: A `Tuple(pr_auc, precision, recall, f1)`.
    """
    X, y = split_x_y(dataset)
    X = pipeline.transform(X)

    pool = Pool(X)
    y_true = y.label_is_attack

    pred_proba = predict_proba_positive(clf, pool)
    pred = clf.predict(pool)

    pr_auc = average_precision_score(y_true, pred_proba)
    precision = precision_score(y_true, pred)
    recall = recall_score(y_true, pred)
    f1 = f1_score(y_true, pred)
    return pr_auc, precision, recall, f1


def save_artifacts(cbm_model_path, classifier, pipeline_path, pipeline, col_config_path, column_config):
    """
    Save training artifacts to disk.
    :param cbm_model_path: Path on disk where the classifier should be stored.
    :param classifier: Classifier to store.
    :param pipeline_path: Path on disk where the pipeline should be stored.
    :param pipeline: Pipeline to store.
    :param col_config_path: Path on disk where the config should be stored.
    :param column_config: Column config to store.
    :return: None
    """
    classifier.save_model(cbm_model_path)
    with open(pipeline_path, 'wb') as f:
        pickle.dump(pipeline, f)
    with open(col_config_path, 'wb') as f:
        pickle.dump(column_config, f)


@click.command()
@click.option('--train-path', type=click.Path(exists=True), required=True,
              help='Path to the train dataset in .h5 format.')
@click.option('--val-path', type=click.Path(exists=True), required=True,
              help='Path to the validation dataset in .h5 format.')
@click.option('--test-path', type=click.Path(exists=True), required=True,
              help='Path to the test dataset in .h5 format.')
@click.option('--output-path', type=click.Path(exists=True), required=True,
              help='Path to store the output.')
@click.option('--artifact-path', type=click.Path(exists=True), required=True,
              help='Path to store the artifacts.')
@click.option('--use-val-set', type=bool, default=True,
              help='Determines if the evaluation dataset should be used for early stopping of the training '
                   'process. If set to False the evaluation dataset will be appended to the train dataset.')
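# The hyperparameter options below mirror the MLproject entry point. A
# --random-seed of -1 (the MLproject default) is mapped to None in train()
# below, so that no fixed seed is used.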
@click.option('--random-seed', type=int, default=None,
              help='Random seed.')
@click.option('--nr-iterations', type=int, required=True)
@click.option('--tree-depth', type=int, required=True)
@click.option('--l2-reg', type=float, required=True)
@click.option('--border-count', type=int, required=True)
@click.option('--random-strength', type=int, required=True)
@click.option('--task-type', type=click.Choice(['CPU', 'GPU'], case_sensitive=False), required=True)
@click.option('--nr-samples-attack-category', type=int, required=True)
def train(train_path,
          val_path,
          test_path,
          output_path,
          artifact_path,
          use_val_set,
          random_seed,
          nr_iterations,
          tree_depth,
          l2_reg,
          border_count,
          random_strength,
          task_type,
          nr_samples_attack_category):
    shutil.rmtree(output_path, ignore_errors=True)
    os.makedirs(output_path, exist_ok=True)

    cbm_model_path = os.path.join(output_path, 'gradient_boost_model.cbm')
    pipeline_path = os.path.join(output_path, 'preprocessing_pipeline.pkl')
    col_config_path = os.path.join(output_path, 'column_config.pkl')
    mlflow_model_path = os.path.join(artifact_path, 'ml-ids-gb_mlflow_pyfunc')

    random_seed = None if random_seed == -1 else random_seed

    logger.info('Loading datasets...')
    train_dataset, val_dataset, test_dataset = load_train_val_test_dataset(train_path, val_path, test_path)

    if not use_val_set:
        logger.info('Evaluation dataset will not be used for early stopping. Merging with training dataset.')
        train_dataset = train_dataset.append(val_dataset)
        val_dataset = None
    else:
        logger.info('Evaluation dataset will be used for early stopping.')

    hyper_params = GradientBoostHyperParams(nr_iterations=nr_iterations,
                                            tree_depth=tree_depth,
                                            l2_reg=l2_reg,
                                            border_count=border_count,
                                            random_strength=random_strength,
                                            task_type=task_type)

    with mlflow.start_run():
        logger.info('Starting training...')
        clf, pipeline, column_names = train_model(train_dataset,
                                                  val_dataset,
                                                  hyper_params=hyper_params,
                                                  nr_attack_samples=nr_samples_attack_category,
                                                  random_seed=random_seed)

        pr_auc, precision, recall, f1 = measure_performance(clf, pipeline, test_dataset)
        logger.info('Estimator performance:')
        logger.info('pr_auc: %f', pr_auc)
        logger.info('precision: %f', precision)
        logger.info('recall: %f', recall)
        logger.info('f1: %f', f1)

        save_artifacts(cbm_model_path,
                       clf,
                       pipeline_path,
                       pipeline,
                       col_config_path,
                       {
                           'col_names': column_names,
                           'preserve_neg_vals': FEATURES_PRESERVE_NEG_COLUMNS
                       })

        mlflow.pyfunc.save_model(
            path=mlflow_model_path,
            python_model=CatBoostWrapper(),
            artifacts={
                'cbm_model': cbm_model_path,
                'pipeline': pipeline_path,
                'col_config': col_config_path
            },
            conda_env='conda.yaml',
            code_path=['../../../ml_ids'])

    logger.info('Training completed.')


if __name__ == '__main__':
    train()

--------------------------------------------------------------------------------
/models/gradient_boost/training_params.json:
--------------------------------------------------------------------------------
{
2 | "task_type": "GPU", 3 | "use_val_set": true, 4 | "nr_iterations": 2000, 5 | "tree_depth": 10, 6 | "l2_reg": 4.813919374945952, 7 | "border_count": 254, 8 | "random_strength": 5, 9 | "nr_samples_attack_category": 100000 10 | } -------------------------------------------------------------------------------- /models/gradient_boost/training_params_quick_run.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_type": "GPU", 3 | "nr_iterations": 10, 4 | "nr_samples_attack_category": 1000, 5 | "random_seed": 42 6 | } -------------------------------------------------------------------------------- /notebooks/03_ml-prototype/models/gradient_boost_model.cbm: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f9ff34d59ef5e2a1040b921b0b1d7565c63e4fd8d9bf4d080cf31a5e9ee13fc0 3 | size 14315968 4 | -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00000-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00000-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00001-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00001-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00002-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00002-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00003-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00003-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00004-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00004-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00005-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00005-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00007-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00007-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00008-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00008-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00009-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00009-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00010-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00010-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00011-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00011-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- 
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00013-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00013-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00014-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00014-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00015-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00015-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00016-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00016-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00017-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00017-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00019-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00019-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00020-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00020-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00021-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00021-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00022-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00022-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00023-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00023-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/_SUCCESS -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00000-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00000-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00001-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00001-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00002-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00002-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00003-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00003-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00004-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00004-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00005-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00005-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00007-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00007-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00008-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00008-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00009-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00009-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00010-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00010-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00011-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00011-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00013-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00013-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00014-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00014-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00015-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00015-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00016-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00016-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00017-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00017-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- 
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00019-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00019-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00020-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00020-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00021-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00021-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00022-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00022-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00023-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00023-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/metadata/.part-00000.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/metadata/_SUCCESS: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/metadata/_SUCCESS -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.ml.classification.GBTClassificationModel","timestamp":1568299022309,"sparkVersion":"2.4.4","uid":"GBTClassifier_0f5cdab6ac21","paramMap":{"seed":42,"maxDepth":5,"labelCol":"label_is_attack","stepSize":0.5,"featuresCol":"features","maxIter":20},"defaultParamMap":{"rawPredictionCol":"rawPrediction","minInstancesPerNode":1,"impurity":"gini","minInfoGain":0.0,"seed":-715221063584165447,"maxDepth":5,"labelCol":"label","featureSubsetStrategy":"all","subsamplingRate":1.0,"probabilityCol":"probability","maxMemoryInMB":256,"cacheNodeIds":false,"validationTol":0.01,"stepSize":0.1,"checkpointInterval":10,"maxBins":32,"lossType":"logistic","predictionCol":"prediction","featuresCol":"features","maxIter":20},"numFeatures":47,"numTrees":20} 2 | -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00000-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00000-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00001-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00001-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00002-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00002-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00003-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00003-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00004-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00004-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00005-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00005-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00006-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00006-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00007-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00007-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00008-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00008-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00009-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00009-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc 
-------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00010-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00010-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00011-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00011-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00012-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00012-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00013-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00013-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00014-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00014-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00015-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00015-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00016-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00016-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00017-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00017-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00018-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00018-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00019-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00019-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/_SUCCESS -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00000-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00000-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00001-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00001-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet -------------------------------------------------------------------------------- 
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00002-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00002-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00003-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00003-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00004-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00004-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00005-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00005-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00006-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00006-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00007-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00007-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00008-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00008-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00009-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00009-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00010-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00010-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00011-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00011-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00012-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00012-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00013-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00013-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00014-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00014-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00015-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00015-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00016-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00016-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00017-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00017-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00018-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00018-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00019-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00019-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/estimator/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/estimator/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/estimator/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/estimator/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/estimator/metadata/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/estimator/metadata/part-00000:
--------------------------------------------------------------------------------
{"class":"org.apache.spark.ml.classification.GBTClassifier","timestamp":1568299022265,"sparkVersion":"2.4.4","uid":"GBTClassifier_0f5cdab6ac21","paramMap":{"seed":42,"labelCol":"label_is_attack","featuresCol":"features"},"defaultParamMap":{"rawPredictionCol":"rawPrediction","minInstancesPerNode":1,"impurity":"gini","minInfoGain":0.0,"seed":-715221063584165447,"maxDepth":5,"labelCol":"label","featureSubsetStrategy":"all","subsamplingRate":1.0,"probabilityCol":"probability","maxMemoryInMB":256,"cacheNodeIds":false,"validationTol":0.01,"stepSize":0.1,"checkpointInterval":10,"maxBins":32,"lossType":"logistic","predictionCol":"prediction","featuresCol":"features","maxIter":20}}
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/evaluator/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/evaluator/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/evaluator/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/evaluator/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/evaluator/metadata/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/evaluator/metadata/part-00000:
--------------------------------------------------------------------------------
{"class":"org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator","timestamp":1568299022228,"sparkVersion":"2.4.4","uid":"MulticlassClassificationEvaluator_2045e84d5752","paramMap":{"metricName":"weightedRecall","predictionCol":"prediction","labelCol":"label_is_attack"},"defaultParamMap":{"metricName":"f1","predictionCol":"prediction","labelCol":"label"}}
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/metadata/.part-00000.crc
--------------------------------------------------------------------------------
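
The two JSON metadata files above record the persisted estimator and its evaluation metric: a GBTClassifier with an explicit seed, label column and feature column, scored by weighted recall. A minimal PySpark sketch of that configuration (Spark 2.4-era API; everything not listed in the paramMap keeps its Spark default):

from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# explicit parameters taken from the persisted paramMap; the remaining
# settings (maxDepth=5, maxIter=20, stepSize=0.1, ...) are Spark defaults
gbt = GBTClassifier(seed=42,
                    labelCol='label_is_attack',
                    featuresCol='features')

evaluator = MulticlassClassificationEvaluator(metricName='weightedRecall',
                                              predictionCol='prediction',
                                              labelCol='label_is_attack')
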
/notebooks/04_ml-prototype-spark/models/gb-model/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/metadata/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/metadata/part-00000:
--------------------------------------------------------------------------------
{"class":"org.apache.spark.ml.tuning.CrossValidatorModel","timestamp":1568299022196,"sparkVersion":"2.4.4","uid":"CrossValidatorModel_4f73491e9469","paramMap":{"seed":880116102,"numFolds":3,"estimatorParamMaps":[[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"10","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"10","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.05","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"10","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"20","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"20","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.05","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"20","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"3","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"10","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"3","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"10","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"3","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.05","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"10","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"3","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"20","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"3","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"20","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"3","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.05","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"20","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"10","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"10","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.05","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"10","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"20","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"20","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.05","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"20","isJson":"true"}]]},"defaultParamMap":{"seed":880116102,"numFolds":3},"avgMetrics":[],"persistSubModels":false}
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/metadata/_SUCCESS
--------------------------------------------------------------------------------
{"class":"pyspark.ml.pipeline.PipelineModel","timestamp":1568299021142,"sparkVersion":"2.4.4","uid":"PipelineModel_aec8567ff127","paramMap":{"stageUids":["ValueCleaner_57f061a9e393","Imputer_3f8cf4b571a8","OneHotEncoderEstimator_f1dc6e50f52e","VectorAssembler_ef6b7bf933ee","BinaryLabelMaker_3b174e5e0c29"],"language":"Python"},"defaultParamMap":{}} 2 | -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/.part-00000.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/_SUCCESS -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"__main__.ValueCleaner","timestamp":1568299021262,"sparkVersion":"2.4.4","uid":"ValueCleaner_57f061a9e393","paramMap":{"inputCols":["flow_duration","flow_byts_s","flow_pkts_s","flow_iat_min","fwd_iat_tot","fwd_iat_min","init_fwd_win_byts","init_bwd_win_byts","fwd_seg_size_min"],"outputCols":["flow_duration_clean","flow_byts_s_clean","flow_pkts_s_clean","flow_iat_min_clean","fwd_iat_tot_clean","fwd_iat_min_clean","init_fwd_win_byts_clean","init_bwd_win_byts_clean","fwd_seg_size_min_clean"]},"defaultParamMap":{}} 2 | -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/.part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/.part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/_SUCCESS: -------------------------------------------------------------------------------- 
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/part-00000:
--------------------------------------------------------------------------------
{"class":"__main__.ValueCleaner","timestamp":1568299021262,"sparkVersion":"2.4.4","uid":"ValueCleaner_57f061a9e393","paramMap":{"inputCols":["flow_duration","flow_byts_s","flow_pkts_s","flow_iat_min","fwd_iat_tot","fwd_iat_min","init_fwd_win_byts","init_bwd_win_byts","fwd_seg_size_min"],"outputCols":["flow_duration_clean","flow_byts_s_clean","flow_pkts_s_clean","flow_iat_min_clean","fwd_iat_tot_clean","fwd_iat_min_clean","init_fwd_win_byts_clean","init_bwd_win_byts_clean","fwd_seg_size_min_clean"]},"defaultParamMap":{}}
--------------------------------------------------------------------------------
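
The ValueCleaner stage maps nine raw flow features to '<name>_clean' columns. Its implementation lives in the notebook, not in this dump; judging by its name and by the Imputer stage that follows, it plausibly nulls out invalid readings so they can be imputed. A hypothetical sketch of such a transformer (Param plumbing and persistence support via DefaultParamsWritable are omitted for brevity; the notebook's actual class may differ):

from pyspark.ml import Transformer
import pyspark.sql.functions as F


class ValueCleaner(Transformer):
    """Copies each input column to an output column, replacing +/-Infinity with null."""

    def __init__(self, inputCols, outputCols):
        super(ValueCleaner, self).__init__()
        self.input_cols = inputCols
        self.output_cols = outputCols

    def _transform(self, df):
        for in_col, out_col in zip(self.input_cols, self.output_cols):
            df = df.withColumn(out_col,
                               F.when(F.col(in_col).isin(float('inf'), float('-inf')), None)
                               .otherwise(F.col(in_col)))
        return df


# column lists taken from the persisted paramMap above
cols = ['flow_duration', 'flow_byts_s', 'flow_pkts_s', 'flow_iat_min',
        'fwd_iat_tot', 'fwd_iat_min', 'init_fwd_win_byts',
        'init_bwd_win_byts', 'fwd_seg_size_min']
value_cleaner = ValueCleaner(inputCols=cols, outputCols=[c + '_clean' for c in cols])
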
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/.part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/.part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/metadata/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/metadata/part-00000:
--------------------------------------------------------------------------------
{"class":"org.apache.spark.ml.feature.ImputerModel","timestamp":1568299021353,"sparkVersion":"2.4.4","uid":"Imputer_3f8cf4b571a8","paramMap":{"inputCols":["flow_duration_clean","flow_byts_s_clean","flow_pkts_s_clean","flow_iat_min_clean","fwd_iat_tot_clean","fwd_iat_min_clean","init_fwd_win_byts_clean","init_bwd_win_byts_clean","fwd_seg_size_min_clean"],"outputCols":["flow_duration_imputed","flow_byts_s_imputed","flow_pkts_s_imputed","flow_iat_min_imputed","fwd_iat_tot_imputed","fwd_iat_min_imputed","init_fwd_win_byts_imputed","init_bwd_win_byts_imputed","fwd_seg_size_min_imputed"]},"defaultParamMap":{"missingValue":"NaN","strategy":"mean"}}
--------------------------------------------------------------------------------
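
The ImputerModel metadata shows that only inputCols and outputCols were set explicitly; the imputation strategy ('mean') and missing-value marker (NaN) appear in defaultParamMap because the defaults were kept. A sketch of the corresponding stage, reusing the cols list from the previous snippet:

from pyspark.ml.feature import Imputer

# mean-imputes each '<col>_clean' column into a '<col>_imputed' column
imputer = Imputer(inputCols=[c + '_clean' for c in cols],
                  outputCols=[c + '_imputed' for c in cols])
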
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/data/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/data/.part-00000-c909fe56-90d1-4202-a5f4-69907defba9a-c000.snappy.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/data/.part-00000-c909fe56-90d1-4202-a5f4-69907defba9a-c000.snappy.parquet.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/data/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/data/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/data/part-00000-c909fe56-90d1-4202-a5f4-69907defba9a-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/data/part-00000-c909fe56-90d1-4202-a5f4-69907defba9a-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/metadata/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/metadata/part-00000:
--------------------------------------------------------------------------------
{"class":"org.apache.spark.ml.feature.OneHotEncoderModel","timestamp":1568299021798,"sparkVersion":"2.4.4","uid":"OneHotEncoderEstimator_f1dc6e50f52e","paramMap":{"inputCols":["protocol"],"outputCols":["protocol_cat"]},"defaultParamMap":{"dropLast":true,"handleInvalid":"error"}}
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/3_VectorAssembler_ef6b7bf933ee/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/3_VectorAssembler_ef6b7bf933ee/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/3_VectorAssembler_ef6b7bf933ee/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/3_VectorAssembler_ef6b7bf933ee/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/3_VectorAssembler_ef6b7bf933ee/metadata/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/3_VectorAssembler_ef6b7bf933ee/metadata/part-00000:
--------------------------------------------------------------------------------
{"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1568299021967,"sparkVersion":"2.4.4","uid":"VectorAssembler_ef6b7bf933ee","paramMap":{"inputCols":["tot_fwd_pkts","tot_bwd_pkts","totlen_fwd_pkts","totlen_bwd_pkts","fwd_pkt_len_mean","fwd_pkt_len_std","bwd_pkt_len_mean","flow_iat_std","bwd_iat_tot","bwd_iat_min","fwd_psh_flags","fwd_urg_flags","bwd_pkts_s","fin_flag_cnt","rst_flag_cnt","psh_flag_cnt","ack_flag_cnt","urg_flag_cnt","down_up_ratio","active_mean","idle_mean","protocol_cat","flow_duration_imputed","flow_byts_s_imputed","flow_pkts_s_imputed","flow_iat_min_imputed","fwd_iat_tot_imputed","fwd_iat_min_imputed","init_fwd_win_byts_imputed","init_bwd_win_byts_imputed","fwd_seg_size_min_imputed"],"outputCol":"features"},"defaultParamMap":{"handleInvalid":"error","outputCol":"VectorAssembler_ef6b7bf933ee__output"}}
--------------------------------------------------------------------------------
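
The next two stages one-hot encode the protocol column and assemble the numeric and encoded columns into the single 'features' vector consumed by the GBTClassifier. A sketch matching the recorded inputCols/outputCols (OneHotEncoderEstimator is the Spark 2.x name; Spark 3 renamed it OneHotEncoder):

from pyspark.ml.feature import OneHotEncoderEstimator, VectorAssembler

encoder = OneHotEncoderEstimator(inputCols=['protocol'], outputCols=['protocol_cat'])

assembler = VectorAssembler(
    inputCols=['tot_fwd_pkts', 'tot_bwd_pkts', 'totlen_fwd_pkts', 'totlen_bwd_pkts',
               'fwd_pkt_len_mean', 'fwd_pkt_len_std', 'bwd_pkt_len_mean',
               'flow_iat_std', 'bwd_iat_tot', 'bwd_iat_min', 'fwd_psh_flags',
               'fwd_urg_flags', 'bwd_pkts_s', 'fin_flag_cnt', 'rst_flag_cnt',
               'psh_flag_cnt', 'ack_flag_cnt', 'urg_flag_cnt', 'down_up_ratio',
               'active_mean', 'idle_mean', 'protocol_cat',
               'flow_duration_imputed', 'flow_byts_s_imputed', 'flow_pkts_s_imputed',
               'flow_iat_min_imputed', 'fwd_iat_tot_imputed', 'fwd_iat_min_imputed',
               'init_fwd_win_byts_imputed', 'init_bwd_win_byts_imputed',
               'fwd_seg_size_min_imputed'],
    outputCol='features')
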
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/4_BinaryLabelMaker_3b174e5e0c29/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/4_BinaryLabelMaker_3b174e5e0c29/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/4_BinaryLabelMaker_3b174e5e0c29/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/4_BinaryLabelMaker_3b174e5e0c29/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/4_BinaryLabelMaker_3b174e5e0c29/metadata/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/4_BinaryLabelMaker_3b174e5e0c29/metadata/part-00000:
--------------------------------------------------------------------------------
{"class":"__main__.BinaryLabelMaker","timestamp":1568299022005,"sparkVersion":"2.4.4","uid":"BinaryLabelMaker_3b174e5e0c29","paramMap":{"inputCols":["label"],"outputCols":["label_is_attack"],"classLabel":"Benign"},"defaultParamMap":{}}
--------------------------------------------------------------------------------
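
BinaryLabelMaker, the final stage, derives the binary target from the multi-class label, with the configured classLabel ('Benign') as the negative class. A hypothetical sketch of the stage and of how the five stages would be assembled (the custom class is a guess at the notebook's implementation; value_cleaner, imputer, encoder and assembler come from the sketches above):

from pyspark.ml import Pipeline, Transformer
import pyspark.sql.functions as F


class BinaryLabelMaker(Transformer):
    """Derives 0/1 output columns: 0 for the configured benign class, 1 otherwise."""

    def __init__(self, inputCols, outputCols, classLabel):
        super(BinaryLabelMaker, self).__init__()
        self.input_cols, self.output_cols, self.class_label = inputCols, outputCols, classLabel

    def _transform(self, df):
        for in_col, out_col in zip(self.input_cols, self.output_cols):
            df = df.withColumn(out_col,
                               F.when(F.col(in_col) == self.class_label, 0).otherwise(1))
        return df


# the five stages recorded in the PipelineModel metadata, in order
pipeline = Pipeline(stages=[value_cleaner, imputer, encoder, assembler,
                            BinaryLabelMaker(inputCols=['label'],
                                             outputCols=['label_is_attack'],
                                             classLabel='Benign')])
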
/notebooks/05_anomaly_detection/img/denoising_autoencoder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/05_anomaly_detection/img/denoising_autoencoder.png
--------------------------------------------------------------------------------
/notebooks/05_anomaly_detection/img/stacked_autoencoder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/05_anomaly_detection/img/stacked_autoencoder.png
--------------------------------------------------------------------------------
/notebooks/05_anomaly_detection/img/undercomplete_autoencoder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/05_anomaly_detection/img/undercomplete_autoencoder.png
--------------------------------------------------------------------------------
/notebooks/05_anomaly_detection/models/denoising_autoencoder_model.h5:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:13f9ca921d4d76f3a745450fa844e22c2d5716440efcc22c2170f3bc0f21f179
size 13411104
--------------------------------------------------------------------------------
/notebooks/05_anomaly_detection/models/simple_autoencoder_model.h5:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:18216e715acf520b92ba511d4a27f37b90377887540a1d2b1217d46b41d7d93a
size 70464
--------------------------------------------------------------------------------
/notebooks/05_anomaly_detection/models/stacked_autoencoder_model.h5:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:db4d61f4d8ee4e9d43db255afcac4c2443aea48268ea9ea867783460cdfa065d
size 204328
--------------------------------------------------------------------------------
/notebooks/05_anomaly_detection/notebook_utils.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, average_precision_score, roc_auc_score, precision_recall_curve, \
    roc_curve
from IPython.display import display
from ml_ids.visualization import plot_confusion_matrix


def predict(model, X, y):
    # per-sample reconstruction error (MSE) of the autoencoder
    preds = model.predict(X, batch_size=8196)
    mse = np.mean(np.power(X - preds, 2), axis=1)

    return pd.DataFrame({'y_true': y, 'rec_error': mse})


def evaluate_pr_roc(pred):
    pr_auc = average_precision_score(pred.y_true, pred.rec_error)
    roc_auc = roc_auc_score(pred.y_true, pred.rec_error)
    return pr_auc, roc_auc


def plot_evaluation_curves(pred):
    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 8))

    precisions, recalls, thresholds = precision_recall_curve(pred.y_true, pred.rec_error)
    fpr, tpr, _ = roc_curve(pred.y_true, pred.rec_error)
    pr_auc, roc_auc = evaluate_pr_roc(pred)

    # plot precision / recall curve
    ax1.plot(recalls, precisions, label='auc={}'.format(pr_auc))
    ax1.set_title('Precision / Recall Curve')
    ax1.set_xlabel('Recall')
    ax1.set_ylabel('Precision')
    ax1.legend(loc='lower right')

    # plot ROC curve
    ax2.plot(fpr, tpr, label='auc={}'.format(roc_auc))
    ax2.set_title('ROC Curve')
    ax2.set_ylabel('True Positive Rate')
    ax2.set_xlabel('False Positive Rate')
    ax2.legend(loc='lower right')


def plot_pr_threshold_curves(pred, pr_plot_lim=(0, 1)):
    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 8))

    precisions, recalls, thresholds = precision_recall_curve(pred.y_true, pred.rec_error)

    # plot precision / recall for different thresholds
    ax1.plot(thresholds, precisions[:-1], label='Precision')
    ax1.plot(thresholds, recalls[:-1], label='Recall')
    ax1.set_title('Precision / Recall of different thresholds')
    ax1.set_xlabel('Threshold')
    ax1.set_ylabel('Precision / Recall')
    ax1.legend(loc='lower right')

    # same curves, zoomed in to the threshold range given by pr_plot_lim
    ax2.plot(thresholds, precisions[:-1], label='Precision')
    ax2.plot(thresholds, recalls[:-1], label='Recall')
    ax2.set_title('Precision / Recall of different thresholds')
    ax2.set_xlabel('Threshold')
    ax2.set_ylabel('Precision / Recall')
    ax2.set_xlim(pr_plot_lim)
    ax2.legend(loc='lower right')


def best_precision_for_target_recall(pred, target_recall):
    # note: despite the name, this returns the decision threshold at the point
    # where recall first drops below target_recall, not a precision value
    precisions, recalls, thresholds = precision_recall_curve(pred.y_true, pred.rec_error)
    return thresholds[np.argmin(recalls >= target_recall)]


def get_misclassifications(y, pred_binary):
    misclassifications = y[y.label_is_attack != pred_binary]

    mc_df = pd.merge(pd.DataFrame({'misclassified': misclassifications.label.value_counts()}),
                     pd.DataFrame({'total': y.label.value_counts()}),
                     how='left', left_index=True, right_index=True)
    mc_df['percent_misclassified'] = mc_df.apply(lambda x: x[0] / x[1], axis=1)
    return mc_df.sort_values('percent_misclassified', ascending=False)


def print_performance(y, pred, threshold):
    pred_binary = (pred.rec_error >= threshold).astype('int')

    print('Classification Report:')
    print('======================')
    print(classification_report(pred.y_true, pred_binary))

    print('Confusion Matrix:')
    print('=================')
    plot_confusion_matrix(pred.y_true, pred_binary, np.array(['Benign', 'Attack']), size=(5, 5))
    plt.show()

    print('Misclassifications by attack category:')
    print('======================================')
    mc_df = get_misclassifications(y, pred_binary)
    display(mc_df)


def filter_benign(X, y):
    return X[y.label_is_attack == 0]
--------------------------------------------------------------------------------
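
A sketch of how these helpers compose in the anomaly-detection workflow: score reconstruction error, inspect the curves, pick a decision threshold for a target recall, then report performance. The autoencoder, X_val and y_val names are assumed to come from the surrounding notebook:

# y_val is assumed to be a DataFrame with 'label' and 'label_is_attack' columns
pred = predict(autoencoder, X_val, y_val.label_is_attack.values)

pr_auc, roc_auc = evaluate_pr_roc(pred)
plot_evaluation_curves(pred)

threshold = best_precision_for_target_recall(pred, target_recall=0.9)
print_performance(y_val, pred, threshold)
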
/notebooks/06_dl_classifier/models/c0cb0656-558f-4311-b138-9b91ab4d1fe6.h5:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:8efc348b48af452153dec068d1367cec784ffc2930049df4eaf371d10c0d1caa
size 4651784
--------------------------------------------------------------------------------
/notebooks/06_dl_classifier/models/model_class_weight.h5:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:60e8392296780d912e8bff335bdadb81deae5b035925d2282e963d45def4ce95
size 4231072
--------------------------------------------------------------------------------
/notebooks/06_dl_classifier/models/model_no_class_weights.h5:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:e0d9dff11e5600e74974a8e6657be10f7af7d1abe7b66ea2308d9d6eea4d29eb
size 4231072
--------------------------------------------------------------------------------
/notebooks/06_dl_classifier/models/opt_model.h5:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:d941b094728d0e970231c2f40440da9bfe2c9c5f9898954064954483d210857a
size 4655880
--------------------------------------------------------------------------------
/notebooks/06_dl_classifier/notebook_utils.py:
--------------------------------------------------------------------------------
import numpy as np
import gc
from ml_ids.model_selection import split_x_y, train_val_test_split
from ml_ids.transform.sampling import upsample_minority_classes, downsample
from ml_ids.transform.preprocessing import create_pipeline
from collections import Counter


def transform_data(dataset,
                   attack_samples,
                   imputer_strategy,
                   scaler,
                   benign_samples=None,
                   random_state=None):

    cols_to_impute = dataset.columns[dataset.isna().any()].tolist()

    train_data, val_data, test_data = train_val_test_split(dataset,
                                                           val_size=0.1,
                                                           test_size=0.1,
                                                           stratify_col='label_cat',
                                                           random_state=random_state)

    if benign_samples:
        train_data = downsample(train_data, default_nr_samples=benign_samples, random_state=random_state)

    X_train_raw, y_train = split_x_y(train_data)
    X_val_raw, y_val = split_x_y(val_data)
    X_test_raw, y_test = split_x_y(test_data)

    print('Samples:')
    print('========')
    print('Training: {}'.format(X_train_raw.shape))
    print('Val: {}'.format(X_val_raw.shape))
    print('Test: {}'.format(X_test_raw.shape))

    print('\nTraining labels:')
    print('================')
    print(y_train.label.value_counts())
    print('\nValidation labels:')
    print('==================')
    print(y_val.label.value_counts())
    print('\nTest labels:')
    print('============')
    print(y_test.label.value_counts())

    del train_data, val_data, test_data
    gc.collect()

    pipeline, get_col_names = create_pipeline(X_train_raw,
                                              imputer_strategy=imputer_strategy,
                                              imputer_cols=cols_to_impute,
                                              scaler=scaler)

    X_train = pipeline.fit_transform(X_train_raw)
    X_val = pipeline.transform(X_val_raw)
    X_test = pipeline.transform(X_test_raw)

    column_names = get_col_names()

    print('Samples:')
    print('========')
    print('Training: {}'.format(X_train.shape))
    print('Val: {}'.format(X_val.shape))
    print('Test: {}'.format(X_test.shape))

    print('\nMissing values:')
    print('===============')
    print('Training: {}'.format(np.count_nonzero(np.isnan(X_train))))
    print('Val: {}'.format(np.count_nonzero(np.isnan(X_val))))
    print('Test: {}'.format(np.count_nonzero(np.isnan(X_test))))

    print('\nScaling:')
    print('========')
    print('Training: min={}, max={}'.format(np.min(X_train), np.max(X_train)))
    print('Val: min={}, max={}'.format(np.min(X_val), np.max(X_val)))
    print('Test: min={}, max={}'.format(np.min(X_test), np.max(X_test)))

    X_train, y_train = upsample_minority_classes(X_train,
                                                 y_train,
                                                 min_samples=attack_samples,
                                                 random_state=random_state)

    print('Samples:')
    print('========')
    print('Training: {}'.format(X_train.shape))

    print('\nTraining labels:')
    print('================')
    print(Counter(y_train))

    return X_train, y_train, X_val, y_val, X_test, y_test, column_names
--------------------------------------------------------------------------------
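
transform_data bundles the whole preparation sequence: stratified train/val/test split, optional downsampling of benign traffic, imputation and scaling fitted on the training split only, and upsampling of minority attack classes. A sketch of a typical call; dataset is assumed to be the DataFrame returned by ml_ids.data.dataset.load_dataset, and the sample counts are purely illustrative:

from sklearn.preprocessing import MinMaxScaler

X_train, y_train, X_val, y_val, X_test, y_test, cols = transform_data(
    dataset,
    attack_samples=100000,    # upsample each attack class to at least this many rows
    imputer_strategy='mean',
    scaler=MinMaxScaler,      # the scaler class, instantiated inside create_pipeline
    benign_samples=1000000,   # optional downsampling of the benign majority
    random_state=42)
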
/notebooks/07_binary_classifier_comparison/models/gb_835066e8-2427-48ca-a521-67195008cb91.catboost:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:ceccc696d2c5eae0d550425f772221088e7a66b26a626461642e14c2b42099ce
size 31179384
--------------------------------------------------------------------------------
/notebooks/07_binary_classifier_comparison/notebook_utils.py:
--------------------------------------------------------------------------------
import numpy as np
import gc
from ml_ids.model_selection import split_x_y, train_val_test_split
from ml_ids.transform.sampling import upsample_minority_classes, downsample
from ml_ids.transform.preprocessing import create_pipeline
from collections import Counter


def get_best_model_path(trials, model_path_var='model_path'):
    return trials.results[np.argmin(trials.losses())][model_path_var]


def print_trial_results(trials, best_run, model_path_var='model_path'):
    best_model_path = get_best_model_path(trials, model_path_var)

    print('Best validation score: {}'.format(-np.min(trials.losses())))
    print('Best model path: {}\n'.format(best_model_path))
    print('Best model parameters:')
    print('======================')
    print(best_run)


def transform_data(dataset,
                   attack_samples,
                   imputer_strategy,
                   scaler,
                   benign_samples=None,
                   random_state=None):

    cols_to_impute = dataset.columns[dataset.isna().any()].tolist()

    train_data, val_data, test_data = train_val_test_split(dataset,
                                                           val_size=0.1,
                                                           test_size=0.1,
                                                           stratify_col='label_cat',
                                                           random_state=random_state)

    if benign_samples:
        train_data = downsample(train_data, default_nr_samples=benign_samples, random_state=random_state)

    X_train_raw, y_train = split_x_y(train_data)
    X_val_raw, y_val = split_x_y(val_data)
    X_test_raw, y_test = split_x_y(test_data)

    print('Samples:')
    print('========')
    print('Training: {}'.format(X_train_raw.shape))
    print('Val: {}'.format(X_val_raw.shape))
    print('Test: {}'.format(X_test_raw.shape))

    print('\nTraining labels:')
    print('================')
    print(y_train.label.value_counts())
    print('\nValidation labels:')
    print('==================')
    print(y_val.label.value_counts())
    print('\nTest labels:')
    print('============')
    print(y_test.label.value_counts())

    del train_data, val_data, test_data
    gc.collect()

    pipeline, get_col_names = create_pipeline(X_train_raw,
                                              imputer_strategy=imputer_strategy,
                                              imputer_cols=cols_to_impute,
                                              scaler=scaler)

    X_train = pipeline.fit_transform(X_train_raw)
    X_val = pipeline.transform(X_val_raw)
    X_test = pipeline.transform(X_test_raw)

    column_names = get_col_names()

    print('Samples:')
    print('========')
    print('Training: {}'.format(X_train.shape))
    print('Val: {}'.format(X_val.shape))
    print('Test: {}'.format(X_test.shape))

    print('\nMissing values:')
    print('===============')
    print('Training: {}'.format(np.count_nonzero(np.isnan(X_train))))
    print('Val: {}'.format(np.count_nonzero(np.isnan(X_val))))
    print('Test: {}'.format(np.count_nonzero(np.isnan(X_test))))

    print('\nScaling:')
    print('========')
    print('Training: min={}, max={}'.format(np.min(X_train), np.max(X_train)))
    print('Val: min={}, max={}'.format(np.min(X_val), np.max(X_val)))
    print('Test: min={}, max={}'.format(np.min(X_test), np.max(X_test)))

    X_train, y_train = upsample_minority_classes(X_train,
                                                 y_train,
                                                 min_samples=attack_samples,
                                                 random_state=random_state)

    print('Samples:')
    print('========')
    print('Training: {}'.format(X_train.shape))

    print('\nTraining labels:')
    print('================')
    print(Counter(y_train))

    return X_train, y_train, X_val, y_val, X_test, y_test, column_names
--------------------------------------------------------------------------------
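
get_best_model_path and print_trial_results assume a hyperopt search whose objective function stored the fitted model's path under 'model_path' in each trial result. A sketch of the intended usage; objective and search_space are assumed to be defined in the surrounding notebook:

from hyperopt import Trials, fmin, tpe

trials = Trials()
# each objective result is expected to contain 'loss', 'status' and 'model_path'
best_run = fmin(objective, search_space, algo=tpe.suggest, max_evals=50, trials=trials)

print_trial_results(trials, best_run)
best_model_path = get_best_model_path(trials)
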
/project-proposal.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/project-proposal.pdf
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[aliases]
test=pytest

[mypy-numpy.*]
ignore_missing_imports = True

[mypy-pandas.*]
ignore_missing_imports = True

[mypy-matplotlib.*]
ignore_missing_imports = True

[mypy-IPython.*]
ignore_missing_imports = True

[mypy-sklearn.*]
ignore_missing_imports = True

[mypy-seaborn.*]
ignore_missing_imports = True

[mypy-tensorflow.*]
ignore_missing_imports = True

[mypy-mlflow.*]
ignore_missing_imports = True

[mypy-catboost.*]
ignore_missing_imports = True

[mypy-imblearn.*]
ignore_missing_imports = True
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from distutils.core import setup

setup(
    name='ml-ids',
    version='0.1',
    description='Machine learning based Intrusion Detection System',
    long_description='Machine learning based Intrusion Detection System',
    classifiers=[
        'Programming Language :: Python :: 3',
    ],
    url='https://github.com/cstub/ml-ids',
    author='cstub',
    author_email='stumpf.christoph@gmail.com',
    license='MIT',
    packages=['ml_ids'],
    install_requires=[
    ],
    setup_requires=['pytest-runner'],
    tests_require=['pytest']
)
--------------------------------------------------------------------------------
/tests/data/test_dataset.py:
--------------------------------------------------------------------------------
import pytest
import pandas as pd
import numpy as np
import os
from ml_ids import conf
from ml_ids.data.dataset import load_dataset


@pytest.fixture
def val_data():
    validation_data_path = os.path.join(conf.TEST_DATA_DIR, 'validation.csv')
    return pd.read_csv(validation_data_path)


def inf_value_count(df):
    return df[(df == np.inf) | (df == -np.inf)].count().sum()


def neg_value_count(df):
    numeric_cols = df.select_dtypes(include=[np.number]).columns.values
    df_num = df[numeric_cols]
    return df_num[df_num < 0].count().sum()


def nan_value_count(df):
    return df.isna().sum().sum()


def negative_value_columns(df):
    numeric_cols = df.select_dtypes(include=[np.number]).columns.values
    return [c for c in numeric_cols if df[df[c] < 0][c].count() > 0]


def test_loaded_dataset_must_not_contain_inf_values():
    df = load_dataset(conf.TEST_DATA_DIR)

    assert inf_value_count(df) == 0


def test_loaded_dataset_must_not_contain_negative_values():
    df = load_dataset(conf.TEST_DATA_DIR)

    assert neg_value_count(df) == 0


def test_loaded_dataset_must_not_contain_negative_values_except_excluded_cols():
    df = load_dataset(conf.TEST_DATA_DIR, preserve_neg_value_cols=['init_fwd_win_byts', 'init_bwd_win_byts'])

    assert neg_value_count(df) != 0
    assert set(negative_value_columns(df)) == {'init_bwd_win_byts', 'init_fwd_win_byts'}


def test_loaded_dataset_must_contain_label_category():
    df = load_dataset(conf.TEST_DATA_DIR)

    assert len(df.label_cat.value_counts()) == len(df.label.value_counts())


def test_loaded_dataset_must_contain_label_is_attack():
    df = load_dataset(conf.TEST_DATA_DIR)

    all_sample_count = len(df)
    benign_sample_count = len(df[df.label == 'Benign'])
    attack_sample_count = all_sample_count - benign_sample_count

    assert len(df[df.label_is_attack == 0]) == benign_sample_count
    assert len(df[df.label_is_attack == 1]) == attack_sample_count


def test_loaded_dataset_must_replace_invalid_value_with_nan(val_data):
    df = load_dataset(conf.TEST_DATA_DIR)

    inf_value_c = inf_value_count(val_data)
    neg_value_c = neg_value_count(val_data)

    assert (inf_value_c + neg_value_c) == nan_value_count(df)


def test_loaded_dataset_must_contain_only_specified_columns():
    df = load_dataset(conf.TEST_DATA_DIR, use_cols=['dst_port'])

    assert df.columns == ['dst_port']


def test_loaded_dataset_must_omit_specified_columns():
    df = load_dataset(conf.TEST_DATA_DIR, omit_cols=['dst_port'])

    assert 'dst_port' not in df.columns
--------------------------------------------------------------------------------
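
These tests pin down the loader's contract: infinite and (non-preserved) negative values arrive as NaN, and categorical plus binary targets are derived from the label column. A short usage sketch, with the path and column names illustrative:

from ml_ids import conf
from ml_ids.data.dataset import load_dataset

df = load_dataset(conf.TEST_DATA_DIR,
                  omit_cols=['timestamp'],
                  preserve_neg_value_cols=['init_fwd_win_byts', 'init_bwd_win_byts'])
# invalid readings are now NaN and the binary target is df.label_is_attack
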
/tests/transform/test_preprocessing.py:
--------------------------------------------------------------------------------
import pytest
import numpy as np
from numpy.testing import assert_array_equal
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler

from ml_ids import conf
from ml_ids.data.dataset import load_dataset
from ml_ids.model_selection import split_x_y
from ml_ids.transform.preprocessing import create_pipeline


@pytest.fixture
def feature_df():
    df = load_dataset(conf.TEST_DATA_DIR, omit_cols=['timestamp'])
    X, _ = split_x_y(df)
    return X


def nan_value_count(x):
    return np.count_nonzero(np.isnan(x))


def test_pipeline_must_impute_all_missing_values(feature_df):
    pipeline, _ = create_pipeline(feature_df,
                                  imputer_strategy='mean',
                                  scaler=FunctionTransformer,
                                  scaler_args={'validate': False})
    transformed = pipeline.fit_transform(feature_df)

    assert nan_value_count(feature_df.values) != 0
    assert nan_value_count(transformed) == 0


def test_pipeline_must_impute_selected_columns_only(feature_df):
    pipeline, _ = create_pipeline(feature_df,
                                  imputer_strategy='mean',
                                  imputer_cols=['flow_duration', 'flow_pkts_s'],
                                  scaler=FunctionTransformer,
                                  scaler_args={'validate': False})

    missing_vals_selected_columns = \
        nan_value_count(feature_df.flow_duration.values) + nan_value_count(feature_df.flow_pkts_s.values)

    transformed = pipeline.fit_transform(feature_df)

    assert nan_value_count(transformed) == (nan_value_count(feature_df.values) - missing_vals_selected_columns)


def test_pipeline_must_not_impute_values_if_imputer_strategy_none(feature_df):
    pipeline, get_col_names = create_pipeline(feature_df,
                                              imputer_strategy=None,
                                              scaler=FunctionTransformer,
                                              scaler_args={'validate': False})

    transformed = pipeline.fit_transform(feature_df)

    assert nan_value_count(feature_df.values) == nan_value_count(transformed)
    assert len(feature_df.columns) == len(get_col_names())


def test_pipeline_must_reorder_columns(feature_df):
    pipeline, get_col_names = create_pipeline(feature_df,
                                              imputer_strategy='mean',
                                              imputer_cols=['flow_duration', 'flow_pkts_s'],
                                              scaler=FunctionTransformer,
                                              scaler_args={'validate': False})

    _ = pipeline.fit_transform(feature_df)
    column_names = get_col_names()

    assert len(feature_df.columns) == len(column_names)
    assert_array_equal(column_names[:2], ['flow_duration', 'flow_pkts_s'])


def test_pipeline_must_impute_all_missing_values_with_mean(feature_df):
    pipeline, get_col_names = create_pipeline(feature_df,
                                              imputer_strategy='mean',
                                              scaler=FunctionTransformer,
                                              scaler_args={'validate': False})
    transformed = pipeline.fit_transform(feature_df)

    col_idx = np.where(get_col_names() == 'flow_duration')[0]
    nan_idx = np.where(np.isnan(feature_df.flow_duration.values))[0]

    assert len(nan_idx) == 10
    assert np.unique(transformed[nan_idx, col_idx]) == feature_df.flow_duration.mean()


def test_pipeline_must_impute_all_missing_values_with_median(feature_df):
    pipeline, get_col_names = create_pipeline(feature_df,
                                              imputer_strategy='median',
                                              scaler=FunctionTransformer,
                                              scaler_args={'validate': False})
    transformed = pipeline.fit_transform(feature_df)

    col_idx = np.where(get_col_names() == 'flow_duration')[0]
    nan_idx = np.where(np.isnan(feature_df.flow_duration.values))[0]

    assert len(nan_idx) == 10
    assert np.unique(transformed[nan_idx, col_idx]) == feature_df.flow_duration.median()


def test_pipeline_must_scale_all_values(feature_df):
    pipeline, _ = create_pipeline(feature_df, scaler=MinMaxScaler)
    transformed = pipeline.fit_transform(feature_df)

    assert np.min(transformed) == 0
    assert np.max(transformed) == 1


def test_pipeline_must_one_hot_encode_categorical_values(feature_df):
    nr_categories = 3
    pipeline, _ = create_pipeline(feature_df, cat_cols=['protocol'])
    transformed = pipeline.fit_transform(feature_df)

    one_hot_encoded = transformed[:, -nr_categories:]

    print(np.unique(one_hot_encoded))

    assert transformed.shape[1] == feature_df.shape[1] + (nr_categories - 1)
    assert_array_equal(np.unique(one_hot_encoded), [0., 1.])
--------------------------------------------------------------------------------
/upload.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/upload.py
--------------------------------------------------------------------------------