├── .dockerignore ├── .gitattributes ├── .github └── workflows │ ├── build.yml │ ├── deployment.yml │ └── train.yml ├── .gitignore ├── .idea ├── .gitignore ├── deployment.xml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── ml-ids.iml ├── modules.xml └── vcs.xml ├── .pylintrc ├── Makefile ├── README.md ├── data ├── README.md └── Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv ├── environment-notebook.yaml ├── environment.yaml ├── ml_ids ├── __init__.py ├── conf.py ├── data │ ├── __init__.py │ ├── dataset.py │ ├── metadata.py │ └── split_dataset.py ├── keras │ ├── __init__.py │ ├── callbacks.py │ ├── evaluation.py │ ├── metrics.py │ ├── model_selection.py │ └── prediction.py ├── libs │ └── dfencoder │ │ └── dataframe.py ├── model_selection.py ├── models │ ├── __init__.py │ └── gradient_boost │ │ ├── __init__.py │ │ ├── mlflow_wrapper.py │ │ └── train.py ├── prediction.py ├── tf_utils.py ├── transform │ ├── __init__.py │ ├── preprocessing.py │ └── sampling.py └── visualization.py ├── models └── gradient_boost │ ├── envs │ ├── local │ │ └── train.py │ └── sagemaker │ │ ├── configs │ │ ├── deploy.json │ │ ├── train-cpu.json │ │ └── train-gpu.json │ │ ├── container │ │ ├── Dockerfile │ │ └── train.py │ │ └── scripts │ │ ├── build_image.sh │ │ ├── deploy.py │ │ ├── push_image_to_ecr.sh │ │ ├── train.py │ │ └── undeploy.py │ ├── project │ ├── MLproject │ ├── conda.yaml │ └── train.py │ ├── training_params.json │ └── training_params_quick_run.json ├── notebooks ├── 01_data-cleanup │ └── data_cleanup.ipynb ├── 02_exploratory-data-analysis │ └── exploratory_data_analysis.ipynb ├── 03_ml-prototype │ ├── ml-prototype.ipynb │ └── models │ │ └── gradient_boost_model.cbm ├── 04_ml-prototype-spark │ ├── ml-prototype-spark.ipynb │ └── models │ │ ├── gb-model │ │ ├── bestModel │ │ │ ├── data │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ ├── .part-00000-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00001-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00002-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00003-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00004-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00005-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00007-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00008-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00009-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00010-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00011-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00013-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00014-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00015-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00016-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00017-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00019-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00020-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00021-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── .part-00022-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ 
│ │ │ ├── .part-00023-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc │ │ │ │ ├── _SUCCESS │ │ │ │ ├── part-00000-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00001-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00002-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00003-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00004-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00005-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00007-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00008-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00009-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00010-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00011-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00013-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00014-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00015-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00016-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00017-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00019-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00020-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00021-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ ├── part-00022-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ │ └── part-00023-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet │ │ │ ├── metadata │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ ├── .part-00000.crc │ │ │ │ ├── _SUCCESS │ │ │ │ └── part-00000 │ │ │ └── treesMetadata │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ ├── .part-00000-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00001-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00002-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00003-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00004-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00005-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00006-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00007-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00008-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00009-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00010-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00011-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00012-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00013-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00014-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00015-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00016-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00017-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── 
.part-00018-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── .part-00019-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc │ │ │ │ ├── _SUCCESS │ │ │ │ ├── part-00000-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00001-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00002-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00003-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00004-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00005-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00006-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00007-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00008-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00009-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00010-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00011-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00012-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00013-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00014-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00015-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00016-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00017-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ ├── part-00018-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ │ │ └── part-00019-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet │ │ ├── estimator │ │ │ └── metadata │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ ├── .part-00000.crc │ │ │ │ ├── _SUCCESS │ │ │ │ └── part-00000 │ │ ├── evaluator │ │ │ └── metadata │ │ │ │ ├── ._SUCCESS.crc │ │ │ │ ├── .part-00000.crc │ │ │ │ ├── _SUCCESS │ │ │ │ └── part-00000 │ │ └── metadata │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ └── pipeline-model │ │ ├── metadata │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000.crc │ │ ├── _SUCCESS │ │ └── part-00000 │ │ └── stages │ │ ├── 0_ValueCleaner_57f061a9e393 │ │ └── metadata │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ ├── 1_Imputer_3f8cf4b571a8 │ │ ├── data │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet │ │ └── metadata │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ ├── 2_OneHotEncoderEstimator_f1dc6e50f52e │ │ ├── data │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000-c909fe56-90d1-4202-a5f4-69907defba9a-c000.snappy.parquet.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000-c909fe56-90d1-4202-a5f4-69907defba9a-c000.snappy.parquet │ │ └── metadata │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ ├── 3_VectorAssembler_ef6b7bf933ee │ │ └── metadata │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ │ └── 4_BinaryLabelMaker_3b174e5e0c29 │ │ └── metadata │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000.crc │ │ ├── _SUCCESS │ │ └── part-00000 ├── 05_anomaly_detection │ ├── dl-anomaly-detection.ipynb │ ├── img │ │ ├── denoising_autoencoder.png │ │ 
├── stacked_autoencoder.png │ │ └── undercomplete_autoencoder.png │ ├── models │ │ ├── denoising_autoencoder_model.h5 │ │ ├── simple_autoencoder_model.h5 │ │ └── stacked_autoencoder_model.h5 │ └── notebook_utils.py ├── 06_dl_classifier │ ├── dl-classifier.ipynb │ ├── models │ │ ├── c0cb0656-558f-4311-b138-9b91ab4d1fe6.h5 │ │ ├── model_class_weight.h5 │ │ ├── model_no_class_weights.h5 │ │ └── opt_model.h5 │ └── notebook_utils.py └── 07_binary_classifier_comparison │ ├── binary-classifier-comparison.ipynb │ ├── models │ └── gb_835066e8-2427-48ca-a521-67195008cb91.catboost │ └── notebook_utils.py ├── project-proposal.pdf ├── setup.cfg ├── setup.py ├── tests ├── data │ └── test_dataset.py ├── transform │ └── test_preprocessing.py └── validation_data │ └── validation.csv └── upload.py /.dockerignore: -------------------------------------------------------------------------------- 1 | build/ 2 | data/ 3 | notebooks/ 4 | tests/ 5 | dataset/ -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.csv filter=lfs diff=lfs merge=lfs -text 2 | tests/validation_data/*.csv -filter=lfs -diff=lfs -merge=lfs -text 3 | *.catboost filter=lfs diff=lfs merge=lfs -text 4 | *.h5 filter=lfs diff=lfs merge=lfs -text 5 | *.cbm filter=lfs diff=lfs merge=lfs -text 6 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v1 11 | 12 | - name: Install dependencies 13 | run: | 14 | conda env create --file environment.yaml 15 | source /usr/share/miniconda/etc/profile.d/conda.sh 16 | conda activate ml-ids 17 | pip install -e . 18 | 19 | - name: Static Type Check 20 | run: | 21 | source /usr/share/miniconda/etc/profile.d/conda.sh 22 | conda activate ml-ids 23 | make typecheck 24 | 25 | - name: Code Quality Check 26 | run: | 27 | source /usr/share/miniconda/etc/profile.d/conda.sh 28 | conda activate ml-ids 29 | make lint-errors 30 | 31 | - name: Test with pytest 32 | run: | 33 | source /usr/share/miniconda/etc/profile.d/conda.sh 34 | conda activate ml-ids 35 | make test 36 | -------------------------------------------------------------------------------- /.github/workflows/deployment.yml: -------------------------------------------------------------------------------- 1 | name: Deploy Model on AWS Sagemaker 2 | 3 | on: 4 | deployment 5 | 6 | jobs: 7 | deploy: 8 | name: Deploy 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v1 14 | 15 | - name: Set Tag in Environment 16 | id: set-aws-tag 17 | run: | 18 | if [ -z "$GITHUB_REF" ] 19 | then 20 | echo "No Tag given. Workflow may only be run on tagged commits." 
21 | exit 1 22 | fi 23 | echo "::set-output name=awstag::$(echo ${GITHUB_REF:10} | sed 's/[^a-zA-Z0-9]/-/g')" 24 | 25 | - name: Set up Python 3.7 26 | uses: actions/setup-python@v1 27 | with: 28 | python-version: 3.7 29 | 30 | - name: Install Python dependencies 31 | run: | 32 | python -m pip install --upgrade pip 33 | pip install click==7.0 34 | pip install boto3==1.10.28 35 | pip install mlflow==1.4.0 36 | 37 | - name: Configure AWS credentials 38 | uses: aws-actions/configure-aws-credentials@v1 39 | with: 40 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 41 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 42 | aws-region: eu-west-1 43 | 44 | - name: Login to Amazon ECR 45 | id: login-ecr 46 | uses: aws-actions/amazon-ecr-login@v1 47 | 48 | - name: Deploy model on AWS Sagemaker 49 | id: deploy-model 50 | env: 51 | AWS_TAG: ${{ steps.set-aws-tag.outputs.awstag }} 52 | run: | 53 | make sagemaker_deploy JOB_ID="ml-ids-sagemaker-$AWS_TAG" 54 | -------------------------------------------------------------------------------- /.github/workflows/train.yml: -------------------------------------------------------------------------------- 1 | name: Train Model on AWS Sagemaker 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'm*' 7 | 8 | jobs: 9 | train: 10 | name: Train 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v1 16 | 17 | - name: Set Tag in Environment 18 | id: set-aws-tag 19 | run: | 20 | if [ -z "$GITHUB_REF" ] 21 | then 22 | echo "No Tag given. Workflow may only be run on tagged commits." 23 | exit 1 24 | fi 25 | echo "::set-output name=awstag::$(echo ${GITHUB_REF:10} | sed 's/[^a-zA-Z0-9]/-/g')" 26 | 27 | - name: Set up Python 3.7 28 | uses: actions/setup-python@v1 29 | with: 30 | python-version: 3.7 31 | 32 | - name: Install Python dependencies 33 | run: | 34 | python -m pip install --upgrade pip 35 | pip install click==7.0 36 | pip install pandas==0.25.2 37 | pip install sagemaker==1.44.3 38 | 39 | - name: Configure AWS credentials 40 | uses: aws-actions/configure-aws-credentials@v1 41 | with: 42 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 43 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 44 | aws-region: eu-west-1 45 | 46 | - name: Login to Amazon ECR 47 | id: login-ecr 48 | uses: aws-actions/amazon-ecr-login@v1 49 | 50 | - name: Build, tag, and push image to Amazon ECR 51 | id: build-image 52 | env: 53 | ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} 54 | ECR_REPOSITORY: ml-ids-train-sagemaker 55 | IMAGE_TAG: ${{ github.sha }} 56 | AWS_TAG: ${{ steps.set-aws-tag.outputs.awstag }} 57 | run: | 58 | docker build -f models/gradient_boost/envs/sagemaker/container/Dockerfile -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG .
59 | docker tag $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG $ECR_REGISTRY/$ECR_REPOSITORY:$AWS_TAG 60 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG 61 | docker push $ECR_REGISTRY/$ECR_REPOSITORY:$AWS_TAG 62 | echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" 63 | 64 | - name: Train the packaged model on AWS Sagemaker 65 | id: train-model 66 | env: 67 | IMAGE_NAME: ${{ steps.build-image.outputs.image }} 68 | AWS_TAG: ${{ steps.set-aws-tag.outputs.awstag }} 69 | run: | 70 | make sagemaker_train_aws \ 71 | SAGEMAKER_TRAIN_CONFIG_PATH=models/gradient_boost/envs/sagemaker/configs/train-gpu.json \ 72 | SAGEMAKER_IMAGE_NAME=$IMAGE_NAME \ 73 | TRAIN_PARAM_PATH=models/gradient_boost/training_params.json \ 74 | JOB_ID="ml-ids-sagemaker-$AWS_TAG" 75 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | dataset/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 
93 | #Pipfile.lock 94 | 95 | # celery beat schedule file 96 | celerybeat-schedule 97 | 98 | # SageMath parsed files 99 | *.sage.py 100 | 101 | # Environments 102 | .env 103 | .venv 104 | env/ 105 | venv/ 106 | ENV/ 107 | env.bak/ 108 | venv.bak/ 109 | 110 | # Spyder project settings 111 | .spyderproject 112 | .spyproject 113 | 114 | # Rope project settings 115 | .ropeproject 116 | 117 | # mkdocs documentation 118 | /site 119 | 120 | # mypy 121 | .mypy_cache/ 122 | .dmypy.json 123 | dmypy.json 124 | 125 | # Pyre type checker 126 | .pyre/ 127 | 128 | # Catboost 129 | catboost_info 130 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Default ignored files 3 | /workspace.xml -------------------------------------------------------------------------------- /.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 15 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/ml-ids.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 13 | 14 | 17 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SAGEMAKER_TRAIN_CONFIG_PATH=models/gradient_boost/envs/sagemaker/configs/train-gpu.json 2 | SAGEMAKER_DEPLOY_CONFIG_PATH=models/gradient_boost/envs/sagemaker/configs/deploy.json 3 | TRAIN_PARAM_PATH=models/gradient_boost/training_params.json 4 | TRAIN_PATH=dataset/train.h5 5 | VAL_PATH=dataset/val.h5 6 | TEST_PATH=dataset/test.h5 7 | 8 | clean: 9 | -rm -r -f build 10 | mkdir build 11 | 12 | test: 13 | python -m pytest tests 14 | 15 | lint: 16 | pylint ml_ids 17 | 18 | lint-errors: 19 | pylint ml_ids -E 20 | 21 | typecheck: 22 | mypy ml_ids 23 | 24 | split_dataset: 25 | mkdir -p dataset 26 | python ./ml_ids/data/split_dataset.py \ 27 | --dataset-path $(DATASET_PATH) \ 28 | --output-path dataset \ 29 | --random-seed 42 30 | 31 | train_local: 32 | python ./models/gradient_boost/envs/local/train.py \ 33 | --train-path $(TRAIN_PATH) \ 34 | --val-path $(VAL_PATH) \ 35 | --test-path $(TEST_PATH) \ 36 | --output-path build/models/gradient_boost \ 37 | --param-path $(TRAIN_PARAM_PATH) 38 | 39 | sagemaker_build_image: 40 | ./models/gradient_boost/envs/sagemaker/scripts/build_image.sh ml-ids-train-sagemaker $(TAG) 41 | 42 | sagemaker_push_image: 43 | 
./models/gradient_boost/envs/sagemaker/scripts/push_image_to_ecr.sh ml-ids-train-sagemaker $(TAG) | grep -Po '(?<=^image-name=).*' > sagemaker-image-name.txt 44 | 45 | sagemaker_train_local: 46 | python ./models/gradient_boost/envs/sagemaker/scripts/train.py \ 47 | --config-path $(SAGEMAKER_TRAIN_CONFIG_PATH) \ 48 | --param-path $(TRAIN_PARAM_PATH) \ 49 | --mode LOCAL \ 50 | --image-name "ml-ids-train-sagemaker:$(TAG)" \ 51 | --job-id "ml-ids-sagemaker-job" 52 | 53 | sagemaker_train_aws: 54 | python ./models/gradient_boost/envs/sagemaker/scripts/train.py \ 55 | --config-path $(SAGEMAKER_TRAIN_CONFIG_PATH) \ 56 | --param-path $(TRAIN_PARAM_PATH) \ 57 | --mode AWS \ 58 | --image-name $(SAGEMAKER_IMAGE_NAME) \ 59 | --job-id $(JOB_ID) 60 | 61 | sagemaker_deploy: 62 | python ./models/gradient_boost/envs/sagemaker/scripts/deploy.py \ 63 | --config-path $(SAGEMAKER_DEPLOY_CONFIG_PATH) \ 64 | --job-id $(JOB_ID) 65 | 66 | sagemaker_undeploy: 67 | python ./models/gradient_boost/envs/sagemaker/scripts/undeploy.py \ 68 | --config-path $(SAGEMAKER_DEPLOY_CONFIG_PATH) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A machine learning based approach towards building an Intrusion Detection System 2 | 3 | ## Problem Description 4 | With the rising number of network-enabled devices connected to the internet, such as mobile phones, IoT appliances or vehicles, concern about the security implications of using these devices is growing. The increase in the number and types of networked devices inevitably leads to a wider attack surface, while the impact of successful attacks becomes increasingly severe as more critical responsibilities are assumed by these devices. 5 | 6 | To identify and counter network attacks, it is common to employ a combination of multiple systems that prevent attacks from happening or detect and stop ongoing attacks if they cannot be prevented initially. 7 | These systems usually comprise an intrusion prevention system, such as a firewall, as the first layer of security, with intrusion detection systems representing the second layer. 8 | Should the intrusion prevention system be unable to prevent a network attack, it is the task of the detection system to identify the malicious network traffic, stop the ongoing attack and keep the recorded network traffic data for later analysis. This data can subsequently be used to update the prevention system so that the specific network attack can be detected in the future. The need for intrusion detection systems is rising, as absolute prevention of attacks is not possible due to the rapid emergence of new attack types. 9 | 10 | Even though intrusion detection systems are an essential part of network security, many detection systems deployed today have a significant weakness: they rely on signature-based attack classification patterns, which detect the most common known attack patterns but are unable to detect novel attack types. 11 | To overcome this limitation, research in intrusion detection systems is focusing on more dynamic approaches based on machine learning and anomaly detection methods. In these systems, normal network behaviour is learned by processing previously recorded benign data packets, which allows the system to identify new attack types by analyzing network traffic for anomalous data flows.
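As an illustration of this idea, the following minimal sketch flags flows whose autoencoder reconstruction error exceeds a threshold calibrated on benign traffic only (the model, feature matrix and percentile cut-off are hypothetical; the notebooks in this repository implement the same idea):

```python
import numpy as np

def anomaly_scores(autoencoder, X):
    """Per-flow reconstruction error: anomalous flows reconstruct poorly."""
    reconstruction = autoencoder.predict(X)
    return np.mean(np.power(X - reconstruction, 2), axis=1)

# Hypothetical usage: calibrate a threshold on benign flows, then score unseen traffic.
# threshold = np.percentile(anomaly_scores(autoencoder, X_benign), 99)
# is_attack = anomaly_scores(autoencoder, X_new) > threshold
```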
12 | 13 | This project aims to implement a classifier capable of identifying network traffic as either benign or malicious based on machine learning and deep learning methodologies. 14 | 15 | ## Data 16 | The data used to train the classifier is taken from the [CSE-CIC-IDS2018](https://www.unb.ca/cic/datasets/ids-2018.html) dataset provided by the Canadian Institute for Cybersecurity. It was created by capturing all network traffic during ten days of operation inside a controlled network environment on AWS where realistic background traffic and different attack scenarios were conducted. 17 | As a result, the dataset contains benign network traffic as well as captures of the most common network attacks. 18 | The dataset comprises the raw network captures in pcap format as well as csv files created using [CICFlowMeter-V3](https://www.unb.ca/cic/research/applications.html#CICFlowMeter), containing 80 statistical features of the individual network flows combined with their corresponding labels. 19 | A network flow is defined as an aggregation of interrelated network packets identified by the following properties: 20 | * Source IP 21 | * Destination IP 22 | * Source port 23 | * Destination port 24 | * Protocol 25 | 26 | The dataset contains approximately 16 million individual network flows and covers the following attack scenarios: 27 | * Brute Force 28 | * DoS 29 | * DDoS 30 | * Heartbleed 31 | * Web Attack 32 | * Infiltration 33 | * Botnet 34 | 35 | ## Approach 36 | The goal of this project is to create a classifier capable of categorising network flows as either benign or malicious. 37 | The problem is understood as a supervised learning problem using the labels provided in the dataset, which identify the network flows as either benign or malicious. Different approaches to classifying the data will be evaluated, formulating the problem either as a binary classification or as a multiclass classification problem; in the latter case, the individual attack classes provided in the dataset are differentiated. A relevant subset of the features provided in the dataset will be used as predictors to classify individual network flows. 38 | Machine learning methods such as k-nearest neighbours, random forests or SVMs will be applied to the problem and evaluated first in order to assess the feasibility of using traditional machine learning approaches. 39 | Subsequently, deep learning models such as convolutional neural networks, autoencoders or recurrent neural networks will be employed to create a competing classifier, as recent research has shown that deep learning methods are a promising approach in the field of anomaly detection. 40 | The results of both approaches will be compared to select the best-performing classifier. 41 | 42 | ## Deliverables 43 | The classifier will be deployed and served via a REST API in conjunction with a simple web application providing a user interface to utilize the API. 44 | 45 | The REST API will provide the following functionality: 46 | * an endpoint to submit network capture files in pcap format. Individual network flows are extracted from the capture files and analysed for malicious network traffic. 47 | * (optional) an endpoint to stream continuous network traffic captures which are analysed in near real-time, combined with 48 | * (optional) an endpoint to register a web-socket in order to get notified upon detection of malicious network traffic.
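To make the planned API concrete, a client interaction could look like the sketch below (the endpoint path and response fields are hypothetical, since the API is only outlined at this stage):

```python
import requests

# Submit a pcap capture file for analysis (hypothetical endpoint).
with open('capture.pcap', 'rb') as capture:
    response = requests.post('https://ml-ids.example.com/api/captures',
                             files={'file': capture})

# Each network flow extracted from the capture is classified individually.
for flow in response.json()['flows']:
    print(flow['flow_id'], flow['prediction'])
```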
49 | 50 | To further showcase the project, a testbed could be created against which various attack scenarios can be performed. This testbed would be connected to the streaming API for near real-time detection of malicious network traffic. 51 | 52 | ## Computational resources 53 | The requirements regarding the computational resources to train the classifiers are given below: 54 | 55 | | Category | Resource | 56 | | ------------- | ------------- | 57 | | CPU | Intel Core i7 processor | 58 | | RAM | 32 GB | 59 | | GPU | 1 GPU, 8 GB RAM | 60 | | HDD | 100 GB | 61 | 62 | 63 | ## Classifier 64 | 65 | The machine learning estimator created in this project follows a supervised approach and is trained using the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) algorithm. Employing the [CatBoost](https://catboost.ai/) library, a binary classifier is created, capable of classifying network flows as either benign or malicious. The chosen parameters of the classifier and its performance metrics can be examined in the following [notebook](https://github.com/cstub/ml-ids/blob/master/notebooks/07_binary_classifier_comparison/binary-classifier-comparison.ipynb). 66 | 67 | ## Deployment Architecture 68 | 69 | The deployment architecture of the complete ML-IDS system is explained in detail in the [system architecture](https://docs.google.com/document/d/1s_EBMTid4gdrsQU_xOCAYK1BzxkhhnYl6wHFSZo_9Tw/edit?usp=sharing). 70 | 71 | ## Model Training and Deployment 72 | 73 | The model can be trained and deployed either locally or via [Amazon SageMaker](https://aws.amazon.com/sagemaker/). 74 | In each case the [MLflow](https://www.mlflow.org/docs/latest/index.html) framework is utilized to train the model and create the model artifacts. 75 | 76 | ### Installation 77 | 78 | To install the necessary dependencies, check out the project and create a new Anaconda environment from the `environment.yaml` file. 79 | 80 | ``` 81 | conda env create -f environment.yaml 82 | ``` 83 | 84 | Afterwards, activate the environment and install the project resources. 85 | 86 | ``` 87 | conda activate ml-ids 88 | 89 | pip install -e . 90 | ``` 91 | 92 | ### Dataset Creation 93 | 94 | To create the dataset for training use the following command: 95 | 96 | ``` 97 | make split_dataset \ 98 | DATASET_PATH={path-to-source-dataset} 99 | ``` 100 | 101 | This command reads the source dataset and splits it into separate train/validation/test sets with a sample ratio of 80%/10%/10%. The specified source dataset should be a folder containing multiple `.csv` files. 102 | You can use the [CIC-IDS-2018 dataset](https://www.unb.ca/cic/datasets/ids-2018.html) provided via [Google Drive](https://drive.google.com/open?id=1HrTPh0YRSZ4T9DLa_c47lubheKUcPl0r) for this purpose. 103 | Once the command completes, a new folder `dataset` is created containing the split datasets in `.h5` format.
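Under the hood, this target runs `ml_ids/data/split_dataset.py`, which loads the csv files and performs a stratified split on the attack category. The equivalent Python steps are roughly the following sketch (the dataset path is a placeholder):

```python
from ml_ids.data.dataset import load_dataset
from ml_ids.model_selection import train_val_test_split

# Load all .csv files from the source folder into a single DataFrame.
dataset = load_dataset(dataset_path='path/to/source-dataset', transform_data=False)

# Stratify by attack category so train/val/test share the label distribution.
train, val, test = train_val_test_split(dataset,
                                        val_size=0.1,
                                        test_size=0.1,
                                        stratify_col='label_cat',
                                        random_state=42)
```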
104 | 105 | ### Local Mode 106 | 107 | To train the model in local mode, using the default parameters and dataset locations created by `split_dataset`, use the following command: 108 | 109 | ``` 110 | make train_local 111 | ``` 112 | 113 | If the datasets are stored in a different location or you want to specify different training parameters, you can optionally supply the dataset locations and a training parameter file: 114 | 115 | ``` 116 | make train_local \ 117 | TRAIN_PATH={path-to-train-dataset} \ 118 | VAL_PATH={path-to-val-dataset} \ 119 | TEST_PATH={path-to-test-dataset} \ 120 | TRAIN_PARAM_PATH={path-to-param-file} 121 | ``` 122 | 123 | Upon completion of the training process, the model artifacts can be found in the `build/models/gradient_boost` directory. 124 | 125 | To deploy the model locally the MLflow CLI can be used. 126 | 127 | ``` 128 | mlflow models serve -m build/models/gradient_boost -p 5000 129 | ``` 130 | 131 | The model can also be deployed as a Docker container using the following commands: 132 | 133 | ``` 134 | mlflow models build-docker -m build/models/gradient_boost -n ml-ids-classifier:1.0 135 | 136 | docker run -p 5001:8080 ml-ids-classifier:1.0 137 | ``` 138 | 139 | ### Amazon SageMaker 140 | 141 | To train the model on Amazon SageMaker the following command sequence is used: 142 | 143 | ``` 144 | # build a new docker container for model training 145 | make sagemaker_build_image \ 146 | TAG=1.0 147 | 148 | # upload the container to AWS ECR 149 | make sagemaker_push_image \ 150 | TAG=1.0 151 | 152 | # execute the training container on Amazon SageMaker 153 | make sagemaker_train_aws \ 154 | SAGEMAKER_IMAGE_NAME={ecr-image-name}:1.0 \ 155 | JOB_ID=ml-ids-job-0001 156 | ``` 157 | 158 | This command requires a valid AWS account with the appropriate permissions to be configured locally via the [AWS CLI](https://aws.amazon.com/cli/). Furthermore, [AWS ECR](https://aws.amazon.com/ecr/) and Amazon SageMaker must be configured for the account. 159 | 160 | Using this repository, the manual invocation of the aforementioned commands is not necessary as training on Amazon SageMaker is supported via a [GitHub workflow](https://github.com/cstub/ml-ids/blob/master/.github/workflows/train.yml) that is triggered upon creation of a new tag of the form `m*` (e.g. `m1.0`). 161 | 162 | To deploy a trained model on Amazon SageMaker a [GitHub Deployment request](https://developer.github.com/v3/repos/deployments/) using the GitHub API must be issued, specifying the tag of the model. 163 | 164 | ``` 165 | { 166 | "ref": "refs/tags/m1.0", 167 | "payload": {}, 168 | "description": "Deploy request for model version m1.0", 169 | "auto_merge": false 170 | } 171 | ``` 172 | 173 | This deployment request triggers a [GitHub workflow](https://github.com/cstub/ml-ids/blob/master/.github/workflows/deployment.yml), deploying the model to SageMaker. 174 | After successful deployment, the model is accessible via the SageMaker HTTP API. 175 | 176 | ## Using the Classifier 177 | 178 | The classifier deployed on Amazon SageMaker is not directly available publicly, but can be accessed using the [ML-IDS REST API](https://github.com/cstub/ml-ids-api).
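For AWS accounts with direct access to the SageMaker endpoint, a prediction can alternatively be requested via `boto3` (a sketch; the endpoint name is hypothetical and the payload follows the `pandas-split` format shown in the next section):

```python
import boto3

client = boto3.client('sagemaker-runtime', region_name='eu-west-1')

# 'ml-ids-endpoint' is a hypothetical endpoint name.
response = client.invoke_endpoint(
    EndpointName='ml-ids-endpoint',
    ContentType='application/json; format=pandas-split',
    Body='{"columns": ["dst_port", "..."], "data": [[80, "..."]]}')

print(response['Body'].read().decode('utf-8'))
```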
179 | 180 | ### REST API 181 | 182 | To invoke the REST API, the following command can be used to submit a prediction request for a given network flow: 183 | 184 | ``` 185 | curl -X POST \ 186 | http://ml-ids-cluster-lb-1096011980.eu-west-1.elb.amazonaws.com/api/predictions \ 187 | -H 'Accept: */*' \ 188 | -H 'Content-Type: application/json; format=pandas-split' \ 189 | -H 'Host: ml-ids-cluster-lb-1096011980.eu-west-1.elb.amazonaws.com' \ 190 | -H 'cache-control: no-cache' \ 191 | -d '{"columns":["dst_port","protocol","timestamp","flow_duration","tot_fwd_pkts","tot_bwd_pkts","totlen_fwd_pkts","totlen_bwd_pkts","fwd_pkt_len_max","fwd_pkt_len_min","fwd_pkt_len_mean","fwd_pkt_len_std","bwd_pkt_len_max","bwd_pkt_len_min","bwd_pkt_len_mean","bwd_pkt_len_std","flow_byts_s","flow_pkts_s","flow_iat_mean","flow_iat_std","flow_iat_max","flow_iat_min","fwd_iat_tot","fwd_iat_mean","fwd_iat_std","fwd_iat_max","fwd_iat_min","bwd_iat_tot","bwd_iat_mean","bwd_iat_std","bwd_iat_max","bwd_iat_min","fwd_psh_flags","bwd_psh_flags","fwd_urg_flags","bwd_urg_flags","fwd_header_len","bwd_header_len","fwd_pkts_s","bwd_pkts_s","pkt_len_min","pkt_len_max","pkt_len_mean","pkt_len_std","pkt_len_var","fin_flag_cnt","syn_flag_cnt","rst_flag_cnt","psh_flag_cnt","ack_flag_cnt","urg_flag_cnt","cwe_flag_count","ece_flag_cnt","down_up_ratio","pkt_size_avg","fwd_seg_size_avg","bwd_seg_size_avg","fwd_byts_b_avg","fwd_pkts_b_avg","fwd_blk_rate_avg","bwd_byts_b_avg","bwd_pkts_b_avg","bwd_blk_rate_avg","subflow_fwd_pkts","subflow_fwd_byts","subflow_bwd_pkts","subflow_bwd_byts","init_fwd_win_byts","init_bwd_win_byts","fwd_act_data_pkts","fwd_seg_size_min","active_mean","active_std","active_max","active_min","idle_mean","idle_std","idle_max","idle_min"],"data":[[80,17,"21\\/02\\/2018 10:15:06",119759145,75837,0,2426784,0,32,32,32.0,0.0,0,0,0.0,0.0,20263.87212,633.2460039,1579.1859130859,31767.046875,920247,1,120000000,1579.1859130859,31767.046875,920247,1,0,0.0,0.0,0,0,0,0,0,0,606696,0,633.2460327148,0.0,32,32,32.0,0.0,0.0,0,0,0,0,0,0,0,0,0,32.0004234314,32.0,0.0,0,0,0,0,0,0,75837,2426784,0,0,-1,-1,75836,8,0.0,0.0,0,0,0.0,0.0,0,0]]}' 192 | ``` 193 | 194 | ### ML-IDS API Clients 195 | 196 | For convenience, the Python clients implemented in the [ML-IDS API Clients project](https://github.com/cstub/ml-ids-api-client) can be used to submit new prediction requests to the API and receive real-time notifications on detection of malicious network flows. 197 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | ## Data 2 | 3 | The data used to train the classifiers is taken from the [CSE-CIC-IDS2018](https://www.unb.ca/cic/datasets/ids-2018.html) dataset provided by the Canadian Institute for Cybersecurity. 4 | It was created by capturing all network traffic during ten days of operation inside a controlled network environment on AWS where realistic background traffic and different attack scenarios were conducted. 5 | 6 | The dataset consists of raw network captures in pcap format as well as processed csv files created using [CICFlowMeter-V3](https://www.unb.ca/cic/research/applications.html#CICFlowMeter), containing 80 statistical features of the individual network flows combined with their corresponding labels. 7 | 8 | Due to size limitations, the data provided in this repository represents only a small portion of the dataset, in the form of processed network flows.
The full dataset consisting of the raw network captures and the processed csv files can be retrieved from AWS S3. 9 | 10 | ## Download 11 | 12 | A prerequisite to downloading the full dataset is the installation of the [AWS CLI](https://aws.amazon.com/cli/). 13 | 14 | To download the processed csv files containing the analyzed network flows (~7GB), run the following command: 15 | ```bash 16 | aws s3 sync --no-sign-request --region <your-region> "s3://cse-cic-ids2018/Processed Traffic Data for ML Algorithms/" <destination-dir> 17 | ``` 18 | To download the raw network captures in pcap format (~477GB) run: 19 | ```bash 20 | aws s3 sync --no-sign-request --region <your-region> "s3://cse-cic-ids2018/Original Network Traffic and Log data/" <destination-dir> 21 | ``` 22 | To download the full dataset containing the raw network captures and processed csv files (~484GB) use the following command: 23 | ```bash 24 | aws s3 sync --no-sign-request --region <your-region> "s3://cse-cic-ids2018/" <destination-dir> 25 | ``` 26 | 27 | ## Preprocessed Dataset 28 | 29 | The preprocessed dataset used for model training and evaluation can be found at [Google Drive](https://drive.google.com/drive/folders/1AWhRsVShJ_KvYKrV0VlnM1odtJ4Tp-uC?usp=sharing). 30 | -------------------------------------------------------------------------------- /data/Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:acff8bc61376ee031d80878ee6099e0b1a87a1bd711d8068298421418c9f8147 3 | size 358223333 4 | -------------------------------------------------------------------------------- /environment-notebook.yaml: -------------------------------------------------------------------------------- 1 | name: ml-ids-notebooks 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - catboost=0.18.1=py37_0 8 | - click=7.0=py37_0 9 | - cloudpickle=1.2.2=py_0 10 | - eli5=0.10.1=py37_1 11 | - findspark=1.3.0=py_1 12 | - imbalanced-learn=0.5.0=py_0 13 | - jupyter=1.0.0=py_2 14 | - matplotlib=3.1.1=py37_1 15 | - numpy=1.17.2=py37h95a1406_0 16 | - pandas=0.25.2=py37hb3f55d8_0 17 | - pip=19.2.3=py37_0 18 | - pyspark=2.4.4=py_0 19 | - pytest=5.2.1=py37_0 20 | - pytest-runner=5.1=py_0 21 | - python=3.7.3=h33d41f4_1 22 | - python-dateutil<2.8.1 23 | - requests<2.21.0 24 | - scikit-learn=0.21.3=py37hcdab131_0 25 | - scikit-plot=0.3.7=py_1 26 | - scipy=1.3.1=py37h921218d_2 27 | - seaborn=0.9.0=py_1 28 | - setuptools=41.6.0=py37_1 29 | - shap=0.31.0=py37hb3f55d8_0 30 | - pip: 31 | - h5py==2.10.0 32 | - hyperopt==0.2.2 33 | - keras==2.3.1 34 | - keras-applications==1.0.8 35 | - keras-preprocessing==1.1.0 36 | - tables==3.6.1 37 | - tensorboard==2.0.0 38 | - tensorflow-estimator==2.0.0 39 | - tensorflow-gpu==2.0.0 40 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: ml-ids 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - catboost=0.18.1=py37_0 8 | - click=7.0=py37_0 9 | - cloudpickle=1.2.2=py_0 10 | - imbalanced-learn=0.5.0=py_0 11 | - matplotlib=3.1.1=py37_1 12 | - mypy=0.750 13 | - numpy=1.17.2=py37h95a1406_0 14 | - pandas=0.25.2=py37hb3f55d8_0 15 | - pip=19.2.3=py37_0 16 | - pylint=2.4.4 17 | - pytest=5.2.1=py37_0 18 | - pytest-runner=5.1=py_0 19 | - python=3.7.3=h33d41f4_1 20 | - python-dateutil<2.8.1 21 | - requests<2.21.0 22 | - scikit-learn=0.21.3=py37hcdab131_0 23 | - scipy=1.3.1=py37h921218d_2 24 | -
seaborn=0.9.0=py_1 25 | - setuptools=41.6.0=py37_1 26 | - pip: 27 | - mlflow==1.4 28 | - sagemaker==1.44.3 29 | - h5py==2.10.0 30 | - hyperopt==0.2.2 31 | - keras==2.3.1 32 | - keras-applications==1.0.8 33 | - keras-preprocessing==1.1.0 34 | - tables==3.6.1 35 | - tensorflow-estimator==2.0.0 36 | - tensorflow-gpu==2.0.0 37 | -------------------------------------------------------------------------------- /ml_ids/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/ml_ids/__init__.py -------------------------------------------------------------------------------- /ml_ids/conf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Global configuration variables. 3 | """ 4 | import os 5 | 6 | ROOT_DIR = os.sep.join(os.path.dirname(os.path.abspath(__file__)).split(os.sep)[:-1]) 7 | 8 | TEST_DIR = os.path.join(ROOT_DIR, 'tests') 9 | 10 | TEST_DATA_DIR = os.path.join(TEST_DIR, 'validation_data') 11 | -------------------------------------------------------------------------------- /ml_ids/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/ml_ids/data/__init__.py -------------------------------------------------------------------------------- /ml_ids/data/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities to manipulate the CIC-IDS-2018 dataset. 3 | """ 4 | from typing import List 5 | import os 6 | import glob 7 | import numpy as np 8 | import pandas as pd 9 | import ml_ids.data.metadata as md 10 | 11 | 12 | def remove_inf_values(df: pd.DataFrame) -> pd.DataFrame: 13 | """ 14 | Replaces values of type `np.inf` and `-np.inf` in a DataFrame with `null` values. 15 | 16 | :param df: Input DataFrame. 17 | :return: The DataFrame without `np.inf` and `-np.inf` values. 18 | """ 19 | inf_columns = [c for c in df.columns if df[df[c] == np.inf][c].count() > 0] 20 | for col in inf_columns: 21 | df[col].replace([np.inf, -np.inf], np.nan, inplace=True) 22 | return df 23 | 24 | 25 | def remove_negative_values(df: pd.DataFrame, ignore_cols: List[str] = None) -> pd.DataFrame: 26 | """ 27 | Replaces negative values in a DataFrame with `null` values. 28 | 29 | :param df: Input DataFrame. 30 | :param ignore_cols: Columns to ignore. Negative values in these columns will be preserved. 31 | :return: The DataFrame without negative values. 32 | """ 33 | if ignore_cols is None: 34 | ignore_cols = [] 35 | 36 | numeric_cols = df.select_dtypes(include=[np.number]).columns.drop(ignore_cols).values 37 | 38 | columns = [c for c in numeric_cols if df[df[c] < 0][c].count() > 0] 39 | for col in columns: 40 | mask = df[col] < 0 41 | df.loc[mask, col] = np.nan 42 | return df 43 | 44 | 45 | def add_label_category_column(df: pd.DataFrame) -> pd.DataFrame: 46 | """ 47 | Adds the column `label_cat` to the DataFrame specifying the category of the label. 48 | 49 | :param df: Input DataFrame. 50 | :return: The DataFrame containing a new column `label_cat`.
51 | """ 52 | df[md.COLUMN_LABEL_CAT] = df.label.apply(lambda l: md.LABEL_CAT_MAPPING[l]) 53 | return df 54 | 55 | 56 | def add_label_is_attack_columns(df: pd.DataFrame) -> pd.DataFrame: 57 | """ 58 | Adds the column `label_is_attack` to the DataFrame containing a binary indicator specifying if a row is of category 59 | `benign = 0` or `attack = 1`. 60 | 61 | :param df: Input DataFrame. 62 | :return: The DataFrame containing a new column `label_is_attack`. 63 | """ 64 | df[md.COLUMN_LABEL_IS_ATTACK] = df.label.apply(lambda l: 0 if l == md.LABEL_BENIGN else 1) 65 | return df 66 | 67 | 68 | def load_dataset_generic(load_df_fn, 69 | dataset_path: str, 70 | use_cols: List[str] = None, 71 | omit_cols: List[str] = None, 72 | preserve_neg_value_cols: list = None, 73 | transform_data: bool = True) -> pd.DataFrame: 74 | """ 75 | Loads the dataset from the given path using the supplied function. 76 | All invalid values (`np.inf`, `-np.inf`, negative) are removed and replaced with `null` for easy imputation. 77 | Negative values of columns specified in `preserve_neg_value_cols` will be preserved. 78 | 79 | :param load_df_fn: Function used to load the dataset. 80 | :param dataset_path: Path of the base directory containing all files of the dataset. 81 | :param use_cols: Columns to load. 82 | :param omit_cols: Columns to omit. 83 | :param nrows: Number of rows to load per file. 84 | :param transform_data: Indicates if data should be manipulated (removal of invalid and negative values). 85 | :param preserve_neg_value_cols: Columns in which negative values are preserved. 86 | :return: The dataset as a DataFrame. 87 | """ 88 | cols = None 89 | if use_cols: 90 | cols = use_cols 91 | if omit_cols: 92 | cols = [c for c in md.COLUMN_DTYPES.keys() if c not in omit_cols] 93 | 94 | df = load_df_fn(dataset_path, cols) 95 | 96 | if transform_data: 97 | df = remove_inf_values(df) 98 | df = remove_negative_values(df, preserve_neg_value_cols) 99 | 100 | if md.COLUMN_LABEL in df.columns: 101 | df = add_label_category_column(df) 102 | df = add_label_is_attack_columns(df) 103 | 104 | return df 105 | 106 | 107 | def load_dataset(dataset_path: str, 108 | use_cols: List[str] = None, 109 | omit_cols: List[str] = None, 110 | nrows: int = None, 111 | transform_data: bool = True, 112 | preserve_neg_value_cols: list = None) -> pd.DataFrame: 113 | """ 114 | Loads the dataset in CSV format from the given path. 115 | All invalid values (`np.inf`, `-np.inf`, negative) are removed and replaced with `null` for easy imputation. 116 | Negative values of columns specified in `preserve_neg_value_cols` will be preserved. 117 | 118 | :param dataset_path: Path of the base directory containing all files of the dataset. 119 | :param use_cols: Columns to load. 120 | :param omit_cols: Columns to omit. 121 | :param nrows: Number of rows to load per file. 122 | :param transform_data: Indicates if data should be manipulated (removal of invalid and negative values). 123 | :param preserve_neg_value_cols: Columns in which negative values are preserved. 124 | :return: The dataset as a DataFrame. 
125 | """ 126 | 127 | def load_csv(path, cols): 128 | files = glob.glob(os.path.join(path, '*.csv')) 129 | return pd.concat([pd.read_csv(f, dtype=md.COLUMN_DTYPES, usecols=cols, nrows=nrows) for f in files]) 130 | 131 | return load_dataset_generic(load_df_fn=load_csv, 132 | dataset_path=dataset_path, 133 | use_cols=use_cols, 134 | omit_cols=omit_cols, 135 | preserve_neg_value_cols=preserve_neg_value_cols, 136 | transform_data=transform_data) 137 | 138 | 139 | def load_dataset_hdf(dataset_path: str, 140 | use_cols: List[str] = None, 141 | omit_cols: List[str] = None, 142 | preserve_neg_value_cols: list = None, 143 | transform_data: bool = True, 144 | key: str = None) -> pd.DataFrame: 145 | """ 146 | Loads the dataset stored as a HDF file from the given path. 147 | All invalid values (`np.inf`, `-np.inf`, negative) are removed and replaced with `null` for easy imputation. 148 | Negative values of columns specified in `preserve_neg_value_cols` will be preserved. 149 | 150 | :param dataset_path: Path of the base directory containing all files of the dataset. 151 | :param use_cols: Columns to load. 152 | :param omit_cols: Columns to omit. 153 | :param preserve_neg_value_cols: Columns in which negative values are preserved. 154 | :param transform_data: Indicates if data should be manipulated (removal of invalid and negative values). 155 | :param key: Group identifier in the HDF store. 156 | :return: The dataset as a DataFrame. 157 | """ 158 | 159 | def load_hdf(path, cols): 160 | return pd.read_hdf(path, key=key, columns=cols) 161 | 162 | return load_dataset_generic(load_df_fn=load_hdf, 163 | dataset_path=dataset_path, 164 | use_cols=use_cols, 165 | omit_cols=omit_cols, 166 | preserve_neg_value_cols=preserve_neg_value_cols, 167 | transform_data=transform_data) 168 | -------------------------------------------------------------------------------- /ml_ids/data/metadata.py: -------------------------------------------------------------------------------- 1 | """ 2 | Metadata of the CIC-IDS-2018 dataset. 
3 | """ 4 | COLUMN_DTYPES = { 5 | 'dst_port': 'uint32', 6 | 'protocol': 'uint8', 7 | 'timestamp': 'object', 8 | 'flow_duration': 'int64', 9 | 'tot_fwd_pkts': 'uint32', 10 | 'tot_bwd_pkts': 'uint32', 11 | 'totlen_fwd_pkts': 'uint32', 12 | 'totlen_bwd_pkts': 'uint32', 13 | 'fwd_pkt_len_max': 'uint16', 14 | 'fwd_pkt_len_min': 'uint16', 15 | 'fwd_pkt_len_mean': 'float32', 16 | 'fwd_pkt_len_std': 'float32', 17 | 'bwd_pkt_len_max': 'uint16', 18 | 'bwd_pkt_len_min': 'uint16', 19 | 'bwd_pkt_len_mean': 'float32', 20 | 'bwd_pkt_len_std': 'float32', 21 | 'flow_byts_s': 'float64', 22 | 'flow_pkts_s': 'float64', 23 | 'flow_iat_mean': 'float32', 24 | 'flow_iat_std': 'float32', 25 | 'flow_iat_max': 'int64', 26 | 'flow_iat_min': 'int64', 27 | 'fwd_iat_tot': 'int64', 28 | 'fwd_iat_mean': 'float32', 29 | 'fwd_iat_std': 'float32', 30 | 'fwd_iat_max': 'int64', 31 | 'fwd_iat_min': 'int64', 32 | 'bwd_iat_tot': 'uint32', 33 | 'bwd_iat_mean': 'float32', 34 | 'bwd_iat_std': 'float32', 35 | 'bwd_iat_max': 'uint32', 36 | 'bwd_iat_min': 'uint32', 37 | 'fwd_psh_flags': 'uint8', 38 | 'bwd_psh_flags': 'uint8', 39 | 'fwd_urg_flags': 'uint8', 40 | 'bwd_urg_flags': 'uint8', 41 | 'fwd_header_len': 'uint32', 42 | 'bwd_header_len': 'uint32', 43 | 'fwd_pkts_s': 'float32', 44 | 'bwd_pkts_s': 'float32', 45 | 'pkt_len_min': 'uint16', 46 | 'pkt_len_max': 'uint16', 47 | 'pkt_len_mean': 'float32', 48 | 'pkt_len_std': 'float32', 49 | 'pkt_len_var': 'float32', 50 | 'fin_flag_cnt': 'uint8', 51 | 'syn_flag_cnt': 'uint8', 52 | 'rst_flag_cnt': 'uint8', 53 | 'psh_flag_cnt': 'uint8', 54 | 'ack_flag_cnt': 'uint8', 55 | 'urg_flag_cnt': 'uint8', 56 | 'cwe_flag_count': 'uint8', 57 | 'ece_flag_cnt': 'uint8', 58 | 'down_up_ratio': 'uint16', 59 | 'pkt_size_avg': 'float32', 60 | 'fwd_seg_size_avg': 'float32', 61 | 'bwd_seg_size_avg': 'float32', 62 | 'fwd_byts_b_avg': 'uint8', 63 | 'fwd_pkts_b_avg': 'uint8', 64 | 'fwd_blk_rate_avg': 'uint8', 65 | 'bwd_byts_b_avg': 'uint8', 66 | 'bwd_pkts_b_avg': 'uint8', 67 | 'bwd_blk_rate_avg': 'uint8', 68 | 'subflow_fwd_pkts': 'uint32', 69 | 'subflow_fwd_byts': 'uint32', 70 | 'subflow_bwd_pkts': 'uint32', 71 | 'subflow_bwd_byts': 'uint32', 72 | 'init_fwd_win_byts': 'int32', 73 | 'init_bwd_win_byts': 'int32', 74 | 'fwd_act_data_pkts': 'uint32', 75 | 'fwd_seg_size_min': 'uint8', 76 | 'active_mean': 'float32', 77 | 'active_std': 'float32', 78 | 'active_max': 'uint32', 79 | 'active_min': 'uint32', 80 | 'idle_mean': 'float32', 81 | 'idle_std': 'float32', 82 | 'idle_max': 'uint64', 83 | 'idle_min': 'uint64', 84 | 'label': 'category' 85 | } 86 | 87 | LABEL_BENIGN = 'Benign' 88 | 89 | LABEL_CAT_MAPPING = { 90 | 'Benign': 0, 91 | 'Bot': 1, 92 | 'Brute Force -Web': 2, 93 | 'Brute Force -XSS': 3, 94 | 'DoS attacks-GoldenEye': 4, 95 | 'DoS attacks-Hulk': 5, 96 | 'DoS attacks-SlowHTTPTest': 6, 97 | 'DoS attacks-Slowloris': 7, 98 | 'DDOS attack-HOIC': 8, 99 | 'DDOS attack-LOIC-UDP': 9, 100 | 'DDoS attacks-LOIC-HTTP': 10, 101 | 'FTP-BruteForce': 11, 102 | 'Infilteration': 12, 103 | 'SQL Injection': 13, 104 | 'SSH-Bruteforce': 14, 105 | 'DDOS LOIT': 15, 106 | 'Heartbleed': 16, 107 | 'PortScan': 17 108 | } 109 | 110 | FEATURES_NO_VARIANCE = [ 111 | "bwd_blk_rate_avg", 112 | "bwd_byts_b_avg", 113 | "bwd_pkts_b_avg", 114 | "bwd_psh_flags", 115 | "bwd_urg_flags", 116 | "fwd_blk_rate_avg", 117 | "fwd_byts_b_avg", 118 | "fwd_pkts_b_avg" 119 | ] 120 | 121 | FEATURES_TO_IGNORE = [ 122 | 'timestamp', 123 | 'dst_port', 124 | 'protocol' 125 | ] 126 | 127 | FEATURES_PRESERVE_NEG_COLUMNS = [ 128 | 'init_fwd_win_byts', 129 | 
'init_bwd_win_byts' 130 | ] 131 | 132 | COLUMN_LABEL = 'label' 133 | COLUMN_LABEL_CAT = 'label_cat' 134 | COLUMN_LABEL_IS_ATTACK = 'label_is_attack' 135 | -------------------------------------------------------------------------------- /ml_ids/data/split_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | CLI to split a single dataset into train/val/test sub-datasets. 3 | """ 4 | import os 5 | import sys 6 | import logging 7 | import click 8 | import pandas as pd 9 | import ml_ids.data.metadata as md 10 | from ml_ids.data.dataset import load_dataset 11 | from ml_ids.model_selection import train_val_test_split 12 | 13 | logging.basicConfig( 14 | format='[%(asctime)s|%(module)s.py|%(levelname)s] %(message)s', 15 | datefmt='%H:%M:%S', 16 | level=logging.INFO, 17 | stream=sys.stdout 18 | ) 19 | 20 | 21 | @click.command() 22 | @click.option('--dataset-path', type=click.Path(exists=True), required=True, 23 | help='Path to the input dataset in .csv format. Can be a folder containing multiple files.') 24 | @click.option('--output-path', type=click.Path(exists=True), required=True, 25 | help='Path to store the output datasets.') 26 | @click.option('--val-size', type=click.FloatRange(0, 1), default=0.1, 27 | help='Fraction of the data used for the validation set.') 28 | @click.option('--test-size', type=click.FloatRange(0, 1), default=0.1, 29 | help='Fraction of the data used for the test set.') 30 | @click.option('--nrows', type=int, 31 | help='Number of rows to load per input file.') 32 | @click.option('--random-seed', type=int, 33 | help='Random seed.') 34 | def split_dataset(dataset_path, output_path, val_size, test_size, nrows, random_seed): 35 | """ 36 | Runs the CLI. 37 | """ 38 | logging.info('Loading dataset from "%s"...', dataset_path) 39 | 40 | dataset = load_dataset(dataset_path=dataset_path, transform_data=False, nrows=nrows) 41 | 42 | train, val, test = train_val_test_split(dataset, 43 | val_size=val_size, 44 | test_size=test_size, 45 | stratify_col=md.COLUMN_LABEL_CAT, 46 | random_state=random_seed) 47 | 48 | train = remove_extra_labels(train) 49 | val = remove_extra_labels(val) 50 | test = remove_extra_labels(test) 51 | 52 | save_dataset(train, output_path, 'train') 53 | save_dataset(val, output_path, 'val') 54 | save_dataset(test, output_path, 'test') 55 | logging.info('Processing complete.') 56 | 57 | 58 | def remove_extra_labels(dataset: pd.DataFrame): 59 | """ 60 | Removes unused target labels. 61 | :param dataset: Input dataset as Pandas DataFrame. 62 | :return: Dataset without unused target labels. 63 | """ 64 | return dataset.drop(columns=[md.COLUMN_LABEL_CAT, md.COLUMN_LABEL_IS_ATTACK]) 65 | 66 | 67 | def save_dataset(dataset: pd.DataFrame, path: str, ds_type: str): 68 | """ 69 | Stores the given dataset in HDF format at the specified path. 70 | 71 | :param dataset: Dataset as Pandas DataFrame. 72 | :param path: Target path to store the dataset. 73 | :param ds_type: Dataset type.
74 | :return: None 75 | """ 76 | file_path = os.path.join(path, '{}.h5'.format(ds_type)) 77 | 78 | logging.info('Storing dataset "%s" of size %d to "%s"', ds_type, len(dataset), file_path) 79 | 80 | dataset.to_hdf(file_path, 'ids_data', format='t', complevel=5, complib='zlib') 81 | 82 | 83 | if __name__ == '__main__': 84 | # pylint: disable=no-value-for-parameter 85 | split_dataset() 86 | -------------------------------------------------------------------------------- /ml_ids/keras/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/ml_ids/keras/__init__.py -------------------------------------------------------------------------------- /ml_ids/keras/callbacks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Custom callbacks for Keras models. 3 | """ 4 | # pylint: disable=import-error 5 | from tensorflow import keras 6 | from tensorflow.keras import callbacks 7 | 8 | K = keras.backend 9 | 10 | 11 | class OneCycleScheduler(callbacks.Callback): 12 | """ 13 | Keras callback implementing a one-cycle learning-rate scheduler. 14 | Provided by https://github.com/ageron/handson-ml2/blob/master/11_training_deep_neural_networks.ipynb. 15 | """ 16 | def __init__(self, iterations, max_rate, start_rate=None, 17 | last_iterations=None, last_rate=None): 18 | self.iterations = iterations 19 | self.max_rate = max_rate 20 | self.start_rate = start_rate or max_rate / 10 21 | self.last_iterations = last_iterations or iterations // 10 + 1 22 | self.half_iteration = (iterations - self.last_iterations) // 2 23 | self.last_rate = last_rate or self.start_rate / 1000 24 | self.iteration = 0 25 | 26 | def _interpolate(self, iter1, iter2, rate1, rate2): 27 | return ((rate2 - rate1) * (iter2 - self.iteration) 28 | / (iter2 - iter1) + rate1) 29 | 30 | def on_batch_begin(self, batch, logs): 31 | if self.iteration < self.half_iteration: 32 | rate = self._interpolate(0, self.half_iteration, self.start_rate, self.max_rate) 33 | elif self.iteration < 2 * self.half_iteration: 34 | rate = self._interpolate(self.half_iteration, 2 * self.half_iteration, 35 | self.max_rate, self.start_rate) 36 | else: 37 | rate = self._interpolate(2 * self.half_iteration, self.iterations, 38 | self.start_rate, self.last_rate) 39 | rate = max(rate, self.last_rate) 40 | self.iteration += 1 41 | K.set_value(self.model.optimizer.lr, rate) 42 | -------------------------------------------------------------------------------- /ml_ids/keras/evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions to evaluate Keras models. 3 | """ 4 | PREDICT_BATCH_SIZE = 16384 5 | 6 | 7 | def evaluate_model(model, X_train, y_train, X_val, y_val, metric_title): 8 | """ 9 | Prints the performance metrics of a Keras model by invoking the `evaluate` function of the model on the training 10 | and validation dataset. 11 | 12 | :param model: Keras model. 13 | :param X_train: Predictor variables of the training dataset. 14 | :param y_train: Target labels of the training dataset. 15 | :param X_val: Predictor variables of the validation dataset. 16 | :param y_val: Target labels of the validation dataset. 17 | :param metric_title: Title of the metrics. 
18 |     :return: None
19 |     """
20 |     print('Evaluation:')
21 |     print('===========')
22 |     print('       {}'.format(metric_title))
23 |     print('Train: {}'.format(model.evaluate(X_train, y_train, batch_size=PREDICT_BATCH_SIZE, verbose=0)))
24 |     print('Val:   {}'.format(model.evaluate(X_val, y_val, batch_size=PREDICT_BATCH_SIZE, verbose=0)))
25 | 
--------------------------------------------------------------------------------
/ml_ids/keras/metrics.py:
--------------------------------------------------------------------------------
1 | """
2 | Utilities to create custom metrics for Keras models.
3 | """
4 | # pylint: disable=import-error
5 | import gc
6 | import numpy as np
7 | from tensorflow import keras
8 | from tensorflow.keras import callbacks
9 | from sklearn.metrics import average_precision_score
10 | 
11 | K = keras.backend
12 | 
13 | 
14 | class AveragePrecisionScoreMetric(callbacks.Callback):
15 |     """
16 |     Keras callback calculating the average precision score for a given validation dataset using the
17 |     `average_precision_score` metric from Scikit-learn.
18 |     """
19 |     def __init__(self, X_val, y_val, batch_size=4096):
20 |         super(AveragePrecisionScoreMetric, self).__init__()
21 |         self.X_val = X_val
22 |         self.y_val = y_val
23 |         self.batch_size = batch_size
24 | 
25 |     def get_precision_score(self):
26 |         """
27 |         Calculates the average precision score using scikit-learn.
28 |         """
29 |         preds = self.model.predict(self.X_val, batch_size=self.batch_size)
30 |         # reduces memory consumption caused by a memory leak in `model.predict()` of Tensorflow 2
31 |         # https://github.com/tensorflow/tensorflow/issues/33009
32 |         gc.collect()
33 |         mse = np.mean(np.power(self.X_val - preds, 2), axis=1)
34 |         return average_precision_score(self.y_val, mse)
35 | 
36 |     def on_epoch_end(self, epoch, logs):
37 |         """
38 |         Invoked after each training epoch.
39 |         """
40 |         auprc = self.get_precision_score()
41 |         logs['val_auprc'] = auprc
42 |         print(' - val_auprc: {0:.4f}'.format(auprc))
--------------------------------------------------------------------------------
/ml_ids/keras/model_selection.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility functions for model selection of Keras models.
3 | """
4 | import gc
5 | from typing import Tuple
6 | import numpy as np
7 | from sklearn.model_selection import StratifiedKFold
8 | from tensorflow import keras
9 | 
10 | 
11 | def cross_val_train(fit_fn,
12 |                     X: np.ndarray,
13 |                     y: np.ndarray,
14 |                     target_transform_fn=lambda x: x,
15 |                     target_stratify_fn=lambda x: x,
16 |                     n_splits: int = 3,
17 |                     fit_args: dict = None,
18 |                     random_state: int = None) -> Tuple[np.ndarray, np.ndarray, list]:
19 |     """
20 |     Performs stratified cross-validation for a Keras model using the provided fit function.
21 | 
22 |     :param fit_fn: The function used to fit a model with a given split of the train and test set. Must return a fitted
23 |         Keras model with its history.
24 |     :param X: Predictor variables.
25 |     :param y: Labels.
26 |     :param target_transform_fn: Function to transform the target labels (e.g. one-hot encoding).
27 |     :param target_stratify_fn: Function to extract the target label to stratify by.
28 |     :param n_splits: Number of cross-validation splits.
29 |     :param fit_args: Arguments to pass to the fit function.
30 |     :param random_state: Random state.
31 |     :return: A triple containing the cross-validation predictions, the true values and a list of history-objects.
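
    Example (illustrative sketch; `build_model` is a hypothetical factory
    returning a compiled Keras model and is not part of this module):

        def fit_fn(X_train, y_train, X_val, y_val, args, is_first_fold):
            model = build_model()
            hist = model.fit(X_train, y_train, validation_data=(X_val, y_val), **args)
            return model, hist

        predictions, y_true, histories = cross_val_train(fit_fn, X, y, n_splits=5,
                                                         fit_args={'epochs': 3})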
32 | """ 33 | if fit_args is None: 34 | fit_args = {} 35 | 36 | kfold = StratifiedKFold(n_splits=n_splits, random_state=random_state) 37 | 38 | cv_predictions = None 39 | cv_y_true = None 40 | hists = [] 41 | fold = 1 42 | 43 | for train_index, val_index in kfold.split(X, target_stratify_fn(y)): 44 | print('\nFold {}/{}:'.format(fold, n_splits)) 45 | print('==========') 46 | 47 | X_train, X_val = X[train_index], X[val_index] 48 | y_train, y_val = y[train_index], y[val_index] 49 | 50 | y_train_ = target_transform_fn(y_train) 51 | y_val_ = target_transform_fn(y_val) 52 | 53 | keras.backend.clear_session() 54 | gc.collect() 55 | 56 | model, hist = fit_fn(X_train, y_train_, X_val, y_val_, fit_args, (fold == 1)) 57 | 58 | if isinstance(hist, list): 59 | hists.extend(hist) 60 | else: 61 | hists.append(hist) 62 | 63 | if cv_predictions is not None: 64 | cv_predictions = np.append(cv_predictions, model.predict(X_val), axis=0) 65 | else: 66 | cv_predictions = model.predict(X_val) 67 | 68 | if cv_y_true is not None: 69 | cv_y_true = np.append(cv_y_true, y_val, axis=0) 70 | else: 71 | cv_y_true = y_val 72 | 73 | fold = fold + 1 74 | 75 | return cv_predictions, cv_y_true, hists 76 | -------------------------------------------------------------------------------- /ml_ids/keras/prediction.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions to create predictions using Keras models. 3 | """ 4 | PREDICT_BATCH_SIZE = 16384 5 | 6 | 7 | def predict(model, X, decision_boundary=0.5): 8 | """ 9 | Performs predictions for a binary classification task given a Keras model and a decision boundary. 10 | If the probability of a sample belonging to the positive class exceeds the decision boundary the positive label 11 | is assigned to the sample, otherwise the negative label is used. 12 | 13 | :param model: Keras model. 14 | :param X: Dataset containing samples. 15 | :param decision_boundary: Decision boundary used to assign predictions to the positive class. 16 | :return: numpy array containing the binary predictions as one of the values {0, 1}. 17 | """ 18 | pred = model.predict(X, batch_size=PREDICT_BATCH_SIZE) 19 | return (pred >= decision_boundary).astype('int').reshape(-1) 20 | 21 | 22 | def predict_proba(model, X): 23 | """ 24 | Performs predictions for a binary classification task given a Keras model. 25 | This function returns the class probability of the positive class. 26 | 27 | :param model: Keras model. 28 | :param X: Dataset containing samples. 29 | :return: numpy array containing the class probabilities of the positive class. 30 | """ 31 | return model.predict(X, batch_size=PREDICT_BATCH_SIZE).reshape(-1) 32 | -------------------------------------------------------------------------------- /ml_ids/libs/dfencoder/dataframe.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, Michael Klear. 2 | # All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions are 6 | # met: 7 | # 8 | # * Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # 11 | # * Redistributions in binary form must reproduce the above 12 | # copyright notice, this list of conditions and the following 13 | # disclaimer in the documentation and/or other materials provided 14 | # with the distribution. 
15 | #
16 | # * Neither the name of the dfencoder Developers nor the names of any
17 | #   contributors may be used to endorse or promote products derived
18 | #   from this software without specific prior written permission.
19 | #
20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | 
32 | import pandas as pd
33 | import numpy as np
34 | 
35 | 
36 | class EncoderDataFrame(pd.DataFrame):
37 |     def __init__(self, *args, **kwargs):
38 |         super(EncoderDataFrame, self).__init__(*args, **kwargs)
39 | 
40 |     def swap(self, likelihood=.15):
41 |         """
42 |         Performs random swapping of data.
43 |         Each value has a likelihood of *argument likelihood*
44 |         of being randomly replaced with a value from a different
45 |         row.
46 |         Returns a copy of the dataframe with equal size.
47 |         """
48 | 
49 |         # select values to swap
50 |         tot_rows = self.__len__()
51 |         n_rows = int(round(tot_rows * likelihood))
52 |         n_cols = len(self.columns)
53 | 
54 |         def gen_indices():
55 |             column = np.repeat(np.arange(n_cols).reshape(1, -1), repeats=n_rows, axis=0)
56 |             row = np.random.randint(0, tot_rows, size=(n_rows, n_cols))
57 |             return row, column
58 | 
59 |         row, column = gen_indices()
60 |         new_mat = self.values
61 |         to_place = new_mat[row, column]
62 | 
63 |         row, column = gen_indices()
64 |         new_mat[row, column] = to_place
65 | 
66 |         dtypes = {col: typ for col, typ in zip(self.columns, self.dtypes)}
67 |         result = EncoderDataFrame(columns=self.columns, data=new_mat)
68 |         result = result.astype(dtypes, copy=False)
69 | 
70 |         return result
--------------------------------------------------------------------------------
/ml_ids/model_selection.py:
--------------------------------------------------------------------------------
1 | """
2 | Utilities for machine learning model selection.
3 | """
4 | from typing import Tuple, List
5 | import numpy as np
6 | import pandas as pd
7 | from sklearn.model_selection import train_test_split
8 | from sklearn.metrics import precision_recall_curve
9 | 
10 | 
11 | def train_val_test_split(df: pd.DataFrame,
12 |                          val_size: float = 0.1,
13 |                          test_size: float = 0.1,
14 |                          stratify_col: str = None,
15 |                          random_state: int = None) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
16 |     """
17 |     Splits the given DataFrame into three parts used for:
18 |     - training
19 |     - validation
20 |     - test
21 | 
22 |     :param df: Input DataFrame.
23 |     :param val_size: Size of validation set.
24 |     :param test_size: Size of test set.
25 |     :param stratify_col: Column to stratify.
26 |     :param random_state: Random state.
27 |     :return: A triple containing (`train`, `val`, `test`) sets.
28 |     """
29 |     assert (val_size + test_size) < 1, 'Sum of validation and test size must be < 1.'
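
    # Two-stage split: first a holdout of relative size (val_size + test_size)
    # is separated from the training data, then the holdout is split again so
    # that test_size / (val_size + test_size) of it becomes the test set.
    # E.g. val_size=0.1 and test_size=0.1 yield a 0.2 holdout, half of which
    # becomes the test set.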
30 | 
31 |     df_stratify = df[stratify_col] if stratify_col else None
32 |     df_train, df_hold = train_test_split(df,
33 |                                          test_size=(val_size + test_size),
34 |                                          stratify=df_stratify,
35 |                                          random_state=random_state)
36 | 
37 |     df_hold_stratify = df_hold[stratify_col] if stratify_col else None
38 |     df_val, df_test = train_test_split(df_hold,
39 |                                        test_size=test_size / (val_size + test_size),
40 |                                        stratify=df_hold_stratify,
41 |                                        random_state=random_state)
42 | 
43 |     return df_train, df_val, df_test
44 | 
45 | 
46 | def split_x_y(df: pd.DataFrame, y_cols: List[str] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
47 |     """
48 |     Splits the given DataFrame into a DataFrame `X` containing the predictor variables and a DataFrame `y`
49 |     containing the labels.
50 | 
51 |     :param df: Input DataFrame.
52 |     :param y_cols: Columns to use in the labels DataFrame `y`.
53 |     :return: A tuple containing the DataFrames (`X`, `y`).
54 |     """
55 |     if y_cols is None:
56 |         y_cols = ['label', 'label_cat', 'label_is_attack']
57 |     return df.drop(columns=y_cols), df[y_cols]
58 | 
59 | 
60 | def best_precision_for_target_recall(y_true, y_pred_score, target_recall):
61 |     """
62 |     Determines the decision boundary for the best precision given a specified target recall by using
63 |     the precision-recall curve.
64 | 
65 |     :param y_true: True labels.
66 |     :param y_pred_score: Predicted scores of the positive class (e.g. probabilities).
67 |     :param target_recall: Target recall.
68 |     :return: Decision boundary.
69 |     """
70 |     _, recalls, thresholds = precision_recall_curve(y_true, y_pred_score)
71 |     return thresholds[np.argmin(recalls >= target_recall)]
--------------------------------------------------------------------------------
/ml_ids/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/ml_ids/models/__init__.py
--------------------------------------------------------------------------------
/ml_ids/models/gradient_boost/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/ml_ids/models/gradient_boost/__init__.py
--------------------------------------------------------------------------------
/ml_ids/models/gradient_boost/mlflow_wrapper.py:
--------------------------------------------------------------------------------
1 | """
2 | Wrapper to enable usage of a CatBoost estimator with MLflow.
3 | """
4 | import pickle
5 | import mlflow.pyfunc
6 | from catboost import CatBoostClassifier
7 | from ml_ids.data.dataset import remove_negative_values, remove_inf_values
8 | 
9 | 
10 | class CatBoostWrapper(mlflow.pyfunc.PythonModel):
11 |     """
12 |     MLflow wrapper for CatBoost estimators.
13 |     """
14 | 
15 |     def load_context(self, context):
16 |         # pylint: disable=attribute-defined-outside-init
17 |         with open(context.artifacts['pipeline'], 'rb') as f:
18 |             self.pipeline = pickle.load(f)
19 | 
20 |         with open(context.artifacts['col_config'], 'rb') as f:
21 |             column_config = pickle.load(f)
22 | 
23 |         self.clf = CatBoostClassifier()
24 |         self.clf.load_model(context.artifacts['cbm_model'])
25 |         self.col_names = column_config['col_names']
26 |         self.preserve_cols = column_config['preserve_neg_vals']
27 | 
28 |     def preprocess(self, data):
29 |         """
30 |         Applies the pre-processing pipeline to the features given in the input dataset.
31 | 
32 |         :param data: Input dataset.
33 |         :return: Transformed dataset.
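
        Example (illustrative sketch; the run URI is a placeholder and not
        taken from this project's configuration):

            model = mlflow.pyfunc.load_model('runs:/<run_id>/model')
            predictions = model.predict(input_df)  # invokes preprocess() internally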
34 | """ 35 | data = data[self.col_names] 36 | data = remove_inf_values(data) 37 | data = remove_negative_values(data, ignore_cols=self.preserve_cols) 38 | return self.pipeline.transform(data) 39 | 40 | def predict(self, context, model_input): 41 | X = self.preprocess(model_input) 42 | return self.clf.predict(X) 43 | -------------------------------------------------------------------------------- /ml_ids/models/gradient_boost/train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities to train a machine learning estimator based on the Gradient Boosting algorithm using the CatBoost library. 3 | """ 4 | import logging 5 | from collections import namedtuple 6 | import pandas as pd 7 | from catboost import CatBoostClassifier, Pool 8 | from sklearn.preprocessing import FunctionTransformer 9 | 10 | from ml_ids.transform.preprocessing import create_pipeline 11 | from ml_ids.transform.sampling import upsample_minority_classes 12 | from ml_ids.model_selection import split_x_y 13 | 14 | LOGGER = logging.getLogger(__name__) 15 | 16 | GradientBoostHyperParams = namedtuple('GradientBoostHyperParams', 17 | ['nr_iterations', 'tree_depth', 'l2_reg', 'border_count', 'random_strength', 18 | 'task_type']) 19 | 20 | 21 | def fit_pipeline(train_dataset): 22 | """ 23 | Creates and fits the scikit-learn pre-processing pipeline. 24 | 25 | :param train_dataset: Training dataset. 26 | :return: Tuple of (fitted scikit-learn pipeline, column names). 27 | """ 28 | cols_to_impute = train_dataset.columns[train_dataset.isna().any()].tolist() 29 | 30 | X_train, _ = split_x_y(train_dataset) 31 | 32 | pipeline, get_col_names = create_pipeline(X_train, 33 | imputer_strategy='median', 34 | imputer_cols=cols_to_impute, 35 | scaler=FunctionTransformer, 36 | scaler_args={'validate': False}) 37 | pipeline.fit(X_train) 38 | return pipeline, get_col_names() 39 | 40 | 41 | def preprocess_val_dataset(pipeline, val_dataset): 42 | """ 43 | Pre-processes the validation dataset. 44 | 45 | :param pipeline: Scikit-learn pipeline. 46 | :param val_dataset: Validation dataset. 47 | :return: Tuple of (transformed features, labels) 48 | """ 49 | X_val, y_val = split_x_y(val_dataset) 50 | X_val = pipeline.transform(X_val) 51 | 52 | return X_val, y_val.label_is_attack 53 | 54 | 55 | def preprocess_train_dataset(pipeline, train_dataset, nr_attack_samples, random_state): 56 | """ 57 | Pre-processes the training dataset. 58 | 59 | :param pipeline: Scikit-learn pipeline. 60 | :param train_dataset: Training dataset. 61 | :param nr_attack_samples: Minimum number of attack samples per category. If the actual number of samples in the 62 | dataset is lower than this number the SMOTE algorithm will be used to upsample this category to have the requested 63 | number of samples. 64 | :return: Tuple of (transformed features, labels) 65 | """ 66 | X_train, y_train = split_x_y(train_dataset) 67 | X_train = pipeline.transform(X_train) 68 | 69 | X_train, y_train = upsample_minority_classes(X_train, y_train, 70 | min_samples=nr_attack_samples, 71 | random_state=random_state) 72 | 73 | return X_train, (y_train != 0).astype('int') 74 | 75 | 76 | def calculate_class_weights(y_train): 77 | """ 78 | Calculates the class weights of the unique classes in the training labels. 79 | 80 | :param y_train: Training labels. 81 | :return: Array of class weights. 
82 | """ 83 | minority_class_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1]) 84 | return [1, minority_class_weight] 85 | 86 | 87 | def train_gb_classifier(train_pool, 88 | val_pool, 89 | class_weights, 90 | nr_iterations, 91 | tree_depth, 92 | l2_reg, 93 | border_count, 94 | random_strength, 95 | task_type, 96 | random_state=None): 97 | """ 98 | Trains an estimator based on the Gradient Boosting algorithm using the CatBoost library. 99 | 100 | :param train_pool: Training dataset. 101 | :param val_pool: Validation dataset. 102 | :param class_weights: Class weights of the target labels. 103 | :param nr_iterations: The maximum number of trees that can be built when solving machine learning problems. 104 | :param tree_depth: Depth of a single tree. 105 | :param l2_reg: Coefficient at the L2 regularization term of the cost function. 106 | :param border_count: The number of splits for numerical features. 107 | :param random_strength: The amount of randomness to use for scoring splits when the tree structure is selected. 108 | :param task_type: The processing unit type to use for training (CPU | GPU). 109 | :param random_state: State to initialize the random number generator. 110 | :return: Trained CatBoost classifier. 111 | """ 112 | clf = CatBoostClassifier(loss_function='Logloss', 113 | iterations=nr_iterations, 114 | depth=tree_depth, 115 | l2_leaf_reg=l2_reg, 116 | border_count=border_count, 117 | random_strength=random_strength, 118 | task_type=task_type, 119 | class_weights=class_weights, 120 | verbose=1, 121 | random_seed=random_state) 122 | 123 | clf.fit(train_pool, eval_set=val_pool) 124 | return clf 125 | 126 | 127 | def train_model(train_dataset: pd.DataFrame, 128 | val_dataset: pd.DataFrame, 129 | hyper_params: GradientBoostHyperParams, 130 | nr_attack_samples: int, 131 | random_seed: int = None): 132 | """ 133 | Trains an estimator based on the Gradient Boosting algorithm using the CatBoost library. 134 | 135 | :param train_dataset: Training dataset. 136 | :param val_dataset: Validation dataset. 137 | :param hyper_params: Hyper-parameters applied to the Gradient Boosting algorithm. 138 | :param nr_attack_samples: Minimum number of attack samples per category. If the actual number of samples in the 139 | dataset is lower than this number the SMOTE algorithm will be used to upsample this category to have the requested 140 | number of samples. 141 | :param random_seed: Seed to initialize the random number generator. 
142 | :return: Tuple of (CatBoost classifier, pre-processing pipeline, column names) 143 | """ 144 | LOGGER.info('Training model with parameters [samples-per-attack-category=%s, hyperparams=%s]', 145 | nr_attack_samples, 146 | hyper_params) 147 | 148 | pipeline, col_names = fit_pipeline(train_dataset) 149 | 150 | X_train, y_train = preprocess_train_dataset(pipeline, train_dataset, nr_attack_samples, random_seed) 151 | train_pool = Pool(X_train, y_train) 152 | 153 | if val_dataset is not None: 154 | X_val, y_val = preprocess_val_dataset(pipeline, val_dataset) 155 | val_pool = Pool(X_val, y_val) 156 | else: 157 | val_pool = None 158 | 159 | clf = train_gb_classifier(train_pool=train_pool, 160 | val_pool=val_pool, 161 | class_weights=calculate_class_weights(y_train), 162 | nr_iterations=hyper_params.nr_iterations, 163 | tree_depth=hyper_params.tree_depth, 164 | l2_reg=hyper_params.l2_reg, 165 | border_count=hyper_params.border_count, 166 | random_strength=hyper_params.random_strength, 167 | task_type=hyper_params.task_type, 168 | random_state=random_seed) 169 | 170 | return clf, pipeline, col_names 171 | -------------------------------------------------------------------------------- /ml_ids/prediction.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities to create predictions given a Scikit-learn estimator and a dataset containing input features. 3 | """ 4 | 5 | 6 | def predict_proba_positive(clf, X): 7 | """ 8 | Performs predictions for a binary classification task given a scikit-learn model. 9 | This function returns the class probability of the positive class. 10 | 11 | :param clf: Scikit-learn estimator. 12 | :param X: Dataset containing the samples. 13 | :return: numpy array containing the class probabilities of the positive class. 14 | """ 15 | return clf.predict_proba(X)[:, 1].reshape(-1) 16 | 17 | 18 | def predict_decision_boundary(clf, X, decision_boundary=0.5): 19 | """ 20 | Performs predictions for a binary classification task given a scikit-learn model and a decision boundary. 21 | If the probability of a sample belonging to the positive class exceeds the decision boundary the positive label 22 | is assigned to the sample, otherwise the negative label is used. 23 | 24 | :param clf: Scikit-learn estimator. 25 | :param X: Dataset containing samples. 26 | :param decision_boundary: Decision boundary used to assign predictions to the positive class. 27 | :return: numpy array containing the binary predictions as one of the values {0, 1}. 28 | """ 29 | pred = predict_proba_positive(clf, X) 30 | return (pred >= decision_boundary).astype('int') 31 | -------------------------------------------------------------------------------- /ml_ids/tf_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for TensorFlow. 
3 | """ 4 | import tensorflow as tf 5 | 6 | 7 | def enable_gpu_memory_growth(): 8 | """ 9 | Enables the experimental setting `allow_memory_growth` for GPU devices 10 | 11 | :return: None 12 | """ 13 | physical_devices = tf.config.experimental.list_physical_devices('GPU') 14 | tf.config.experimental.set_memory_growth(physical_devices[0], True) 15 | -------------------------------------------------------------------------------- /ml_ids/transform/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/ml_ids/transform/__init__.py -------------------------------------------------------------------------------- /ml_ids/transform/preprocessing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for data pre-processing. 3 | """ 4 | from typing import List 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.pipeline import Pipeline 8 | from sklearn.compose import ColumnTransformer 9 | from sklearn.impute import SimpleImputer 10 | from sklearn.preprocessing import StandardScaler, OneHotEncoder 11 | from sklearn.exceptions import NotFittedError 12 | from sklearn.base import BaseEstimator 13 | 14 | 15 | def remove_outliers(df: pd.DataFrame, zscore: int = 3) -> pd.DataFrame: 16 | """ 17 | Removes all rows from the given DataFrame containing outliers in any of the columns. 18 | 19 | :param df: Input DataFrame. 20 | :param zscore: z-score to use when calculating outliers. 21 | :return: The DataFrame with all outliers removed. 22 | """ 23 | scores = (df - df.mean()) / df.std(ddof=0).values 24 | return df[(np.abs(scores) < zscore).all(axis=1)] 25 | 26 | 27 | def create_pipeline(df: pd.DataFrame, 28 | imputer_strategy: str = 'mean', 29 | imputer_cols: List[str] = None, 30 | scaler: BaseEstimator = StandardScaler, 31 | scaler_args: dict = None, 32 | cat_cols: List[str] = None, 33 | copy: bool = True): 34 | """ 35 | Creates a pipeline performing the following steps: 36 | - value imputation 37 | - value scaling 38 | - one-hot-encoding of categorical values. 39 | 40 | :param df: Input DataFrame. 41 | :param imputer_strategy: Imputer strategy applied to missing values. 42 | Allowed values are ['mean', 'median', 'most_frequent', 'constant']. 43 | :param imputer_cols: Columns to impute. If no columns are specified all columns will be imputed. 44 | :param scaler: Scikit-learn scaler to be applied to all values. 45 | :param scaler_args: Additional arguments forwarded to the specified scaler. 46 | :param cat_cols: Categorical columns to be one-hot-encoded. 47 | :param copy: If True, a copy of the input will be created. 48 | :return: A tuple containing the pipeline and a function returning the columns names after the pipeline has been 49 | fitted. 
50 | """ 51 | 52 | def create_get_feature_names(p, imp, scl, cat): 53 | def get_feature_names(): 54 | if not hasattr(p, 'transformers_'): 55 | raise AssertionError('Pipeline is not yet fitted.') 56 | 57 | try: 58 | cat_names = p.transformers_[2][1].get_feature_names(cat) 59 | except NotFittedError: 60 | cat_names = [] 61 | return np.append(imp, np.append(scl, cat_names)) 62 | 63 | return get_feature_names 64 | 65 | if scaler_args is None: 66 | scaler_args = {} 67 | 68 | cat_features = cat_cols if cat_cols else [] 69 | num_features = [c for c in df.select_dtypes(include=[np.number]).columns.values if c not in cat_features] 70 | imp_features: List[str] = [] 71 | 72 | if imputer_strategy is not None: 73 | imp_features = imputer_cols if imputer_cols else num_features 74 | 75 | scale_features = [f for f in num_features if f not in imp_features] 76 | 77 | imp_pipeline = Pipeline([ 78 | ('imputer', SimpleImputer(missing_values=np.nan, strategy=imputer_strategy, copy=copy)), 79 | ('imp_scaler', scaler(**scaler_args)) 80 | ]) 81 | 82 | pipeline = ColumnTransformer([ 83 | ('imp', imp_pipeline, imp_features), 84 | ('scl', scaler(**scaler_args), scale_features), 85 | ('one_hot', OneHotEncoder(categories='auto'), cat_features) 86 | ]) 87 | 88 | return pipeline, create_get_feature_names(pipeline, imp_features, scale_features, cat_features) 89 | -------------------------------------------------------------------------------- /ml_ids/transform/sampling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities to modify the amount of samples of specific categories in a datasets. 3 | """ 4 | import numpy as np 5 | import pandas as pd 6 | from imblearn.over_sampling import SMOTE, SMOTENC 7 | from typing import Tuple, List 8 | 9 | 10 | def upsample_minority_classes(X: np.ndarray, 11 | y: pd.DataFrame, 12 | min_samples: int, 13 | random_state: int = None, 14 | cat_cols: List[int] = None, 15 | n_jobs: int = 24) -> Tuple[np.ndarray, np.ndarray]: 16 | """ 17 | Synthetic up-sampling of minority classes using `imblearn.over_sampling.SMOTE`. 18 | 19 | :param X: Predictor variables. 20 | :param y: Labels. 21 | :param min_samples: Minimum samples of each class. 22 | :param random_state: Random state. 23 | :param cat_cols: Column indices of categorical features. 24 | :param n_jobs: Number of threads to use. 25 | :return: A tuple containing the up-sampled X and y values. 26 | """ 27 | counts = y.label_cat.value_counts() 28 | sample_dict = {} 29 | 30 | for i in np.unique(y.label_cat): 31 | sample_dict[i] = max(counts[i], min_samples) 32 | 33 | if cat_cols: 34 | smote = SMOTENC(sampling_strategy=sample_dict, 35 | categorical_features=cat_cols, 36 | n_jobs=n_jobs, 37 | random_state=random_state) 38 | else: 39 | smote = SMOTE(sampling_strategy=sample_dict, n_jobs=n_jobs, random_state=random_state) 40 | 41 | x_s, y_s = smote.fit_resample(X, y.label_cat) 42 | return x_s, y_s 43 | 44 | 45 | def create_sample_dict(df: pd.DataFrame, 46 | default_nr_samples: int, 47 | samples_per_label: dict = None) -> dict: 48 | """ 49 | Creates a dictionary containing the number of samples per label. 50 | 51 | :param df: Input DataFrame. 52 | :param default_nr_samples: Default number of samples per label. 53 | :param samples_per_label: Number of samples for specific labels. 54 | :return: Dictionary containing the number of samples per label. 
55 | """ 56 | if samples_per_label is None: 57 | samples_per_label = {} 58 | 59 | sample_dict = df.label_cat.value_counts().to_dict() 60 | 61 | for label in sample_dict.keys(): 62 | requested_samples = samples_per_label[label] if label in samples_per_label else default_nr_samples 63 | existing_samples = sample_dict[label] if label in sample_dict else 0 64 | sample_dict[label] = min(requested_samples, existing_samples) 65 | 66 | return sample_dict 67 | 68 | 69 | def downsample(df: pd.DataFrame, 70 | default_nr_samples: int, 71 | samples_per_label: dict = None, 72 | random_state: int = None) -> pd.DataFrame: 73 | """ 74 | Downsamples the given DataFrame to contain at most `default_nr_samples` per instance of label. 75 | 76 | :param df: Input DataFrame. 77 | :param default_nr_samples: Default number of samples per label. 78 | :param samples_per_label: Number of samples for specific labels. 79 | :param random_state: Random state. 80 | :return: The downsampled DataFrame. 81 | """ 82 | if samples_per_label is None: 83 | samples_per_label = {} 84 | 85 | sample_dict = create_sample_dict(df, default_nr_samples, samples_per_label) 86 | return pd.concat([df[df.label_cat == l].sample(n=n, random_state=random_state) for l, n in sample_dict.items()]) 87 | -------------------------------------------------------------------------------- /ml_ids/visualization.py: -------------------------------------------------------------------------------- 1 | """ 2 | Visualization utilities for IPython Notebooks. 3 | """ 4 | # pylint: disable=import-error 5 | import numpy as np 6 | import pandas as pd 7 | import seaborn as sns 8 | import matplotlib.pyplot as plt 9 | from matplotlib.ticker import MaxNLocator 10 | from sklearn.metrics import confusion_matrix, classification_report, average_precision_score, precision_recall_curve 11 | from IPython.display import display 12 | 13 | 14 | def plot_hist(hist, 15 | metrics=None, 16 | y_lim=None, 17 | size=(8, 5), 18 | ax=None): 19 | """ 20 | Plot a Keras history object. 21 | 22 | :param hist: The Keras history. 23 | :param metrics: A list of histories to plot. 24 | :param y_lim: Limits the y-axis. 25 | :param size: Size of the plot. 26 | :param ax: Axis to apply the plot. 27 | """ 28 | if metrics is None: 29 | metrics = ['loss', 'val_loss'] 30 | 31 | fig_size = size if not ax else None 32 | 33 | df = pd.DataFrame(hist.history)[metrics] 34 | df.plot(figsize=fig_size, ax=ax) 35 | 36 | gca = ax if ax else plt.gca() 37 | gca.xaxis.set_major_locator(MaxNLocator(integer=True)) 38 | 39 | if y_lim: 40 | gca.set_ylim(y_lim) 41 | 42 | if ax: 43 | ax.grid(True) 44 | else: 45 | plt.grid(True) 46 | plt.show() 47 | 48 | 49 | def plot_confusion_matrix(y_true, 50 | y_pred, 51 | classes=None, 52 | size=(10, 10), 53 | normalize=False, 54 | title=None, 55 | print_raw=False, 56 | cmap=plt.cm.Blues): 57 | """ 58 | This function prints and plots the confusion matrix. 59 | Normalization can be applied by setting `normalize=True`. 60 | 61 | :param y_true: True labels. 62 | :param y_pred: Predicted labels. 63 | :param classes: List of class names. 64 | :param size: Size of the plot. 65 | :param normalize: If True values of the confusion matrix will be normalized. 66 | :param title: Title of the plot. 67 | :param print_raw: If True the raw confusion matrix is printed. 
68 | :param cmap: Color map 69 | """ 70 | if not title: 71 | if normalize: 72 | title = 'Normalized confusion matrix' 73 | else: 74 | title = 'Confusion matrix, without normalization' 75 | 76 | # Compute confusion matrix 77 | cm = confusion_matrix(y_true, y_pred) 78 | 79 | if normalize: 80 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 81 | 82 | if print_raw: 83 | print(cm) 84 | 85 | fig, ax = plt.subplots(figsize=size) 86 | im = ax.matshow(cm, interpolation='nearest', cmap=cmap) 87 | ax.figure.colorbar(im, ax=ax) 88 | ax.set(title=title, 89 | ylabel='True label', 90 | xlabel='Predicted label') 91 | 92 | if classes is not None: 93 | x_labels = classes 94 | y_labels = classes 95 | 96 | ax.set(xticks=np.arange(cm.shape[1]), 97 | yticks=np.arange(cm.shape[0]), 98 | xticklabels=x_labels, 99 | yticklabels=y_labels) 100 | 101 | plt.margins(2) 102 | ax.tick_params(axis="x", bottom=True, labelbottom=True, top=False, labeltop=False, rotation=45) 103 | 104 | # Rotate the tick labels and set their alignment. 105 | plt.setp(ax.get_xticklabels(), rotation=45, ha="right", 106 | rotation_mode="anchor") 107 | 108 | # Loop over data dimensions and create text annotations. 109 | fmt = '.2f' if normalize else 'd' 110 | thresh = cm.max() / 2. 111 | for i in range(cm.shape[0]): 112 | for j in range(cm.shape[1]): 113 | ax.text(j, i, format(cm[i, j], fmt), 114 | ha="center", va="center", 115 | color="white" if cm[i, j] > thresh else "black") 116 | fig.tight_layout() 117 | return ax 118 | 119 | 120 | def identity(x): 121 | """ 122 | Identity function. 123 | """ 124 | return x 125 | 126 | 127 | def plot_threshold(pred_train, pred_val, threshold, size=(15, 5), transform=identity): 128 | """ 129 | Plots the reconstruction errors of training and test samples and displays the classification threshold. 130 | 131 | :param pred_train: Predictions of training samples. 132 | :param pred_val: Predictions of validation samples. 133 | :param threshold: Classification threshold. 134 | :param size: Size of the plot. 135 | :param transform: Value transformation. 136 | """ 137 | _, ax = plt.subplots(figsize=size) 138 | sns.distplot(transform(pred_train.rec_error.values), hist=False, ax=ax, label='Train Benign') 139 | sns.distplot(transform(pred_val[pred_val.y_true == 0].rec_error.values), hist=False, ax=ax, 140 | label='Validation Benign') 141 | sns.distplot(transform(pred_val[pred_val.y_true == 1].rec_error.values), hist=False, ax=ax, 142 | label='Validation Attack') 143 | ax.axvline(transform(threshold), color='red', linestyle='--') 144 | ax.legend() 145 | 146 | 147 | def get_misclassifications(y, y_true, pred): 148 | """ 149 | Calculates the misclassification rate for each label. 150 | 151 | :param y: Pandas DataFrame containing the target labels. 152 | :param y_true: True labels. 153 | :param pred: Predicted labels. 154 | :return: Pandas DataFrame containing the misclassification per label. 
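
    Example: if the dataset contains 10 'Bot' flows of which 2 are predicted
    incorrectly, the corresponding row contains (misclassified=2, total=10,
    percent_misclassified=0.2); rows are sorted by percent_misclassified in
    descending order.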
155 | """ 156 | misclassifications = y[y_true != pred] 157 | 158 | mc_df = pd.merge(pd.DataFrame({'misclassified': misclassifications.label.value_counts()}), 159 | pd.DataFrame({'total': y.label.value_counts()}), 160 | how='left', left_index=True, right_index=True) 161 | mc_df['percent_misclassified'] = mc_df.apply(lambda x: x[0] / x[1], axis=1) 162 | return mc_df.sort_values('percent_misclassified', ascending=False) 163 | 164 | 165 | def print_binary_performance(y, y_true, pred, print_misclassifications=True, digits=3): 166 | """ 167 | Prints the performance of a binary classifier using 168 | - the classification report, 169 | - the confusion matrix and 170 | - the misclassification report. 171 | 172 | :param y: Pandas DataFrame containing the target labels (binary, categories). 173 | :param y_true: True labels. 174 | :param pred: Predicted labels. 175 | :param print_misclassifications: Binary indicator instructing that the misclassification report should be printed. 176 | :param digits: Number of digits used to print the classification report. 177 | :return: None 178 | """ 179 | print('Classification Report:') 180 | print('======================') 181 | print(classification_report(y_true, pred, digits=digits)) 182 | 183 | print('Confusion Matrix:') 184 | print('=================') 185 | plot_confusion_matrix(y_true, pred, np.array(['Benign', 'Attack']), size=(5, 5)) 186 | plt.show() 187 | 188 | if print_misclassifications: 189 | print('Misclassifications by attack category:') 190 | print('======================================') 191 | mc_df = get_misclassifications(y, y_true, pred) 192 | display(mc_df) 193 | 194 | 195 | def plot_pr_curve(y_true, y_score, size=(8, 5), average='weighted'): 196 | """ 197 | Plots the precision-recall curve for a single estimator. 198 | 199 | :param y_true: True labels. 200 | :param y_score: Predicted probabilities. 201 | :param size: Size of the plot. 202 | :param average: Average parameter used for the calculation of the average precision score. 203 | :return: None 204 | """ 205 | precisions, recalls, _ = precision_recall_curve(y_true, y_score) 206 | pr_auc = average_precision_score(y_true, y_score, average=average) 207 | 208 | plt.figure(figsize=size) 209 | plt.plot(recalls, precisions, label='auc={}'.format(pr_auc)) 210 | plt.title('Precision / Recall Curve') 211 | plt.xlabel('Recall') 212 | plt.ylabel('Precision') 213 | plt.legend(loc='lower left') 214 | plt.show() 215 | 216 | print('Average PR Score {}'.format(pr_auc)) 217 | 218 | 219 | def plot_pr_curves(y_true, y_score_dict, size=(8, 5), average='weighted'): 220 | """ 221 | Plots the precision-recall curve for a multiple estimators. 222 | 223 | :param y_true: True labels. 224 | :param y_score_dict: Dictionary containing the estimator name as keys and the predicted label probabilities 225 | as values. 226 | :param size: Size of the plot. 227 | :param average: Average parameter used for the calculation of the average precision score. 
228 |     :return: None
229 |     """
230 |     plt.figure(figsize=size)
231 | 
232 |     for name, y_score in y_score_dict.items():
233 |         precisions, recalls, _ = precision_recall_curve(y_true, y_score)
234 |         pr_auc = average_precision_score(y_true, y_score, average=average)
235 |         plt.plot(recalls, precisions, label='{} (AUC={})'.format(name, pr_auc))
236 | 
237 |     plt.title('Precision / Recall Curve')
238 |     plt.xlabel('Recall')
239 |     plt.ylabel('Precision')
240 |     plt.legend(loc='lower left')
241 |     plt.show()
242 | 
243 | 
244 | def plot_pr_threshold_curves(y_true, y_pred_score, size=(20, 8)):
245 |     """
246 |     Plots the precision-recall values for different probability thresholds.
247 | 
248 |     :param y_true: True labels.
249 |     :param y_pred_score: Predicted probabilities.
250 |     :param size: Size of the plot.
251 |     :return: None
252 |     """
253 |     precisions, recalls, thresholds = precision_recall_curve(y_true, y_pred_score)
254 | 
255 |     # plot precision / recall for different thresholds
256 |     plt.figure(figsize=size)
257 |     plt.plot(thresholds, precisions[:-1], label="Precision")
258 |     plt.plot(thresholds, recalls[:-1], label="Recall")
259 |     plt.title('Precision / Recall of different thresholds')
260 |     plt.xlabel('Threshold')
261 |     plt.ylabel('Precision / Recall')
262 |     plt.legend(loc='lower right')
263 |     plt.show()
264 | 
--------------------------------------------------------------------------------
/models/gradient_boost/envs/local/train.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import shutil
4 | import click
5 | import mlflow
6 | 
7 | 
8 | def merge(dict1, dict2):
9 |     """
10 |     Merges two dictionaries by creating copies of the dictionaries.
11 |     :param dict1: First dictionary to merge
12 |     :param dict2: Second dictionary to merge
13 |     :return: Merged dictionary
14 |     """
15 |     d = dict(dict1)
16 |     d.update(dict2)
17 |     return d
18 | 
19 | 
20 | @click.command()
21 | @click.option('--train-path', type=click.Path(exists=True), required=True,
22 |               help='Path to the train dataset in .h5 format.')
23 | @click.option('--val-path', type=click.Path(exists=True), required=True,
24 |               help='Path to the validation dataset in .h5 format.')
25 | @click.option('--test-path', type=click.Path(exists=True), required=True,
26 |               help='Path to the test dataset in .h5 format.')
27 | @click.option('--output-path', type=click.Path(), required=True,
28 |               help='Path to store the output.')
29 | @click.option('--param-path', type=click.Path(exists=True), required=True,
30 |               help='Path to the training parameters.')
31 | def train(train_path, val_path, test_path, output_path, param_path):
32 |     with open(param_path, 'r') as f:
33 |         params = json.load(f)
34 | 
35 |     shutil.rmtree(output_path, ignore_errors=True)
36 |     os.makedirs(output_path, exist_ok=True)
37 | 
38 |     run_params = merge(params, {
39 |         'train_path': train_path,
40 |         'val_path': val_path,
41 |         'test_path': test_path,
42 |         'output_path': output_path,
43 |         'artifact_path': output_path,
44 |     })
45 | 
46 |     mlflow.run('models/gradient_boost/project',
47 |                parameters=run_params)
48 | 
49 | 
50 | if __name__ == '__main__':
51 |     train()
--------------------------------------------------------------------------------
/models/gradient_boost/envs/sagemaker/configs/deploy.json:
--------------------------------------------------------------------------------
1 | {
2 |   "deploy": {
3 |     "app_name": "ml-ids-classifier",
4 |     "instance_type": "ml.t2.medium",
5 |     "instance_count": 1,
6 |     "region": "eu-west-1"
7 |   },
8 |   "role":
"arn:aws:iam::763816190631:role/service-role/AmazonSageMaker-ExecutionRole-20191125T215860", 9 | "model_bucket": "s3://sagemaker-eu-west-1-763816190631", 10 | "model_artifact": "model.tar.gz", 11 | "model_name": "ml-ids-gb_mlflow_pyfunc" 12 | } -------------------------------------------------------------------------------- /models/gradient_boost/envs/sagemaker/configs/train-cpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "instance_type": "ml.m5.large", 4 | "instance_count": 1, 5 | "task_type": "CPU" 6 | }, 7 | "role": "arn:aws:iam::763816190631:role/service-role/AmazonSageMaker-ExecutionRole-20191125T215860", 8 | "data": { 9 | "train": "s3://ml-ids-2018-sm/training", 10 | "val": "s3://ml-ids-2018-sm/validation", 11 | "test": "s3://ml-ids-2018-sm/testing" 12 | }, 13 | "model_bucket": "s3://sagemaker-eu-west-1-763816190631" 14 | } -------------------------------------------------------------------------------- /models/gradient_boost/envs/sagemaker/configs/train-gpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": { 3 | "instance_type": "ml.p2.xlarge", 4 | "instance_count": 1, 5 | "task_type": "GPU" 6 | }, 7 | "role": "arn:aws:iam::763816190631:role/service-role/AmazonSageMaker-ExecutionRole-20191125T215860", 8 | "data": { 9 | "train": "s3://ml-ids-2018-full/training", 10 | "val": "s3://ml-ids-2018-full/validation", 11 | "test": "s3://ml-ids-2018-full/testing" 12 | }, 13 | "model_bucket": "s3://sagemaker-eu-west-1-763816190631" 14 | } -------------------------------------------------------------------------------- /models/gradient_boost/envs/sagemaker/container/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-base 2 | 3 | # Install Miniconda 3 4 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 5 | ENV PATH /opt/conda/bin:$PATH 6 | 7 | RUN apt-get update --fix-missing && \ 8 | apt-get install -y wget bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 git mercurial subversion && \ 9 | apt-get clean 10 | 11 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh -O ~/miniconda.sh && \ 12 | /bin/bash ~/miniconda.sh -b -p /opt/conda && \ 13 | rm ~/miniconda.sh && \ 14 | /opt/conda/bin/conda clean -tipsy && \ 15 | ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ 16 | echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ 17 | echo "conda activate base" >> ~/.bashrc && \ 18 | find /opt/conda/ -follow -type f -name '*.a' -delete && \ 19 | find /opt/conda/ -follow -type f -name '*.js.map' -delete && \ 20 | /opt/conda/bin/conda clean -afy 21 | 22 | # Copy Conda environment file 23 | COPY models/gradient_boost/project/conda.yaml /opt/ml/code/conda.yaml 24 | 25 | # Install Conda environment 26 | RUN conda env create -f /opt/ml/code/conda.yaml 27 | 28 | # Copy project files 29 | ADD ml_ids /opt/ml/code/ml_ids 30 | ADD models/gradient_boost/project /opt/ml/code/models/gradient_boost/project 31 | COPY setup.cfg /opt/ml/code/setup.cfg 32 | COPY setup.py /opt/ml/code/setup.py 33 | 34 | # Activate conda env 35 | RUN echo "source activate ml-ids-gradient-boost-catboost" > ~/.bashrc 36 | ENV PATH /opt/conda/envs/ml-ids-gradient-boost-catboost/bin:$PATH 37 | 38 | # Copy train script and make it executable 39 | COPY models/gradient_boost/envs/sagemaker/container/train.py /opt/ml/code/train 40 | RUN chmod +x /opt/ml/code/train 41 | ENV PATH="/opt/ml/code:${PATH}" 42 | 43 | WORKDIR /opt/ml/code -------------------------------------------------------------------------------- /models/gradient_boost/envs/sagemaker/container/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | import json 6 | import traceback 7 | import uuid 8 | import mlflow 9 | 10 | prefix = '/opt/ml/' 11 | 12 | output_path = os.path.join(prefix, 'output') 13 | model_path = os.path.join(prefix, 'model') 14 | param_path = os.path.join(prefix, 'input/config/hyperparameters.json') 15 | 16 | input_path = prefix + 'input/data' 17 | training_path = os.path.join(input_path, 'training') 18 | validation_path = os.path.join(input_path, 'validation') 19 | testing_path = os.path.join(input_path, 'testing') 20 | 21 | mlflow_project_uri = os.path.join(prefix, 'code/models/gradient_boost/project') 22 | mlflow_out_path = os.path.join('/tmp', str(uuid.uuid4())) 23 | 24 | 25 | def merge(dict1, dict2): 26 | d = dict(dict1) 27 | d.update(dict2) 28 | return d 29 | 30 | 31 | if __name__ == '__main__': 32 | print('Starting the training') 33 | 34 | try: 35 | with open(param_path, 'r') as tc: 36 | training_params = json.load(tc) 37 | 38 | training_file_path = os.path.join(training_path, 'train.h5') 39 | validation_file_path = os.path.join(validation_path, 'val.h5') 40 | testing_file_path = os.path.join(testing_path, 'test.h5') 41 | 42 | mlflow_params = merge(training_params, { 43 | 'train_path': training_file_path, 44 | 'val_path': validation_file_path, 45 | 'test_path': testing_file_path, 46 | 'output_path': mlflow_out_path, 47 | 'artifact_path': model_path 48 | }) 49 | 50 | os.makedirs(mlflow_out_path, exist_ok=True) 51 | 52 | mlflow.run(mlflow_project_uri, parameters=mlflow_params, use_conda=False) 53 | print('Training complete.') 54 | 55 | sys.exit(0) 56 | except Exception as e: 57 | # Write out an error file. This will be returned as the failureReason in the 58 | # DescribeTrainingJob result. 59 | trc = traceback.format_exc() 60 | with open(os.path.join(output_path, 'failure'), 'w') as s: 61 | s.write('Exception during training: ' + str(e) + '\n' + trc) 62 | # Printing this causes the exception to be in the training job logs, as well. 63 | print('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr) 64 | # A non-zero exit code causes the training job to be marked as Failed. 
65 |         sys.exit(255)
66 | 
--------------------------------------------------------------------------------
/models/gradient_boost/envs/sagemaker/scripts/build_image.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | image_name=$1
4 | image_version=$2
5 | 
6 | if [ "$image_name" == "" ]
7 | then
8 |     echo "Usage: $0 <image-name> <image-version>"
9 |     exit 1
10 | fi
11 | 
12 | if [ "$image_version" == "" ]
13 | then
14 |     echo "Usage: $0 <image-name> <image-version>"
15 |     exit 1
16 | fi
17 | 
18 | fullname="${image_name}:${image_version}"
19 | 
20 | echo "Building image '${fullname}'"
21 | 
22 | docker build -f models/gradient_boost/envs/sagemaker/container/Dockerfile -t ${fullname} .
--------------------------------------------------------------------------------
/models/gradient_boost/envs/sagemaker/scripts/deploy.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import re
4 | import tarfile
5 | import boto3
6 | import click
7 | from mlflow import sagemaker
8 | 
9 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | def unpack(file):
14 |     """
15 |     Unpacks compressed files of format `tar` and `tar.gz`.
16 |     :param file: Filename.
17 |     :return: None
18 |     """
19 |     if file.endswith("tar.gz"):
20 |         tar = tarfile.open(file, "r:gz")
21 |         tar.extractall()
22 |         tar.close()
23 |     elif file.endswith("tar"):
24 |         tar = tarfile.open(file, "r:")
25 |         tar.extractall()
26 |         tar.close()
27 | 
28 | 
29 | @click.command()
30 | @click.option('--config-path', type=click.Path(exists=True), required=True,
31 |               help='Path to the config.')
32 | @click.option('--job-id', type=str, required=True,
33 |               help='Unique ID of the training job. Model is retrieved from a subdirectory with this name.')
34 | def deploy(config_path, job_id):
35 |     with open(config_path, 'r') as f:
36 |         config = json.load(f)
37 | 
38 |     app_name = config['deploy']['app_name']
39 |     instance_type = config['deploy']['instance_type']
40 |     instance_count = config['deploy']['instance_count']
41 |     region = config['deploy']['region']
42 |     role = config['role']
43 |     model_name = config['model_name']
44 |     model_bucket = re.sub('s3://', '', config['model_bucket'])
45 |     model_artifact = config['model_artifact']
46 |     model_path = '{}/output/{}'.format(job_id, model_artifact)
47 | 
48 |     logger.info('Deploying model with parameters '
49 |                 '[app-name="{}", instance-type="{}", instance-count={}, region="{}", model-path="{}"]'
50 |                 .format(app_name, instance_type, instance_count, region, model_path))
51 | 
52 |     s3 = boto3.client('s3')
53 |     s3.download_file(model_bucket, model_path, model_artifact)
54 | 
55 |     unpack(model_artifact)
56 | 
57 |     sagemaker.deploy(app_name=app_name,
58 |                      model_uri=model_name,
59 |                      execution_role_arn=role,
60 |                      region_name=region,
61 |                      mode='replace',
62 |                      instance_type=instance_type,
63 |                      instance_count=instance_count)
64 | 
65 | 
66 | if __name__ == '__main__':
67 |     deploy()
--------------------------------------------------------------------------------
/models/gradient_boost/envs/sagemaker/scripts/push_image_to_ecr.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | image_name=$1
4 | image_version=$2
5 | 
6 | if [ "$image_name" == "" ]
7 | then
8 |     echo "Usage: $0 <image-name> <image-version>"
9 |     exit 1
10 | fi
11 | 
12 | if [ "$image_version" == "" ]
13 | then
14 |     echo "Usage: $0 <image-name> <image-version>"
15 |     exit 1
16 | fi
17 | 
18 | # Get the account number associated with the current IAM credentials
19 | account=$(aws sts get-caller-identity --query Account --output text)
20 | 
21 | if [ $? -ne 0 ]
22 | then
23 |     exit 255
24 | fi
25 | 
26 | # Get the region defined in the current configuration (default to eu-west-1 if none defined)
27 | region=$(aws configure get region)
28 | region=${region:-eu-west-1}
29 | 
30 | fullname="${account}.dkr.ecr.${region}.amazonaws.com/${image_name}:${image_version}"
31 | 
32 | # If the repository doesn't exist in ECR, create it.
33 | 
34 | aws ecr describe-repositories --repository-names "${image_name}" > /dev/null 2>&1
35 | 
36 | if [ $? -ne 0 ]
37 | then
38 |     aws ecr create-repository --repository-name "${image_name}" > /dev/null
39 | fi
40 | 
41 | # Get the login command from ECR and execute it directly
42 | $(aws ecr get-login --region ${region} --no-include-email)
43 | 
44 | # Build the docker image locally with the image name and then push it to ECR
45 | # with the full name.
46 | 
47 | docker tag "${image_name}:${image_version}" ${fullname}
48 | docker push ${fullname}
49 | 
50 | echo "image-name=${fullname}"
--------------------------------------------------------------------------------
/models/gradient_boost/envs/sagemaker/scripts/train.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import click
4 | from sagemaker.estimator import Estimator
5 | 
6 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
7 | logger = logging.getLogger(__name__)
8 | 
9 | 
10 | def create_performance_metric_regex(id):
11 |     """
12 |     Creates the regex for a single performance metric.
13 |     Expected log format: "metric_name: 0.12345".
14 |     :param id: Metric identifier.
15 | :return: Regex 16 | """ 17 | return rf'{id}:\s*([\d.]*)' 18 | 19 | 20 | def create_metric_def(name, regex): 21 | """ 22 | Creates a metric definition for a single metric. 23 | :param name: Metric name. 24 | :param regex: Metric regex. 25 | :return: Metric definition as a `dict`. 26 | """ 27 | return {'Name': name, 'Regex': regex} 28 | 29 | 30 | def get_metric_definitions(): 31 | """ 32 | Creates the definitions for all metrics to monitor. 33 | :return: Metric definitions as a `list`. 34 | """ 35 | return [create_metric_def('train:loss', create_performance_metric_regex('learn')), 36 | create_metric_def('val:loss', create_performance_metric_regex('test')), 37 | create_metric_def('val:loss:best', r'bestTest\s=\s([\d.]*)'), 38 | create_metric_def('test:pr_auc', create_performance_metric_regex('pr_auc')), 39 | create_metric_def('test:precision', create_performance_metric_regex('precision')), 40 | create_metric_def('test:recall', create_performance_metric_regex('recall')), 41 | create_metric_def('test:f1', create_performance_metric_regex('f1'))] 42 | 43 | 44 | @click.command() 45 | @click.option('--config-path', type=click.Path(exists=True), required=True, 46 | help='Path to the config.') 47 | @click.option('--param-path', type=click.Path(exists=True), required=True, 48 | help='Path to the training parameters.') 49 | @click.option('--image-name', type=str, required=True, 50 | help='Name of the training image') 51 | @click.option('--mode', type=click.Choice(['LOCAL', 'AWS'], case_sensitive=False), default='LOCAL', 52 | help='Training mode.') 53 | @click.option('--job-id', type=str, required=True, 54 | help='Unique ID of the training job. Model outputs will be stored in a subdirectory with this name.') 55 | def train(config_path, param_path, image_name, mode, job_id): 56 | with open(config_path, 'r') as f: 57 | config = json.load(f) 58 | 59 | with open(param_path, 'r') as f: 60 | params = json.load(f) 61 | 62 | if mode == 'LOCAL': 63 | train_instance_type = 'local' 64 | params['task_type'] = 'CPU' 65 | else: 66 | train_instance_type = config['train']['instance_type'] 67 | params['task_type'] = config['train']['task_type'] 68 | 69 | train_instance_count = config['train']['instance_count'] 70 | role = config['role'] 71 | model_bucket = config['model_bucket'] 72 | 73 | logger.info('Start training with parameters ' 74 | '[job-id="{}", image="{}", mode="{}", instance_type="{}", instance_count={}, params={}]' 75 | .format(job_id, image_name, mode, train_instance_type, train_instance_count, params)) 76 | 77 | estimator = Estimator(image_name=image_name, 78 | role=role, 79 | train_instance_count=train_instance_count, 80 | train_instance_type=train_instance_type, 81 | hyperparameters=params, 82 | output_path=model_bucket, 83 | metric_definitions=get_metric_definitions(), 84 | train_max_run=(2 * 60 * 60)) 85 | 86 | estimator.fit(job_name=job_id, 87 | inputs={ 88 | 'training': config['data']['train'], 89 | 'validation': config['data']['val'], 90 | 'testing': config['data']['test'] 91 | }) 92 | 93 | 94 | if __name__ == '__main__': 95 | train() 96 | -------------------------------------------------------------------------------- /models/gradient_boost/envs/sagemaker/scripts/undeploy.py: -------------------------------------------------------------------------------- 1 | import click 2 | import json 3 | from mlflow import sagemaker 4 | 5 | 6 | @click.command() 7 | @click.option('--config-path', type=click.Path(exists=True), required=True, 8 | help='Path to the config.') 9 | def undeploy(config_path): 10 | 
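    """
    Deletes the SageMaker deployment of the application `app_name` in region
    `region`, both read from the given config file, via `mlflow.sagemaker.delete`.
    """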
    with open(config_path, 'r') as f:
        config = json.load(f)

    app_name = config['deploy']['app_name']
    region = config['deploy']['region']

    sagemaker.delete(app_name=app_name, region_name=region)


if __name__ == '__main__':
    undeploy()

--------------------------------------------------------------------------------
/models/gradient_boost/project/MLproject:
--------------------------------------------------------------------------------
name: gradient_boost_model

conda_env: conda.yaml

entry_points:
  main:
    parameters:
      train_path: path
      val_path: path
      test_path: path
      output_path: path
      artifact_path: path
      use_val_set: {type: bool, default: True}
      nr_iterations: {type: int, default: 1000}
      tree_depth: {type: int, default: 6}
      l2_reg: {type: float, default: 3.0}
      border_count: {type: int, default: 254}
      random_strength: {type: int, default: 1}
      task_type: {type: str, default: 'GPU'}
      nr_samples_attack_category: {type: int, default: 1000}
      random_seed: {type: int, default: -1}
    command: "pip install -e ../../../. &&
              python train.py --train-path {train_path}
                              --val-path {val_path}
                              --test-path {test_path}
                              --output-path {output_path}
                              --artifact-path {artifact_path}
                              --use-val-set {use_val_set}
                              --random-seed {random_seed}
                              --nr-iterations {nr_iterations}
                              --tree-depth {tree_depth}
                              --l2-reg {l2_reg}
                              --border-count {border_count}
                              --random-strength {random_strength}
                              --task-type {task_type}
                              --nr-samples-attack-category {nr_samples_attack_category}"
--------------------------------------------------------------------------------
/models/gradient_boost/project/conda.yaml:
--------------------------------------------------------------------------------
name: ml-ids-gradient-boost-catboost
channels:
  - anaconda
  - conda-forge
  - defaults
dependencies:
  - python=3.7
  - pip=19.2.3=py37_0
  - pandas=0.25.2=py37hb3f55d8_0
  - catboost=0.18.1=py37_0
  - imbalanced-learn=0.5.0=py_0
  - scikit-learn=0.21.3=py37hcdab131_0
  - scipy=1.3.1=py37h921218d_2
  - click=7.0=py37_0
  - cloudpickle=1.2.2=py_0
  - pip:
      - tables==3.6.1
      - keras==2.2.4
      - mlflow==1.4
--------------------------------------------------------------------------------
/models/gradient_boost/project/train.py:
--------------------------------------------------------------------------------
import click
import logging
import mlflow
import mlflow.pyfunc
import pickle
import os
import shutil
from catboost import Pool
from ml_ids.data.dataset import load_dataset_hdf
from ml_ids.data.metadata import FEATURES_NO_VARIANCE, FEATURES_TO_IGNORE, FEATURES_PRESERVE_NEG_COLUMNS
from ml_ids.prediction import predict_proba_positive
from ml_ids.model_selection import split_x_y
from ml_ids.models.gradient_boost.train import train_model, GradientBoostHyperParams
from ml_ids.models.gradient_boost.mlflow_wrapper import CatBoostWrapper
from sklearn.metrics import average_precision_score, precision_score, recall_score, f1_score

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def load_dataset(path):
    """
    Loads a single dataset in `hdf` format.
    :param path: Dataset path.
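        Columns listed in FEATURES_NO_VARIANCE and FEATURES_TO_IGNORE are
        dropped on load; negative values are preserved only for the columns
        in FEATURES_PRESERVE_NEG_COLUMNS.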
    :return: Pandas DataFrame.
    """
    return load_dataset_hdf(dataset_path=path,
                            omit_cols=FEATURES_NO_VARIANCE + FEATURES_TO_IGNORE,
                            preserve_neg_value_cols=FEATURES_PRESERVE_NEG_COLUMNS)


def load_train_val_test_dataset(train_path, val_path, test_path):
    """
    Loads the train, validation and test datasets.
    :param train_path: Path to the train dataset.
    :param val_path: Path to the validation dataset.
    :param test_path: Path to the test dataset.
    :return: A `Tuple(train, val, test)` of Pandas DataFrames.
    """
    return load_dataset(train_path), load_dataset(val_path), load_dataset(test_path)


def measure_performance(clf, pipeline, dataset):
    """
    Measures performance metrics on the given dataset.
    :param clf: Classifier to test.
    :param pipeline: Preprocessing pipeline.
    :param dataset: Dataset.
    :return: A `Tuple(pr_auc, precision, recall, f1)`.
    """
    X, y = split_x_y(dataset)
    X = pipeline.transform(X)

    pool = Pool(X)
    y_true = y.label_is_attack

    pred_proba = predict_proba_positive(clf, pool)
    pred = clf.predict(pool)

    pr_auc = average_precision_score(y_true, pred_proba)
    precision = precision_score(y_true, pred)
    recall = recall_score(y_true, pred)
    f1 = f1_score(y_true, pred)
    return pr_auc, precision, recall, f1


def save_artifacts(cbm_model_path, classifier, pipeline_path, pipeline, col_config_path, column_config):
    """
    Save training artifacts to disk.
    :param cbm_model_path: Path on disk where the classifier should be stored.
    :param classifier: Classifier to store.
    :param pipeline_path: Path on disk where the pipeline should be stored.
    :param pipeline: Pipeline to store.
    :param col_config_path: Path on disk where the config should be stored.
    :param column_config: Column config to store.
    :return: None
    """
    classifier.save_model(cbm_model_path)
    with open(pipeline_path, 'wb') as f:
        pickle.dump(pipeline, f)
    with open(col_config_path, 'wb') as f:
        pickle.dump(column_config, f)


@click.command()
@click.option('--train-path', type=click.Path(exists=True), required=True,
              help='Path to the train dataset in .h5 format.')
@click.option('--val-path', type=click.Path(exists=True), required=True,
              help='Path to the validation dataset in .h5 format.')
@click.option('--test-path', type=click.Path(exists=True), required=True,
              help='Path to the test dataset in .h5 format.')
@click.option('--output-path', type=click.Path(exists=True), required=True,
              help='Path to store the output.')
@click.option('--artifact-path', type=click.Path(exists=True), required=True,
              help='Path to store the artifacts.')
@click.option('--use-val-set', type=bool, default=True,
              help='Determines if the evaluation dataset should be used for early stopping of the training '
                   'process. If set to False the evaluation dataset will be appended to the train dataset.')
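# The hyperparameter options below mirror the MLproject entry point. A
# --random-seed of -1 (the MLproject default) is mapped to None in train()
# below, so that no fixed seed is used.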
@click.option('--random-seed', type=int, default=None,
              help='Random seed.')
@click.option('--nr-iterations', type=int, required=True)
@click.option('--tree-depth', type=int, required=True)
@click.option('--l2-reg', type=float, required=True)
@click.option('--border-count', type=int, required=True)
@click.option('--random-strength', type=int, required=True)
@click.option('--task-type', type=click.Choice(['CPU', 'GPU'], case_sensitive=False), required=True)
@click.option('--nr-samples-attack-category', type=int, required=True)
def train(train_path,
          val_path,
          test_path,
          output_path,
          artifact_path,
          use_val_set,
          random_seed,
          nr_iterations,
          tree_depth,
          l2_reg,
          border_count,
          random_strength,
          task_type,
          nr_samples_attack_category):
    shutil.rmtree(output_path, ignore_errors=True)
    os.makedirs(output_path, exist_ok=True)

    cbm_model_path = os.path.join(output_path, 'gradient_boost_model.cbm')
    pipeline_path = os.path.join(output_path, 'preprocessing_pipeline.pkl')
    col_config_path = os.path.join(output_path, 'column_config.pkl')
    mlflow_model_path = os.path.join(artifact_path, 'ml-ids-gb_mlflow_pyfunc')

    random_seed = None if random_seed == -1 else random_seed

    logger.info('Loading datasets...')
    train_dataset, val_dataset, test_dataset = load_train_val_test_dataset(train_path, val_path, test_path)

    if not use_val_set:
        logger.info('Evaluation dataset will not be used for early stopping. Merging with training dataset.')
        train_dataset = train_dataset.append(val_dataset)
        val_dataset = None
    else:
        logger.info('Evaluation dataset will be used for early stopping.')

    hyper_params = GradientBoostHyperParams(nr_iterations=nr_iterations,
                                            tree_depth=tree_depth,
                                            l2_reg=l2_reg,
                                            border_count=border_count,
                                            random_strength=random_strength,
                                            task_type=task_type)

    with mlflow.start_run():
        logger.info('Starting training...')
        clf, pipeline, column_names = train_model(train_dataset,
                                                  val_dataset,
                                                  hyper_params=hyper_params,
                                                  nr_attack_samples=nr_samples_attack_category,
                                                  random_seed=random_seed)

        pr_auc, precision, recall, f1 = measure_performance(clf, pipeline, test_dataset)
        logger.info('Estimator performance:')
        logger.info('pr_auc: %f', pr_auc)
        logger.info('precision: %f', precision)
        logger.info('recall: %f', recall)
        logger.info('f1: %f', f1)

        save_artifacts(cbm_model_path,
                       clf,
                       pipeline_path,
                       pipeline,
                       col_config_path,
                       {
                           'col_names': column_names,
                           'preserve_neg_vals': FEATURES_PRESERVE_NEG_COLUMNS
                       })

        mlflow.pyfunc.save_model(
            path=mlflow_model_path,
            python_model=CatBoostWrapper(),
            artifacts={
                'cbm_model': cbm_model_path,
                'pipeline': pipeline_path,
                'col_config': col_config_path
            },
            conda_env='conda.yaml',
            code_path=['../../../ml_ids'])

    logger.info('Training completed.')


if __name__ == '__main__':
    train()

--------------------------------------------------------------------------------
/models/gradient_boost/training_params.json:
--------------------------------------------------------------------------------
{
2 | "task_type": "GPU", 3 | "use_val_set": true, 4 | "nr_iterations": 2000, 5 | "tree_depth": 10, 6 | "l2_reg": 4.813919374945952, 7 | "border_count": 254, 8 | "random_strength": 5, 9 | "nr_samples_attack_category": 100000 10 | } -------------------------------------------------------------------------------- /models/gradient_boost/training_params_quick_run.json: -------------------------------------------------------------------------------- 1 | { 2 | "task_type": "GPU", 3 | "nr_iterations": 10, 4 | "nr_samples_attack_category": 1000, 5 | "random_seed": 42 6 | } -------------------------------------------------------------------------------- /notebooks/03_ml-prototype/models/gradient_boost_model.cbm: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f9ff34d59ef5e2a1040b921b0b1d7565c63e4fd8d9bf4d080cf31a5e9ee13fc0 3 | size 14315968 4 | -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00000-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00000-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00001-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00001-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00002-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00002-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00003-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00003-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00004-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00004-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00005-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00005-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00007-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00007-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00008-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00008-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00009-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00009-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00010-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00010-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00011-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00011-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- 
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00013-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00013-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00014-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00014-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00015-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00015-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00016-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00016-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00017-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00017-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00019-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00019-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00020-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00020-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00021-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00021-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00022-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00022-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00023-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/.part-00023-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/_SUCCESS -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00000-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00000-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00001-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00001-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00002-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00002-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00003-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00003-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00004-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00004-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00005-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00005-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00007-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00007-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00008-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00008-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00009-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00009-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00010-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00010-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00011-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00011-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00013-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00013-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00014-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00014-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00015-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00015-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00016-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00016-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00017-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00017-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- 
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00019-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00019-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00020-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00020-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00021-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00021-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00022-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00022-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00023-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/data/part-00023-6cdbfe41-c900-40d4-ab1d-629d8c752c9f-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/metadata/.part-00000.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/metadata/_SUCCESS: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/metadata/_SUCCESS -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.ml.classification.GBTClassificationModel","timestamp":1568299022309,"sparkVersion":"2.4.4","uid":"GBTClassifier_0f5cdab6ac21","paramMap":{"seed":42,"maxDepth":5,"labelCol":"label_is_attack","stepSize":0.5,"featuresCol":"features","maxIter":20},"defaultParamMap":{"rawPredictionCol":"rawPrediction","minInstancesPerNode":1,"impurity":"gini","minInfoGain":0.0,"seed":-715221063584165447,"maxDepth":5,"labelCol":"label","featureSubsetStrategy":"all","subsamplingRate":1.0,"probabilityCol":"probability","maxMemoryInMB":256,"cacheNodeIds":false,"validationTol":0.01,"stepSize":0.1,"checkpointInterval":10,"maxBins":32,"lossType":"logistic","predictionCol":"prediction","featuresCol":"features","maxIter":20},"numFeatures":47,"numTrees":20} 2 | -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00000-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00000-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00001-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00001-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00002-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00002-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00003-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00003-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00004-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00004-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00005-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00005-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00006-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00006-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00007-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00007-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00008-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00008-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00009-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00009-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc 
-------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00010-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00010-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00011-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00011-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00012-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00012-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00013-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00013-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00014-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00014-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00015-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00015-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00016-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00016-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00017-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00017-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00018-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00018-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00019-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/.part-00019-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/_SUCCESS -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00000-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00000-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00001-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00001-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet -------------------------------------------------------------------------------- 
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00002-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00002-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00003-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00003-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00004-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00004-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00005-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00005-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00006-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00006-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00007-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00007-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00008-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00008-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00009-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00009-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00010-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00010-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00011-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00011-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00012-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00012-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00013-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00013-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00014-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00014-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00015-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00015-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00016-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00016-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00017-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00017-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00018-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00018-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00019-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/bestModel/treesMetadata/part-00019-6b4df0d7-60c8-4fa0-b6ed-2d08ae1dd849-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/estimator/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/estimator/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/estimator/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/estimator/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/estimator/metadata/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/estimator/metadata/part-00000:
--------------------------------------------------------------------------------
{"class":"org.apache.spark.ml.classification.GBTClassifier","timestamp":1568299022265,"sparkVersion":"2.4.4","uid":"GBTClassifier_0f5cdab6ac21","paramMap":{"seed":42,"labelCol":"label_is_attack","featuresCol":"features"},"defaultParamMap":{"rawPredictionCol":"rawPrediction","minInstancesPerNode":1,"impurity":"gini","minInfoGain":0.0,"seed":-715221063584165447,"maxDepth":5,"labelCol":"label","featureSubsetStrategy":"all","subsamplingRate":1.0,"probabilityCol":"probability","maxMemoryInMB":256,"cacheNodeIds":false,"validationTol":0.01,"stepSize":0.1,"checkpointInterval":10,"maxBins":32,"lossType":"logistic","predictionCol":"prediction","featuresCol":"features","maxIter":20}}
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/evaluator/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/evaluator/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/evaluator/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/evaluator/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/evaluator/metadata/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/evaluator/metadata/part-00000:
--------------------------------------------------------------------------------
{"class":"org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator","timestamp":1568299022228,"sparkVersion":"2.4.4","uid":"MulticlassClassificationEvaluator_2045e84d5752","paramMap":{"metricName":"weightedRecall","predictionCol":"prediction","labelCol":"label_is_attack"},"defaultParamMap":{"metricName":"f1","predictionCol":"prediction","labelCol":"label"}}
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/metadata/.part-00000.crc
--------------------------------------------------------------------------------
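
The two JSON metadata files above record the persisted estimator and its evaluation metric: a GBTClassifier with an explicit seed, label column and feature column, scored by weighted recall. A minimal PySpark sketch of that configuration (Spark 2.4-era API; everything not listed in the paramMap keeps its Spark default):

from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# explicit parameters taken from the persisted paramMap; the remaining
# settings (maxDepth=5, maxIter=20, stepSize=0.1, ...) are Spark defaults
gbt = GBTClassifier(seed=42,
                    labelCol='label_is_attack',
                    featuresCol='features')

evaluator = MulticlassClassificationEvaluator(metricName='weightedRecall',
                                              predictionCol='prediction',
                                              labelCol='label_is_attack')
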
/notebooks/04_ml-prototype-spark/models/gb-model/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/gb-model/metadata/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/gb-model/metadata/part-00000:
--------------------------------------------------------------------------------
{"class":"org.apache.spark.ml.tuning.CrossValidatorModel","timestamp":1568299022196,"sparkVersion":"2.4.4","uid":"CrossValidatorModel_4f73491e9469","paramMap":{"seed":880116102,"numFolds":3,"estimatorParamMaps":[[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"10","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"10","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.05","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"10","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"20","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"20","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.05","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"20","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"3","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"10","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"3","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"10","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"3","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.05","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"10","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"3","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"20","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"3","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"20","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"3","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.05","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"20","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"10","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"10","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.05","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"10","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"20","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.1","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"20","isJson":"true"}],[{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxDepth","value":"5","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"stepSize","value":"0.05","isJson":"true"},{"parent":"GBTClassifier_0f5cdab6ac21","name":"maxIter","value":"20","isJson":"true"}]]},"defaultParamMap":{"seed":880116102,"numFolds":3},"avgMetrics":[],"persistSubModels":false}
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/metadata/_SUCCESS
--------------------------------------------------------------------------------
{"class":"pyspark.ml.pipeline.PipelineModel","timestamp":1568299021142,"sparkVersion":"2.4.4","uid":"PipelineModel_aec8567ff127","paramMap":{"stageUids":["ValueCleaner_57f061a9e393","Imputer_3f8cf4b571a8","OneHotEncoderEstimator_f1dc6e50f52e","VectorAssembler_ef6b7bf933ee","BinaryLabelMaker_3b174e5e0c29"],"language":"Python"},"defaultParamMap":{}} 2 | -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/.part-00000.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/_SUCCESS -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"__main__.ValueCleaner","timestamp":1568299021262,"sparkVersion":"2.4.4","uid":"ValueCleaner_57f061a9e393","paramMap":{"inputCols":["flow_duration","flow_byts_s","flow_pkts_s","flow_iat_min","fwd_iat_tot","fwd_iat_min","init_fwd_win_byts","init_bwd_win_byts","fwd_seg_size_min"],"outputCols":["flow_duration_clean","flow_byts_s_clean","flow_pkts_s_clean","flow_iat_min_clean","fwd_iat_tot_clean","fwd_iat_min_clean","init_fwd_win_byts_clean","init_bwd_win_byts_clean","fwd_seg_size_min_clean"]},"defaultParamMap":{}} 2 | -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/.part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/.part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/_SUCCESS: -------------------------------------------------------------------------------- 
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/0_ValueCleaner_57f061a9e393/metadata/part-00000:
--------------------------------------------------------------------------------
{"class":"__main__.ValueCleaner","timestamp":1568299021262,"sparkVersion":"2.4.4","uid":"ValueCleaner_57f061a9e393","paramMap":{"inputCols":["flow_duration","flow_byts_s","flow_pkts_s","flow_iat_min","fwd_iat_tot","fwd_iat_min","init_fwd_win_byts","init_bwd_win_byts","fwd_seg_size_min"],"outputCols":["flow_duration_clean","flow_byts_s_clean","flow_pkts_s_clean","flow_iat_min_clean","fwd_iat_tot_clean","fwd_iat_min_clean","init_fwd_win_byts_clean","init_bwd_win_byts_clean","fwd_seg_size_min_clean"]},"defaultParamMap":{}}
--------------------------------------------------------------------------------
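
The ValueCleaner stage maps nine raw flow features to '<name>_clean' columns. Its implementation lives in the notebook, not in this dump; judging by its name and by the Imputer stage that follows, it plausibly nulls out invalid readings so they can be imputed. A hypothetical sketch of such a transformer (Param plumbing and persistence support via DefaultParamsWritable are omitted for brevity; the notebook's actual class may differ):

from pyspark.ml import Transformer
import pyspark.sql.functions as F


class ValueCleaner(Transformer):
    """Copies each input column to an output column, replacing +/-Infinity with null."""

    def __init__(self, inputCols, outputCols):
        super(ValueCleaner, self).__init__()
        self.input_cols = inputCols
        self.output_cols = outputCols

    def _transform(self, df):
        for in_col, out_col in zip(self.input_cols, self.output_cols):
            df = df.withColumn(out_col,
                               F.when(F.col(in_col).isin(float('inf'), float('-inf')), None)
                               .otherwise(F.col(in_col)))
        return df


# column lists taken from the persisted paramMap above
cols = ['flow_duration', 'flow_byts_s', 'flow_pkts_s', 'flow_iat_min',
        'fwd_iat_tot', 'fwd_iat_min', 'init_fwd_win_byts',
        'init_bwd_win_byts', 'fwd_seg_size_min']
value_cleaner = ValueCleaner(inputCols=cols, outputCols=[c + '_clean' for c in cols])
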
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/.part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/.part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/data/part-00000-d346f402-14f7-495c-adb5-386e07999ead-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/metadata/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/1_Imputer_3f8cf4b571a8/metadata/part-00000:
--------------------------------------------------------------------------------
{"class":"org.apache.spark.ml.feature.ImputerModel","timestamp":1568299021353,"sparkVersion":"2.4.4","uid":"Imputer_3f8cf4b571a8","paramMap":{"inputCols":["flow_duration_clean","flow_byts_s_clean","flow_pkts_s_clean","flow_iat_min_clean","fwd_iat_tot_clean","fwd_iat_min_clean","init_fwd_win_byts_clean","init_bwd_win_byts_clean","fwd_seg_size_min_clean"],"outputCols":["flow_duration_imputed","flow_byts_s_imputed","flow_pkts_s_imputed","flow_iat_min_imputed","fwd_iat_tot_imputed","fwd_iat_min_imputed","init_fwd_win_byts_imputed","init_bwd_win_byts_imputed","fwd_seg_size_min_imputed"]},"defaultParamMap":{"missingValue":"NaN","strategy":"mean"}}
--------------------------------------------------------------------------------
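
The ImputerModel metadata shows that only inputCols and outputCols were set explicitly; the imputation strategy ('mean') and missing-value marker (NaN) appear in defaultParamMap because the defaults were kept. A sketch of the corresponding stage, reusing the cols list from the previous snippet:

from pyspark.ml.feature import Imputer

# mean-imputes each '<col>_clean' column into a '<col>_imputed' column
imputer = Imputer(inputCols=[c + '_clean' for c in cols],
                  outputCols=[c + '_imputed' for c in cols])
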
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/data/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/data/.part-00000-c909fe56-90d1-4202-a5f4-69907defba9a-c000.snappy.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/data/.part-00000-c909fe56-90d1-4202-a5f4-69907defba9a-c000.snappy.parquet.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/data/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/data/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/data/part-00000-c909fe56-90d1-4202-a5f4-69907defba9a-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/data/part-00000-c909fe56-90d1-4202-a5f4-69907defba9a-c000.snappy.parquet
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/metadata/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/2_OneHotEncoderEstimator_f1dc6e50f52e/metadata/part-00000:
--------------------------------------------------------------------------------
{"class":"org.apache.spark.ml.feature.OneHotEncoderModel","timestamp":1568299021798,"sparkVersion":"2.4.4","uid":"OneHotEncoderEstimator_f1dc6e50f52e","paramMap":{"inputCols":["protocol"],"outputCols":["protocol_cat"]},"defaultParamMap":{"dropLast":true,"handleInvalid":"error"}}
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/3_VectorAssembler_ef6b7bf933ee/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/3_VectorAssembler_ef6b7bf933ee/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/3_VectorAssembler_ef6b7bf933ee/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/3_VectorAssembler_ef6b7bf933ee/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/3_VectorAssembler_ef6b7bf933ee/metadata/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/3_VectorAssembler_ef6b7bf933ee/metadata/part-00000:
--------------------------------------------------------------------------------
{"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1568299021967,"sparkVersion":"2.4.4","uid":"VectorAssembler_ef6b7bf933ee","paramMap":{"inputCols":["tot_fwd_pkts","tot_bwd_pkts","totlen_fwd_pkts","totlen_bwd_pkts","fwd_pkt_len_mean","fwd_pkt_len_std","bwd_pkt_len_mean","flow_iat_std","bwd_iat_tot","bwd_iat_min","fwd_psh_flags","fwd_urg_flags","bwd_pkts_s","fin_flag_cnt","rst_flag_cnt","psh_flag_cnt","ack_flag_cnt","urg_flag_cnt","down_up_ratio","active_mean","idle_mean","protocol_cat","flow_duration_imputed","flow_byts_s_imputed","flow_pkts_s_imputed","flow_iat_min_imputed","fwd_iat_tot_imputed","fwd_iat_min_imputed","init_fwd_win_byts_imputed","init_bwd_win_byts_imputed","fwd_seg_size_min_imputed"],"outputCol":"features"},"defaultParamMap":{"handleInvalid":"error","outputCol":"VectorAssembler_ef6b7bf933ee__output"}}
--------------------------------------------------------------------------------
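
The next two stages one-hot encode the protocol column and assemble the numeric and encoded columns into the single 'features' vector consumed by the GBTClassifier. A sketch matching the recorded inputCols/outputCols (OneHotEncoderEstimator is the Spark 2.x name; Spark 3 renamed it OneHotEncoder):

from pyspark.ml.feature import OneHotEncoderEstimator, VectorAssembler

encoder = OneHotEncoderEstimator(inputCols=['protocol'], outputCols=['protocol_cat'])

assembler = VectorAssembler(
    inputCols=['tot_fwd_pkts', 'tot_bwd_pkts', 'totlen_fwd_pkts', 'totlen_bwd_pkts',
               'fwd_pkt_len_mean', 'fwd_pkt_len_std', 'bwd_pkt_len_mean',
               'flow_iat_std', 'bwd_iat_tot', 'bwd_iat_min', 'fwd_psh_flags',
               'fwd_urg_flags', 'bwd_pkts_s', 'fin_flag_cnt', 'rst_flag_cnt',
               'psh_flag_cnt', 'ack_flag_cnt', 'urg_flag_cnt', 'down_up_ratio',
               'active_mean', 'idle_mean', 'protocol_cat',
               'flow_duration_imputed', 'flow_byts_s_imputed', 'flow_pkts_s_imputed',
               'flow_iat_min_imputed', 'fwd_iat_tot_imputed', 'fwd_iat_min_imputed',
               'init_fwd_win_byts_imputed', 'init_bwd_win_byts_imputed',
               'fwd_seg_size_min_imputed'],
    outputCol='features')
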
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/4_BinaryLabelMaker_3b174e5e0c29/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/4_BinaryLabelMaker_3b174e5e0c29/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/4_BinaryLabelMaker_3b174e5e0c29/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/4_BinaryLabelMaker_3b174e5e0c29/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/4_BinaryLabelMaker_3b174e5e0c29/metadata/_SUCCESS
--------------------------------------------------------------------------------
/notebooks/04_ml-prototype-spark/models/pipeline-model/stages/4_BinaryLabelMaker_3b174e5e0c29/metadata/part-00000:
--------------------------------------------------------------------------------
{"class":"__main__.BinaryLabelMaker","timestamp":1568299022005,"sparkVersion":"2.4.4","uid":"BinaryLabelMaker_3b174e5e0c29","paramMap":{"inputCols":["label"],"outputCols":["label_is_attack"],"classLabel":"Benign"},"defaultParamMap":{}}
--------------------------------------------------------------------------------
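
BinaryLabelMaker, the final stage, derives the binary target from the multi-class label, with the configured classLabel ('Benign') as the negative class. A hypothetical sketch of the stage and of how the five stages would be assembled (the custom class is a guess at the notebook's implementation; value_cleaner, imputer, encoder and assembler come from the sketches above):

from pyspark.ml import Pipeline, Transformer
import pyspark.sql.functions as F


class BinaryLabelMaker(Transformer):
    """Derives 0/1 output columns: 0 for the configured benign class, 1 otherwise."""

    def __init__(self, inputCols, outputCols, classLabel):
        super(BinaryLabelMaker, self).__init__()
        self.input_cols, self.output_cols, self.class_label = inputCols, outputCols, classLabel

    def _transform(self, df):
        for in_col, out_col in zip(self.input_cols, self.output_cols):
            df = df.withColumn(out_col,
                               F.when(F.col(in_col) == self.class_label, 0).otherwise(1))
        return df


# the five stages recorded in the PipelineModel metadata, in order
pipeline = Pipeline(stages=[value_cleaner, imputer, encoder, assembler,
                            BinaryLabelMaker(inputCols=['label'],
                                             outputCols=['label_is_attack'],
                                             classLabel='Benign')])
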
/notebooks/05_anomaly_detection/img/denoising_autoencoder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/05_anomaly_detection/img/denoising_autoencoder.png
--------------------------------------------------------------------------------
/notebooks/05_anomaly_detection/img/stacked_autoencoder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/05_anomaly_detection/img/stacked_autoencoder.png
--------------------------------------------------------------------------------
/notebooks/05_anomaly_detection/img/undercomplete_autoencoder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/notebooks/05_anomaly_detection/img/undercomplete_autoencoder.png
--------------------------------------------------------------------------------
/notebooks/05_anomaly_detection/models/denoising_autoencoder_model.h5:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:13f9ca921d4d76f3a745450fa844e22c2d5716440efcc22c2170f3bc0f21f179
size 13411104
--------------------------------------------------------------------------------
/notebooks/05_anomaly_detection/models/simple_autoencoder_model.h5:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:18216e715acf520b92ba511d4a27f37b90377887540a1d2b1217d46b41d7d93a
size 70464
--------------------------------------------------------------------------------
/notebooks/05_anomaly_detection/models/stacked_autoencoder_model.h5:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:db4d61f4d8ee4e9d43db255afcac4c2443aea48268ea9ea867783460cdfa065d
size 204328
--------------------------------------------------------------------------------
/notebooks/05_anomaly_detection/notebook_utils.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, average_precision_score, roc_auc_score, precision_recall_curve, \
    roc_curve
from IPython.display import display
from ml_ids.visualization import plot_confusion_matrix


def predict(model, X, y):
    # per-sample reconstruction error (MSE) of the autoencoder
    preds = model.predict(X, batch_size=8196)
    mse = np.mean(np.power(X - preds, 2), axis=1)

    return pd.DataFrame({'y_true': y, 'rec_error': mse})


def evaluate_pr_roc(pred):
    pr_auc = average_precision_score(pred.y_true, pred.rec_error)
    roc_auc = roc_auc_score(pred.y_true, pred.rec_error)
    return pr_auc, roc_auc


def plot_evaluation_curves(pred):
    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 8))

    precisions, recalls, thresholds = precision_recall_curve(pred.y_true, pred.rec_error)
    fpr, tpr, _ = roc_curve(pred.y_true, pred.rec_error)
    pr_auc, roc_auc = evaluate_pr_roc(pred)

    # plot precision / recall curve
    ax1.plot(recalls, precisions, label='auc={}'.format(pr_auc))
    ax1.set_title('Precision / Recall Curve')
    ax1.set_xlabel('Recall')
    ax1.set_ylabel('Precision')
    ax1.legend(loc='lower right')

    # plot ROC curve
    ax2.plot(fpr, tpr, label='auc={}'.format(roc_auc))
    ax2.set_title('ROC Curve')
    ax2.set_ylabel('True Positive Rate')
    ax2.set_xlabel('False Positive Rate')
    ax2.legend(loc='lower right')


def plot_pr_threshold_curves(pred, pr_plot_lim=(0, 1)):
    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 8))

    precisions, recalls, thresholds = precision_recall_curve(pred.y_true, pred.rec_error)

    # plot precision / recall for different thresholds
    ax1.plot(thresholds, precisions[:-1], label='Precision')
    ax1.plot(thresholds, recalls[:-1], label='Recall')
    ax1.set_title('Precision / Recall of different thresholds')
    ax1.set_xlabel('Threshold')
    ax1.set_ylabel('Precision / Recall')
    ax1.legend(loc='lower right')

    # same curves, zoomed in to the threshold range given by pr_plot_lim
    ax2.plot(thresholds, precisions[:-1], label='Precision')
    ax2.plot(thresholds, recalls[:-1], label='Recall')
    ax2.set_title('Precision / Recall of different thresholds')
    ax2.set_xlabel('Threshold')
    ax2.set_ylabel('Precision / Recall')
    ax2.set_xlim(pr_plot_lim)
    ax2.legend(loc='lower right')


def best_precision_for_target_recall(pred, target_recall):
    # note: despite the name, this returns the decision threshold at the point
    # where recall first drops below target_recall, not a precision value
    precisions, recalls, thresholds = precision_recall_curve(pred.y_true, pred.rec_error)
    return thresholds[np.argmin(recalls >= target_recall)]


def get_misclassifications(y, pred_binary):
    misclassifications = y[y.label_is_attack != pred_binary]

    mc_df = pd.merge(pd.DataFrame({'misclassified': misclassifications.label.value_counts()}),
                     pd.DataFrame({'total': y.label.value_counts()}),
                     how='left', left_index=True, right_index=True)
    mc_df['percent_misclassified'] = mc_df.apply(lambda x: x[0] / x[1], axis=1)
    return mc_df.sort_values('percent_misclassified', ascending=False)


def print_performance(y, pred, threshold):
    pred_binary = (pred.rec_error >= threshold).astype('int')

    print('Classification Report:')
    print('======================')
    print(classification_report(pred.y_true, pred_binary))

    print('Confusion Matrix:')
    print('=================')
    plot_confusion_matrix(pred.y_true, pred_binary, np.array(['Benign', 'Attack']), size=(5, 5))
    plt.show()

    print('Misclassifications by attack category:')
    print('======================================')
    mc_df = get_misclassifications(y, pred_binary)
    display(mc_df)


def filter_benign(X, y):
    return X[y.label_is_attack == 0]
--------------------------------------------------------------------------------
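
A sketch of how these helpers compose in the anomaly-detection workflow: score reconstruction error, inspect the curves, pick a decision threshold for a target recall, then report performance. The autoencoder, X_val and y_val names are assumed to come from the surrounding notebook:

# y_val is assumed to be a DataFrame with 'label' and 'label_is_attack' columns
pred = predict(autoencoder, X_val, y_val.label_is_attack.values)

pr_auc, roc_auc = evaluate_pr_roc(pred)
plot_evaluation_curves(pred)

threshold = best_precision_for_target_recall(pred, target_recall=0.9)
print_performance(y_val, pred, threshold)
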
/notebooks/06_dl_classifier/models/c0cb0656-558f-4311-b138-9b91ab4d1fe6.h5:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:8efc348b48af452153dec068d1367cec784ffc2930049df4eaf371d10c0d1caa
size 4651784
--------------------------------------------------------------------------------
/notebooks/06_dl_classifier/models/model_class_weight.h5:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:60e8392296780d912e8bff335bdadb81deae5b035925d2282e963d45def4ce95
size 4231072
--------------------------------------------------------------------------------
/notebooks/06_dl_classifier/models/model_no_class_weights.h5:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:e0d9dff11e5600e74974a8e6657be10f7af7d1abe7b66ea2308d9d6eea4d29eb
size 4231072
--------------------------------------------------------------------------------
/notebooks/06_dl_classifier/models/opt_model.h5:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:d941b094728d0e970231c2f40440da9bfe2c9c5f9898954064954483d210857a
size 4655880
--------------------------------------------------------------------------------
/notebooks/06_dl_classifier/notebook_utils.py:
--------------------------------------------------------------------------------
import numpy as np
import gc
from ml_ids.model_selection import split_x_y, train_val_test_split
from ml_ids.transform.sampling import upsample_minority_classes, downsample
from ml_ids.transform.preprocessing import create_pipeline
from collections import Counter


def transform_data(dataset,
                   attack_samples,
                   imputer_strategy,
                   scaler,
                   benign_samples=None,
                   random_state=None):

    cols_to_impute = dataset.columns[dataset.isna().any()].tolist()

    train_data, val_data, test_data = train_val_test_split(dataset,
                                                           val_size=0.1,
                                                           test_size=0.1,
                                                           stratify_col='label_cat',
                                                           random_state=random_state)

    if benign_samples:
        train_data = downsample(train_data, default_nr_samples=benign_samples, random_state=random_state)

    X_train_raw, y_train = split_x_y(train_data)
    X_val_raw, y_val = split_x_y(val_data)
    X_test_raw, y_test = split_x_y(test_data)

    print('Samples:')
    print('========')
    print('Training: {}'.format(X_train_raw.shape))
    print('Val: {}'.format(X_val_raw.shape))
    print('Test: {}'.format(X_test_raw.shape))

    print('\nTraining labels:')
    print('================')
    print(y_train.label.value_counts())
    print('\nValidation labels:')
    print('==================')
    print(y_val.label.value_counts())
    print('\nTest labels:')
    print('============')
    print(y_test.label.value_counts())

    del train_data, val_data, test_data
    gc.collect()

    pipeline, get_col_names = create_pipeline(X_train_raw,
                                              imputer_strategy=imputer_strategy,
                                              imputer_cols=cols_to_impute,
                                              scaler=scaler)

    X_train = pipeline.fit_transform(X_train_raw)
    X_val = pipeline.transform(X_val_raw)
    X_test = pipeline.transform(X_test_raw)

    column_names = get_col_names()

    print('Samples:')
    print('========')
    print('Training: {}'.format(X_train.shape))
    print('Val: {}'.format(X_val.shape))
    print('Test: {}'.format(X_test.shape))

    print('\nMissing values:')
    print('===============')
    print('Training: {}'.format(np.count_nonzero(np.isnan(X_train))))
    print('Val: {}'.format(np.count_nonzero(np.isnan(X_val))))
    print('Test: {}'.format(np.count_nonzero(np.isnan(X_test))))

    print('\nScaling:')
    print('========')
    print('Training: min={}, max={}'.format(np.min(X_train), np.max(X_train)))
    print('Val: min={}, max={}'.format(np.min(X_val), np.max(X_val)))
    print('Test: min={}, max={}'.format(np.min(X_test), np.max(X_test)))

    X_train, y_train = upsample_minority_classes(X_train,
                                                 y_train,
                                                 min_samples=attack_samples,
                                                 random_state=random_state)

    print('Samples:')
    print('========')
    print('Training: {}'.format(X_train.shape))

    print('\nTraining labels:')
    print('================')
    print(Counter(y_train))

    return X_train, y_train, X_val, y_val, X_test, y_test, column_names
--------------------------------------------------------------------------------
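
transform_data bundles the whole preparation sequence: stratified train/val/test split, optional downsampling of benign traffic, imputation and scaling fitted on the training split only, and upsampling of minority attack classes. A sketch of a typical call; dataset is assumed to be the DataFrame returned by ml_ids.data.dataset.load_dataset, and the sample counts are purely illustrative:

from sklearn.preprocessing import MinMaxScaler

X_train, y_train, X_val, y_val, X_test, y_test, cols = transform_data(
    dataset,
    attack_samples=100000,    # upsample each attack class to at least this many rows
    imputer_strategy='mean',
    scaler=MinMaxScaler,      # the scaler class, instantiated inside create_pipeline
    benign_samples=1000000,   # optional downsampling of the benign majority
    random_state=42)
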
/notebooks/07_binary_classifier_comparison/models/gb_835066e8-2427-48ca-a521-67195008cb91.catboost:
--------------------------------------------------------------------------------
version https://git-lfs.github.com/spec/v1
oid sha256:ceccc696d2c5eae0d550425f772221088e7a66b26a626461642e14c2b42099ce
size 31179384
--------------------------------------------------------------------------------
/notebooks/07_binary_classifier_comparison/notebook_utils.py:
--------------------------------------------------------------------------------
import numpy as np
import gc
from ml_ids.model_selection import split_x_y, train_val_test_split
from ml_ids.transform.sampling import upsample_minority_classes, downsample
from ml_ids.transform.preprocessing import create_pipeline
from collections import Counter


def get_best_model_path(trials, model_path_var='model_path'):
    return trials.results[np.argmin(trials.losses())][model_path_var]


def print_trial_results(trials, best_run, model_path_var='model_path'):
    best_model_path = get_best_model_path(trials, model_path_var)

    print('Best validation score: {}'.format(-np.min(trials.losses())))
    print('Best model path: {}\n'.format(best_model_path))
    print('Best model parameters:')
    print('======================')
    print(best_run)


def transform_data(dataset,
                   attack_samples,
                   imputer_strategy,
                   scaler,
                   benign_samples=None,
                   random_state=None):

    cols_to_impute = dataset.columns[dataset.isna().any()].tolist()

    train_data, val_data, test_data = train_val_test_split(dataset,
                                                           val_size=0.1,
                                                           test_size=0.1,
                                                           stratify_col='label_cat',
                                                           random_state=random_state)

    if benign_samples:
        train_data = downsample(train_data, default_nr_samples=benign_samples, random_state=random_state)

    X_train_raw, y_train = split_x_y(train_data)
    X_val_raw, y_val = split_x_y(val_data)
    X_test_raw, y_test = split_x_y(test_data)

    print('Samples:')
    print('========')
    print('Training: {}'.format(X_train_raw.shape))
    print('Val: {}'.format(X_val_raw.shape))
    print('Test: {}'.format(X_test_raw.shape))

    print('\nTraining labels:')
    print('================')
    print(y_train.label.value_counts())
    print('\nValidation labels:')
    print('==================')
    print(y_val.label.value_counts())
    print('\nTest labels:')
    print('============')
    print(y_test.label.value_counts())

    del train_data, val_data, test_data
    gc.collect()

    pipeline, get_col_names = create_pipeline(X_train_raw,
                                              imputer_strategy=imputer_strategy,
                                              imputer_cols=cols_to_impute,
                                              scaler=scaler)

    X_train = pipeline.fit_transform(X_train_raw)
    X_val = pipeline.transform(X_val_raw)
    X_test = pipeline.transform(X_test_raw)

    column_names = get_col_names()

    print('Samples:')
    print('========')
    print('Training: {}'.format(X_train.shape))
    print('Val: {}'.format(X_val.shape))
    print('Test: {}'.format(X_test.shape))

    print('\nMissing values:')
    print('===============')
    print('Training: {}'.format(np.count_nonzero(np.isnan(X_train))))
    print('Val: {}'.format(np.count_nonzero(np.isnan(X_val))))
    print('Test: {}'.format(np.count_nonzero(np.isnan(X_test))))

    print('\nScaling:')
    print('========')
    print('Training: min={}, max={}'.format(np.min(X_train), np.max(X_train)))
    print('Val: min={}, max={}'.format(np.min(X_val), np.max(X_val)))
    print('Test: min={}, max={}'.format(np.min(X_test), np.max(X_test)))

    X_train, y_train = upsample_minority_classes(X_train,
                                                 y_train,
                                                 min_samples=attack_samples,
                                                 random_state=random_state)

    print('Samples:')
    print('========')
    print('Training: {}'.format(X_train.shape))

    print('\nTraining labels:')
    print('================')
    print(Counter(y_train))

    return X_train, y_train, X_val, y_val, X_test, y_test, column_names
--------------------------------------------------------------------------------
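
get_best_model_path and print_trial_results assume a hyperopt search whose objective function stored the fitted model's path under 'model_path' in each trial result. A sketch of the intended usage; objective and search_space are assumed to be defined in the surrounding notebook:

from hyperopt import Trials, fmin, tpe

trials = Trials()
# each objective result is expected to contain 'loss', 'status' and 'model_path'
best_run = fmin(objective, search_space, algo=tpe.suggest, max_evals=50, trials=trials)

print_trial_results(trials, best_run)
best_model_path = get_best_model_path(trials)
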
/project-proposal.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/project-proposal.pdf
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[aliases]
test=pytest

[mypy-numpy.*]
ignore_missing_imports = True

[mypy-pandas.*]
ignore_missing_imports = True

[mypy-matplotlib.*]
ignore_missing_imports = True

[mypy-IPython.*]
ignore_missing_imports = True

[mypy-sklearn.*]
ignore_missing_imports = True

[mypy-seaborn.*]
ignore_missing_imports = True

[mypy-tensorflow.*]
ignore_missing_imports = True

[mypy-mlflow.*]
ignore_missing_imports = True

[mypy-catboost.*]
ignore_missing_imports = True

[mypy-imblearn.*]
ignore_missing_imports = True
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from distutils.core import setup

setup(
    name='ml-ids',
    version='0.1',
    description='Machine learning based Intrusion Detection System',
    long_description='Machine learning based Intrusion Detection System',
    classifiers=[
        'Programming Language :: Python :: 3',
    ],
    url='https://github.com/cstub/ml-ids',
    author='cstub',
    author_email='stumpf.christoph@gmail.com',
    license='MIT',
    packages=['ml_ids'],
    install_requires=[
    ],
    setup_requires=['pytest-runner'],
    tests_require=['pytest']
)
--------------------------------------------------------------------------------
/tests/data/test_dataset.py:
--------------------------------------------------------------------------------
import pytest
import pandas as pd
import numpy as np
import os
from ml_ids import conf
from ml_ids.data.dataset import load_dataset


@pytest.fixture
def val_data():
    validation_data_path = os.path.join(conf.TEST_DATA_DIR, 'validation.csv')
    return pd.read_csv(validation_data_path)


def inf_value_count(df):
    return df[(df == np.inf) | (df == -np.inf)].count().sum()


def neg_value_count(df):
    numeric_cols = df.select_dtypes(include=[np.number]).columns.values
    df_num = df[numeric_cols]
    return df_num[df_num < 0].count().sum()


def nan_value_count(df):
    return df.isna().sum().sum()


def negative_value_columns(df):
    numeric_cols = df.select_dtypes(include=[np.number]).columns.values
    return [c for c in numeric_cols if df[df[c] < 0][c].count() > 0]


def test_loaded_dataset_must_not_contain_inf_values():
    df = load_dataset(conf.TEST_DATA_DIR)

    assert inf_value_count(df) == 0


def test_loaded_dataset_must_not_contain_negative_values():
    df = load_dataset(conf.TEST_DATA_DIR)

    assert neg_value_count(df) == 0


def test_loaded_dataset_must_not_contain_negative_values_except_excluded_cols():
    df = load_dataset(conf.TEST_DATA_DIR, preserve_neg_value_cols=['init_fwd_win_byts', 'init_bwd_win_byts'])

    assert neg_value_count(df) != 0
    assert set(negative_value_columns(df)) == {'init_bwd_win_byts', 'init_fwd_win_byts'}


def test_loaded_dataset_must_contain_label_category():
    df = load_dataset(conf.TEST_DATA_DIR)

    assert len(df.label_cat.value_counts()) == len(df.label.value_counts())


def test_loaded_dataset_must_contain_label_is_attack():
    df = load_dataset(conf.TEST_DATA_DIR)

    all_sample_count = len(df)
    benign_sample_count = len(df[df.label == 'Benign'])
    attack_sample_count = all_sample_count - benign_sample_count

    assert len(df[df.label_is_attack == 0]) == benign_sample_count
    assert len(df[df.label_is_attack == 1]) == attack_sample_count


def test_loaded_dataset_must_replace_invalid_value_with_nan(val_data):
    df = load_dataset(conf.TEST_DATA_DIR)

    inf_value_c = inf_value_count(val_data)
    neg_value_c = neg_value_count(val_data)

    assert (inf_value_c + neg_value_c) == nan_value_count(df)


def test_loaded_dataset_must_contain_only_specified_columns():
    df = load_dataset(conf.TEST_DATA_DIR, use_cols=['dst_port'])

    assert df.columns == ['dst_port']


def test_loaded_dataset_must_omit_specified_columns():
    df = load_dataset(conf.TEST_DATA_DIR, omit_cols=['dst_port'])

    assert 'dst_port' not in df.columns
--------------------------------------------------------------------------------
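
These tests pin down the loader's contract: infinite and (non-preserved) negative values arrive as NaN, and categorical plus binary targets are derived from the label column. A short usage sketch, with the path and column names illustrative:

from ml_ids import conf
from ml_ids.data.dataset import load_dataset

df = load_dataset(conf.TEST_DATA_DIR,
                  omit_cols=['timestamp'],
                  preserve_neg_value_cols=['init_fwd_win_byts', 'init_bwd_win_byts'])
# invalid readings are now NaN and the binary target is df.label_is_attack
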
/tests/transform/test_preprocessing.py:
--------------------------------------------------------------------------------
import pytest
import numpy as np
from numpy.testing import assert_array_equal
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler

from ml_ids import conf
from ml_ids.data.dataset import load_dataset
from ml_ids.model_selection import split_x_y
from ml_ids.transform.preprocessing import create_pipeline


@pytest.fixture
def feature_df():
    df = load_dataset(conf.TEST_DATA_DIR, omit_cols=['timestamp'])
    X, _ = split_x_y(df)
    return X


def nan_value_count(x):
    return np.count_nonzero(np.isnan(x))


def test_pipeline_must_impute_all_missing_values(feature_df):
    pipeline, _ = create_pipeline(feature_df,
                                  imputer_strategy='mean',
                                  scaler=FunctionTransformer,
                                  scaler_args={'validate': False})
    transformed = pipeline.fit_transform(feature_df)

    assert nan_value_count(feature_df.values) != 0
    assert nan_value_count(transformed) == 0


def test_pipeline_must_impute_selected_columns_only(feature_df):
    pipeline, _ = create_pipeline(feature_df,
                                  imputer_strategy='mean',
                                  imputer_cols=['flow_duration', 'flow_pkts_s'],
                                  scaler=FunctionTransformer,
                                  scaler_args={'validate': False})

    missing_vals_selected_columns = \
        nan_value_count(feature_df.flow_duration.values) + nan_value_count(feature_df.flow_pkts_s.values)

    transformed = pipeline.fit_transform(feature_df)

    assert nan_value_count(transformed) == (nan_value_count(feature_df.values) - missing_vals_selected_columns)


def test_pipeline_must_not_impute_values_if_imputer_strategy_none(feature_df):
    pipeline, get_col_names = create_pipeline(feature_df,
                                              imputer_strategy=None,
                                              scaler=FunctionTransformer,
                                              scaler_args={'validate': False})

    transformed = pipeline.fit_transform(feature_df)

    assert nan_value_count(feature_df.values) == nan_value_count(transformed)
    assert len(feature_df.columns) == len(get_col_names())


def test_pipeline_must_reorder_columns(feature_df):
    pipeline, get_col_names = create_pipeline(feature_df,
                                              imputer_strategy='mean',
                                              imputer_cols=['flow_duration', 'flow_pkts_s'],
                                              scaler=FunctionTransformer,
                                              scaler_args={'validate': False})

    _ = pipeline.fit_transform(feature_df)
    column_names = get_col_names()

    assert len(feature_df.columns) == len(column_names)
    assert_array_equal(column_names[:2], ['flow_duration', 'flow_pkts_s'])


def test_pipeline_must_impute_all_missing_values_with_mean(feature_df):
    pipeline, get_col_names = create_pipeline(feature_df,
                                              imputer_strategy='mean',
                                              scaler=FunctionTransformer,
                                              scaler_args={'validate': False})
    transformed = pipeline.fit_transform(feature_df)

    col_idx = np.where(get_col_names() == 'flow_duration')[0]
    nan_idx = np.where(np.isnan(feature_df.flow_duration.values))[0]

    assert len(nan_idx) == 10
    assert np.unique(transformed[nan_idx, col_idx]) == feature_df.flow_duration.mean()


def test_pipeline_must_impute_all_missing_values_with_median(feature_df):
    pipeline, get_col_names = create_pipeline(feature_df,
                                              imputer_strategy='median',
                                              scaler=FunctionTransformer,
                                              scaler_args={'validate': False})
    transformed = pipeline.fit_transform(feature_df)

    col_idx = np.where(get_col_names() == 'flow_duration')[0]
    nan_idx = np.where(np.isnan(feature_df.flow_duration.values))[0]

    assert len(nan_idx) == 10
    assert np.unique(transformed[nan_idx, col_idx]) == feature_df.flow_duration.median()


def test_pipeline_must_scale_all_values(feature_df):
    pipeline, _ = create_pipeline(feature_df, scaler=MinMaxScaler)
    transformed = pipeline.fit_transform(feature_df)

    assert np.min(transformed) == 0
    assert np.max(transformed) == 1


def test_pipeline_must_one_hot_encode_categorical_values(feature_df):
    nr_categories = 3
    pipeline, _ = create_pipeline(feature_df, cat_cols=['protocol'])
    transformed = pipeline.fit_transform(feature_df)

    one_hot_encoded = transformed[:, -nr_categories:]

    print(np.unique(one_hot_encoded))

    assert transformed.shape[1] == feature_df.shape[1] + (nr_categories - 1)
    assert_array_equal(np.unique(one_hot_encoded), [0., 1.])
--------------------------------------------------------------------------------
/upload.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cstub/ml-ids/b0e0b117adf635e30f357170342faa25dadd4063/upload.py
--------------------------------------------------------------------------------