├── .gitignore ├── .idea ├── .gitignore ├── CNS2022_Code.iml ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── Experiments_code └── rf_feature_importance.py ├── Labelling ├── CICIDS2017_labelling_fixed_CICFlowMeter.ipynb ├── CICIDS2017_original_version_labelling.ipynb ├── CICIDS2018_labelling_fixed_CICFlowMeter.ipynb └── CICIDS2018_original_version_labelling.ipynb └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/macos,pycharm,python 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=macos,pycharm,python 4 | 5 | ### macOS ### 6 | # General 7 | .DS_Store 8 | .AppleDouble 9 | .LSOverride 10 | 11 | # Icon must end with two \r 12 | Icon 13 | 14 | 15 | # Thumbnails 16 | ._* 17 | 18 | # Files that might appear in the root of a volume 19 | .DocumentRevisions-V100 20 | .fseventsd 21 | .Spotlight-V100 22 | .TemporaryItems 23 | .Trashes 24 | .VolumeIcon.icns 25 | .com.apple.timemachine.donotpresent 26 | 27 | # Directories potentially created on remote AFP share 28 | .AppleDB 29 | .AppleDesktop 30 | Network Trash Folder 31 | Temporary Items 32 | .apdisk 33 | 34 | ### PyCharm ### 35 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 36 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 37 | 38 | # User-specific stuff 39 | .idea/**/workspace.xml 40 | .idea/**/tasks.xml 41 | .idea/**/usage.statistics.xml 42 | .idea/**/dictionaries 43 | .idea/**/shelf 44 | 45 | # AWS User-specific 46 | .idea/**/aws.xml 47 | 48 | # Generated files 49 | .idea/**/contentModel.xml 50 | 51 | # Sensitive or high-churn files 52 | .idea/**/dataSources/ 53 | .idea/**/dataSources.ids 54 | .idea/**/dataSources.local.xml 55 | .idea/**/sqlDataSources.xml 56 | .idea/**/dynamic.xml 57 | .idea/**/uiDesigner.xml 58 | .idea/**/dbnavigator.xml 59 | 60 | # Gradle 61 | .idea/**/gradle.xml 62 | .idea/**/libraries 63 | 64 | # Gradle and Maven with auto-import 65 | # When using Gradle or Maven with auto-import, you should exclude module files, 66 | # since they will be recreated, and may cause churn. Uncomment if using 67 | # auto-import. 68 | # .idea/artifacts 69 | # .idea/compiler.xml 70 | # .idea/jarRepositories.xml 71 | # .idea/modules.xml 72 | # .idea/*.iml 73 | # .idea/modules 74 | # *.iml 75 | # *.ipr 76 | 77 | # CMake 78 | cmake-build-*/ 79 | 80 | # Mongo Explorer plugin 81 | .idea/**/mongoSettings.xml 82 | 83 | # File-based project format 84 | *.iws 85 | 86 | # IntelliJ 87 | out/ 88 | 89 | # mpeltonen/sbt-idea plugin 90 | .idea_modules/ 91 | 92 | # JIRA plugin 93 | atlassian-ide-plugin.xml 94 | 95 | # Cursive Clojure plugin 96 | .idea/replstate.xml 97 | 98 | # SonarLint plugin 99 | .idea/sonarlint/ 100 | 101 | # Crashlytics plugin (for Android Studio and IntelliJ) 102 | com_crashlytics_export_strings.xml 103 | crashlytics.properties 104 | crashlytics-build.properties 105 | fabric.properties 106 | 107 | # Editor-based Rest Client 108 | .idea/httpRequests 109 | 110 | # Android studio 3.1+ serialized cache file 111 | .idea/caches/build_file_checksums.ser 112 | 113 | ### PyCharm Patch ### 114 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 115 | 116 | # *.iml 117 | # modules.xml 118 | # .idea/misc.xml 119 | # *.ipr 120 | 121 | # Sonarlint plugin 122 | # https://plugins.jetbrains.com/plugin/7973-sonarlint 123 | .idea/**/sonarlint/ 124 | 125 | # SonarQube Plugin 126 | # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin 127 | .idea/**/sonarIssues.xml 128 | 129 | # Markdown Navigator plugin 130 | # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced 131 | .idea/**/markdown-navigator.xml 132 | .idea/**/markdown-navigator-enh.xml 133 | .idea/**/markdown-navigator/ 134 | 135 | # Cache file creation bug 136 | # See https://youtrack.jetbrains.com/issue/JBR-2257 137 | .idea/$CACHE_FILE$ 138 | 139 | # CodeStream plugin 140 | # https://plugins.jetbrains.com/plugin/12206-codestream 141 | .idea/codestream.xml 142 | 143 | ### Python ### 144 | # Byte-compiled / optimized / DLL files 145 | __pycache__/ 146 | *.py[cod] 147 | *$py.class 148 | 149 | # C extensions 150 | *.so 151 | 152 | # Distribution / packaging 153 | .Python 154 | build/ 155 | develop-eggs/ 156 | dist/ 157 | downloads/ 158 | eggs/ 159 | .eggs/ 160 | lib/ 161 | lib64/ 162 | parts/ 163 | sdist/ 164 | var/ 165 | wheels/ 166 | share/python-wheels/ 167 | *.egg-info/ 168 | .installed.cfg 169 | *.egg 170 | MANIFEST 171 | 172 | # PyInstaller 173 | # Usually these files are written by a python script from a template 174 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 175 | *.manifest 176 | *.spec 177 | 178 | # Installer logs 179 | pip-log.txt 180 | pip-delete-this-directory.txt 181 | 182 | # Unit test / coverage reports 183 | htmlcov/ 184 | .tox/ 185 | .nox/ 186 | .coverage 187 | .coverage.* 188 | .cache 189 | nosetests.xml 190 | coverage.xml 191 | *.cover 192 | *.py,cover 193 | .hypothesis/ 194 | .pytest_cache/ 195 | cover/ 196 | 197 | # Translations 198 | *.mo 199 | *.pot 200 | 201 | # Django stuff: 202 | *.log 203 | local_settings.py 204 | db.sqlite3 205 | db.sqlite3-journal 206 | 207 | # Flask stuff: 208 | instance/ 209 | .webassets-cache 210 | 211 | # Scrapy stuff: 212 | .scrapy 213 | 214 | # Sphinx documentation 215 | docs/_build/ 216 | 217 | # PyBuilder 218 | .pybuilder/ 219 | target/ 220 | 221 | # Jupyter Notebook 222 | .ipynb_checkpoints 223 | 224 | # IPython 225 | profile_default/ 226 | ipython_config.py 227 | 228 | # pyenv 229 | # For a library or package, you might want to ignore these files since the code is 230 | # intended to run in multiple environments; otherwise, check them in: 231 | # .python-version 232 | 233 | # pipenv 234 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 235 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 236 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 237 | # install all needed dependencies. 238 | #Pipfile.lock 239 | 240 | # poetry 241 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 242 | # This is especially recommended for binary packages to ensure reproducibility, and is more 243 | # commonly ignored for libraries. 244 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 245 | #poetry.lock 246 | 247 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 248 | __pypackages__/ 249 | 250 | # Celery stuff 251 | celerybeat-schedule 252 | celerybeat.pid 253 | 254 | # SageMath parsed files 255 | *.sage.py 256 | 257 | # Environments 258 | .env 259 | .venv 260 | env/ 261 | venv/ 262 | ENV/ 263 | env.bak/ 264 | venv.bak/ 265 | 266 | # Spyder project settings 267 | .spyderproject 268 | .spyproject 269 | 270 | # Rope project settings 271 | .ropeproject 272 | 273 | # mkdocs documentation 274 | /site 275 | 276 | # mypy 277 | .mypy_cache/ 278 | .dmypy.json 279 | dmypy.json 280 | 281 | # Pyre type checker 282 | .pyre/ 283 | 284 | # pytype static type analyzer 285 | .pytype/ 286 | 287 | # Cython debug symbols 288 | cython_debug/ 289 | 290 | # PyCharm 291 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 292 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 293 | # and can be added to the global gitignore or merged into this file. For a more nuclear 294 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 295 | #.idea/ 296 | 297 | # End of https://www.toptal.com/developers/gitignore/api/macos,pycharm,python 298 | 299 | Feature Importance/Random Forest/SHAP Figures/* -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /.idea/CNS2022_Code.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Experiments_code/rf_feature_importance.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from sklearn.model_selection import train_test_split 4 | 5 | sys.path = sorted(sys.path, key=lambda s:'envs' not in s) 6 | import os 7 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 8 | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" 9 | os.environ["MODIN_ENGINE"] = "dask" 10 | 11 | import time 12 | import random as rn 13 | import numpy as np 14 | from sklearn.metrics import classification_report 15 | import pandas as pd 16 | import json 17 | import cudf 18 | from cuml.dask.ensemble import RandomForestClassifier as cuRF 19 | import collections 20 | import dask_cudf 21 | from dask_cuda import LocalCUDACluster 22 | from dask.distributed import Client 23 | from helpers import load_data_database 24 | from glob import glob 25 | from feature_engine.selection import DropCorrelatedFeatures 26 | 27 | 28 | seed = 1 # [1] 2017, 2018 orig, 2018 fixed [123] for 2017 fixed 29 | np.random.seed(seed) 30 | rn.seed(seed) 31 | year = 2018 32 | old = False 33 | ngpus = 2 34 | 35 | 36 | def makehash(): 37 | return collections.defaultdict(makehash) 38 | 39 | def sort_importances(unsorted): 40 | benchmark = unsorted["benchmark"] 41 | res = makehash() 42 | 43 | for features, values in unsorted.items(): 44 | if features == 'benchmark': 45 | continue 46 | for label, results in values.items(): 47 | if label.isnumeric(): 48 | prec = results['precision'] 49 | recall = results['recall'] 50 | f1 = results['f1-score'] 51 | 52 | benchmark_prec = benchmark[label]['precision'] 53 | benchmark_recall = benchmark[label]['recall'] 54 | benchmark_f1 = benchmark[label]['f1-score'] 55 | 56 | diff_prec = prec - benchmark_prec 57 | diff_recall = recall - benchmark_recall 58 | diff_f1 = f1 - benchmark_f1 59 | 60 | res[label]['precision'][features] = diff_prec 61 | res[label]['recall'][features] = diff_recall 62 | res[label]['f1-score'][features] = diff_f1 63 | 64 | # sort dictionary 65 | sorted_feature_importance = makehash() 66 | 67 | for label, vals in res.items(): 68 | for metric, metric_results in vals.items(): 69 | a = metric_results.items() 70 | sort_orders = sorted(a, key=lambda x: x[1]) 71 | sorted_feature_importance[label][metric] = sort_orders 72 | 73 | filename = f"{year}_feature_importance_rf_sorted.json" 74 | 75 | with open(filename, "w") as outfile: 76 | json.dump(sorted_feature_importance, outfile, indent=4) 77 | 78 | 79 | def drop_col_feat_imp(X_full, y_full): 80 | #X_full[X_full.select_dtypes(np.float64).columns] = X_full.select_dtypes(np.float64).astype(np.float32) 81 | X_full_corr = X_full.astype('float32') 82 | y_full = y_full.astype('int32') 83 | 84 | tr = DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.9) 85 | 86 | print("Calculating correlated features:") 87 | X_full = tr.fit_transform(X_full_corr) 88 | print(f'Correlated Feature Sets: {str(tr.correlated_feature_sets_)}') 89 | print(f'Dropped features: {str(tr.features_to_drop_)}') 90 | 91 | X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, random_state=seed, stratify=y_full, shuffle=True) 92 | 93 | # due to not yet implemented feature - sharding causes issues with a mysterious issue with the classifier and the 94 | # labels not being in order 95 | X_train = X_train.reset_index(drop=True) 96 | X_test = X_test.reset_index(drop=True) 97 | y_train = y_train.reset_index(drop=True) 98 | y_test = y_test.reset_index(drop=True) 99 | 100 | 101 | X_train = dask_cudf.from_cudf(cudf.from_pandas(X_train), npartitions=ngpus).persist() 102 | X_test = dask_cudf.from_cudf(cudf.from_pandas(X_test), npartitions=ngpus).persist() 103 | y_train = dask_cudf.from_cudf(cudf.from_pandas(y_train), npartitions=ngpus).persist() 104 | y_test = dask_cudf.from_cudf(cudf.from_pandas(y_test), npartitions=ngpus).persist() 105 | 106 | # list for storing feature importances 107 | importances = {} 108 | rf = cuRF(max_depth=30, n_estimators=100, random_state=seed, verbose=True, n_streams=25) 109 | st = time.time() 110 | print(f"Starting base- {st}") 111 | rf.fit(X_train, y_train, convert_dtype=True) 112 | y_pred = rf.predict(X_test) 113 | importances['benchmark'] = classification_report(y_test.compute().to_numpy(), y_pred.compute().to_numpy(), output_dict=True) 114 | print(f'Elapsed time time: {time.time()-st}') 115 | 116 | # iterating over all columns and storing feature importance (difference between benchmark and new model) 117 | for i, col in enumerate(X_train.columns): 118 | print(f"Doing col: {col} [{i}/{len(X_train.columns)}]") 119 | model_clone = cuRF(max_depth=30, n_estimators=100, random_state=seed, verbose=True, n_streams=25) 120 | model_clone.fit(X_train.drop(col, axis=1), y_train, convert_dtype=True) 121 | model_pred = model_clone.predict(X_test.drop(col, axis=1)) 122 | importances[col] = classification_report(y_test.compute().to_numpy(), model_pred.compute().to_numpy(), output_dict=True) 123 | print(f'[{year}] - Finished col {col}. Elapsed time time: {time.time() - st}') 124 | 125 | print("Saving importance feature dict") 126 | if old: 127 | filename = f'{year}_old_feature_importance_rf.json' 128 | else: 129 | filename = f'{year}_new_feature_importance_rf.json' 130 | 131 | with open(filename, 'w') as fp: 132 | json.dump(importances, fp, indent=4) 133 | fp.write(f'Correlated Feature Sets: {str(tr.correlated_feature_sets_)}') 134 | fp.write(f'Dropped features: {str(tr.features_to_drop_)}') 135 | 136 | return importances 137 | 138 | 139 | def load_from_local(folder_path): 140 | files = glob(folder_path + "/*.csv") 141 | csv_dataframes = [] 142 | for file in files: 143 | print(f"-- Reading in {file}") 144 | df = pd.read_csv(file) 145 | print(df.columns) 146 | df.columns = df.columns.str.lstrip(" ") 147 | df.drop(['id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Timestamp', 'Attempted Category'], axis=1, inplace=True) 148 | df = df.replace('Infinity', np.nan) 149 | df = df.replace(np.inf, np.nan) 150 | df = df.dropna() 151 | 152 | for column in df.columns: 153 | if column != 'Label': 154 | df[column] = pd.to_numeric(df[column], errors='coerce', downcast="float") 155 | 156 | csv_dataframes.extend([df]) 157 | 158 | df = pd.concat(csv_dataframes, ignore_index=True) 159 | labels = df['Label'].astype('category') 160 | y = pd.Series(labels.cat.codes) 161 | train = df.drop(['Label'], axis=1) 162 | 163 | with open(f'{folder_path}/label_mapping.txt', 'w') as f: 164 | f.write(str(dict(enumerate(labels.cat.categories)))) 165 | 166 | 167 | return train, y 168 | 169 | 170 | if __name__ == '__main__': 171 | # Create a Dask Cluster with one worker per GPU 172 | cluster = LocalCUDACluster() 173 | client = Client(cluster) 174 | 175 | file_path = # 176 | 177 | training_featuresdf, labeldf = load_from_local(file_path) 178 | 179 | importances = drop_col_feat_imp(training_featuresdf, labeldf) 180 | sort_importances(importances) -------------------------------------------------------------------------------- /Labelling/CICIDS2017_labelling_fixed_CICFlowMeter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import glob\n", 14 | "import os\n", 15 | "from sys import platform\n", 16 | "import datetime\n", 17 | "\n", 18 | "# THIS LABELLING SCRIPT IS USED TO LABEL THE CORRECTED VERSION OF CIC-IDS-2017. FOR DETAILS CONSULT OUR WEBSITE:\n", 19 | "# https://intrusion-detection.distrinet-research.be/CNS2022/index.html\n", 20 | "\n", 21 | "pd.set_option('display.max_rows', 100)\n", 22 | "\n", 23 | "# Enter the path that contains the CSV files that were generated by the CICFlowMeter tool. There should be five CSV\n", 24 | "# files in total, one per day.\n", 25 | "DATASET_PATH = \"\"\n", 26 | "\n", 27 | "# Enter the output path for the fully labelled CSV files\n", 28 | "OUTPUT_PATH = \"\"\n", 29 | "\n", 30 | "# If set to true, a column is added at the front of the CSV with line numbers\n", 31 | "print_index = True" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "outputs": [], 38 | "source": [ 39 | "# Basic preprocessing before getting started on labelling.\n", 40 | "# Deletes rows with \"Infinity\" and NaNs, converts \"Timestamp\" to Pandas Datetime, and converts all necessary columns to\n", 41 | "# numeric values\n", 42 | "def format_csv_for_labelling(df):\n", 43 | " df = df.replace('Infinity', np.nan)\n", 44 | " df['Timestamp'] = pd.to_datetime(df['Timestamp'])\n", 45 | " for column in df.columns:\n", 46 | " if column not in ['Flow ID' , 'Timestamp', 'Src IP', 'Dst IP', 'Label']:\n", 47 | " df[column] = pd.to_numeric(df[column], errors='coerce')\n", 48 | "\n", 49 | " df.dropna()\n", 50 | "\n", 51 | " return df.dropna()\n", 52 | "\n", 53 | "def read_csvs_from_path_and_reformat(path):\n", 54 | " df = pd.read_csv(path, encoding='cp1252')\n", 55 | "\n", 56 | " df = format_csv_for_labelling(df)\n", 57 | " print(\"labels after pre-processing:\", df[\"Label\"].value_counts())\n", 58 | "\n", 59 | " df[\"Attempted Category\"] = -1\n", 60 | "\n", 61 | " int64_columns = [\"Total TCP Flow Time\"]\n", 62 | "\n", 63 | " int32_columns = [\"Src Port\", \"Dst Port\", \"Flow Duration\", \"Total Fwd Packet\", \"Total Bwd packets\", \"Total Length of Fwd Packet\", \"Total Length of Bwd Packet\", \"Fwd Packet Length Max\",\n", 64 | " \"Fwd Packet Length Min\", \"Bwd Packet Length Max\", \"Bwd Packet Length Min\", \"Flow IAT Max\", \"Flow IAT Min\", \"Fwd IAT Total\", \"Fwd IAT Max\", \"Fwd IAT Min\", \"Bwd IAT Total\",\n", 65 | " \"Bwd IAT Max\", \"Bwd IAT Min\", \"Fwd PSH Flags\", \"Bwd PSH Flags\", \"Fwd URG Flags\", \"Bwd URG Flags\", \"Packet Length Min\", \"Packet Length Max\", \"FIN Flag Count\", \"SYN Flag Count\", \"RST Flag Count\", \"PSH Flag Count\",\n", 66 | " \"ACK Flag Count\", \"URG Flag Count\", \"CWR Flag Count\", \"ECE Flag Count\", \"Subflow Fwd Packets\", \"Subflow Fwd Bytes\",\n", 67 | " \"Subflow Bwd Packets\", \"Subflow Bwd Bytes\", \"FWD Init Win Bytes\", \"Bwd Init Win Bytes\", \"Fwd Act Data Pkts\", \"Fwd Seg Size Min\", \"Active Max\",\n", 68 | " \"Active Min\", \"Idle Max\", \"Idle Min\"]\n", 69 | "\n", 70 | " int16_columns = [\"Fwd Header Length\", \"Bwd Header Length\", \"ICMP Code\", \"ICMP Type\"]\n", 71 | "\n", 72 | " for column in int64_columns:\n", 73 | " df[column] = df[column].astype('int64')\n", 74 | "\n", 75 | " for column in int32_columns:\n", 76 | " df[column] = df[column].astype('int32')\n", 77 | "\n", 78 | " for column in int16_columns:\n", 79 | " df[column] = df[column].astype('int16')\n", 80 | "\n", 81 | " return df\n", 82 | "\n", 83 | "# Main labelling function. Only used for labelling Malicious and Malicious - Attempted flows.\n", 84 | "# Timestamps are in NANOSECONDS (!) Unix time. Note that the CSV files are in the UTC timezone.\n", 85 | "# df = dataframe with flows. Note that labelling happens inplace on the 'df' parameter, and so this function doesn't return anything\n", 86 | "# label = the label that will be given to flows matching the criteria specified in the function\n", 87 | "# additional_filters = add any additional constraints that cannot be covered by the already provided function arguments\n", 88 | "# see examples in the actual labelling logic for correct syntax\n", 89 | "# attempted_category = please consult our website (https://intrusion-detection.distrinet-research.be/CNS2022/Tools_Documentation.html)\n", 90 | "# for details on how the \"Attempted\" categories are defined.\n", 91 | "# payload_filter = When set to true, this will automatically add a constraint [\"Total Length of Fwd Packet\"] == 0. Note that\n", 92 | "# the Attempted label and category still need to be specified manually\n", 93 | "def label_flows(df, label, attack_start_time_nanoseconds, attack_end_time_nanoseconds, src_ip_list=None,\n", 94 | " dst_ip_list= None, src_port_list=None, dst_port_list=None, additional_filters=[], attempted_category=-1, payload_filter=False):\n", 95 | "\n", 96 | "\n", 97 | " # Create initial mask for whole df with all values set to True. Squeeze is necessary to remove second axis (with value 1)\n", 98 | " # The reason is that a df of shape (X,) gets converted to (1,X) if you '&' it with a df of shape (X,1)\n", 99 | " mask = pd.DataFrame(True,index=df.index,columns=[df.columns[0]]).squeeze()\n", 100 | "\n", 101 | " attack_start_datetime = pd.to_datetime(attack_start_time_nanoseconds, unit='ns')\n", 102 | " attack_end_datetime = pd.to_datetime(attack_end_time_nanoseconds, unit='ns')\n", 103 | "\n", 104 | " mask &= (df[\"Timestamp\"] >= attack_start_datetime)\n", 105 | " mask &= (df[\"Timestamp\"] <= attack_end_datetime)\n", 106 | "\n", 107 | " if src_ip_list is not None:\n", 108 | " mask &= (df[\"Src IP\"].isin(src_ip_list))\n", 109 | " if dst_ip_list is not None:\n", 110 | " mask &= (df[\"Dst IP\"].isin(dst_ip_list))\n", 111 | "\n", 112 | " if src_port_list is not None:\n", 113 | " mask &= (df[\"Src Port\"].isin(src_port_list))\n", 114 | " if dst_port_list is not None:\n", 115 | " mask &= (df[\"Dst Port\"].isin(dst_port_list))\n", 116 | "\n", 117 | " if payload_filter:\n", 118 | " mask &= (df[\"Total Length of Fwd Packet\"] == 0)\n", 119 | "\n", 120 | " for filter in additional_filters:\n", 121 | " mask &= filter\n", 122 | "\n", 123 | " df[\"Label\"].mask(mask, label, inplace=True)\n", 124 | " df[\"Attempted Category\"].mask(mask, attempted_category, inplace=True)\n", 125 | "\n", 126 | "# This function is called when all labelling of malicious flows is completed. Anything that has not yet received a label\n", 127 | "# so far is labelled as Benign.\n", 128 | "def label_rest_as_benign_and_write_csv(df, file_to_write):\n", 129 | " df[\"Label\"].mask(df[\"Label\"] == \"NeedManualLabel\", \"BENIGN\", inplace=True)\n", 130 | "\n", 131 | " # Relabel artefact flows with [Flow Id] = '8.0.6.4-8.6.0.1-0-0-0' to label = 0\n", 132 | " df[\"Label\"].mask(df[\"Flow ID\"] == '8.0.6.4-8.6.0.1-0-0-0', \"BENIGN\", inplace=True)\n", 133 | "\n", 134 | " print(\"label count after labelling:\\r\\n\", df[\"Label\"].value_counts())\n", 135 | " print(\"Attempted Category count after labelling:\\r\\n\", df[\"Attempted Category\"].value_counts())\n", 136 | "\n", 137 | " # Adds line numbers in the first column if print_index is set to true\n", 138 | " if print_index:\n", 139 | " df.reset_index(inplace=True, drop=True)\n", 140 | " df.index += 1\n", 141 | " df.index.name = 'id'\n", 142 | " df.to_csv(file_to_write)\n", 143 | " else:\n", 144 | " df.to_csv(file_to_write, index=False)\n" 145 | ], 146 | "metadata": { 147 | "collapsed": false 148 | } 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 3, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "labels after pre-processing: NeedManualLabel 371624\n", 159 | "Name: Label, dtype: int64\n", 160 | "label count after labelling:\r\n", 161 | " BENIGN 371624\n", 162 | "Name: Label, dtype: int64\n", 163 | "Attempted Category count after labelling:\r\n", 164 | " -1 371624\n", 165 | "Name: Attempted Category, dtype: int64\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "monday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"Monday-WorkingHours.pcap_Flow.csv\")\n", 171 | "\n", 172 | "label_rest_as_benign_and_write_csv(monday_df, OUTPUT_PATH + \"monday.csv\")" 173 | ], 174 | "metadata": { 175 | "collapsed": false 176 | } 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 4, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "labels after pre-processing: NeedManualLabel 322078\n", 187 | "Name: Label, dtype: int64\n", 188 | "label count after labelling:\r\n", 189 | " BENIGN 315106\n", 190 | "FTP-Patator 3972\n", 191 | "SSH-Patator 2961\n", 192 | "SSH-Patator - Attempted 27\n", 193 | "FTP-Patator - Attempted 12\n", 194 | "Name: Label, dtype: int64\n", 195 | "Attempted Category count after labelling:\r\n", 196 | " -1 322039\n", 197 | " 3 27\n", 198 | " 0 10\n", 199 | " 2 2\n", 200 | "Name: Attempted Category, dtype: int64\n" 201 | ] 202 | } 203 | ], 204 | "source": [ 205 | "#--------------------+\n", 206 | "# TUESDAY 04-07-2017 |\n", 207 | "#--------------------+\n", 208 | "\n", 209 | "tuesday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"Tuesday-WorkingHours.pcap_Flow.csv\")\n", 210 | "\n", 211 | "# FTP-PATATOR\n", 212 | "# -----------\n", 213 | "\n", 214 | "label_flows(tuesday_df, \"FTP-Patator\", 1499170672838272000, 1499174416931403000, [\"172.16.0.1\"],\n", 215 | " [\"192.168.10.50\"], dst_port_list=[21])\n", 216 | "\n", 217 | "# Default payload filter\n", 218 | "label_flows(tuesday_df, \"FTP-Patator - Attempted\", 1499170672838272000, 1499174416931403000, [\"172.16.0.1\"],\n", 219 | " [\"192.168.10.50\"], dst_port_list=[21], payload_filter=True, attempted_category=0)\n", 220 | "\n", 221 | "label_flows(tuesday_df, \"FTP-Patator - Attempted\", 1499170672838272000, 1499174416931403000, [\"172.16.0.1\"],\n", 222 | " [\"192.168.10.50\"], dst_port_list=[21], additional_filters=[(tuesday_df[\"Src Port\"] == 52108)],\n", 223 | " attempted_category=2)\n", 224 | "\n", 225 | "# SSH-Patator\n", 226 | "# -----------\n", 227 | "\n", 228 | "label_flows(tuesday_df, \"SSH-Patator\", 1499188141049616000, 1499195059018486000, [\"172.16.0.1\"],\n", 229 | " [\"192.168.10.50\"], dst_port_list=[22])\n", 230 | "\n", 231 | "label_flows(tuesday_df, \"SSH-Patator - Attempted\", 1499188141049616000, 1499195059018486000, [\"172.16.0.1\"],\n", 232 | " [\"192.168.10.50\"], dst_port_list=[22], payload_filter=True, attempted_category=0)\n", 233 | "\n", 234 | "label_flows(tuesday_df, \"SSH-Patator - Attempted\", 1499188141049616000, 1499195059018486000, [\"172.16.0.1\"],\n", 235 | " [\"192.168.10.50\"], dst_port_list=[22], additional_filters=\n", 236 | " [\n", 237 | " (tuesday_df[\"Total Length of Fwd Packet\"] <= 32) & (tuesday_df[\"Total Length of Bwd Packet\"] == 0)\n", 238 | " ], attempted_category=3)\n", 239 | "\n", 240 | "label_rest_as_benign_and_write_csv(tuesday_df, OUTPUT_PATH + \"tuesday.csv\")\n", 241 | "\n", 242 | "tuesday_df = None" 243 | ], 244 | "metadata": { 245 | "collapsed": false 246 | } 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 5, 251 | "outputs": [ 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "labels after pre-processing: NeedManualLabel 496641\n", 257 | "Name: Label, dtype: int64\n", 258 | "label count after labelling:\r\n", 259 | " BENIGN 319120\n", 260 | "DoS Hulk 158468\n", 261 | "DoS GoldenEye 7567\n", 262 | "DoS Slowloris 3859\n", 263 | "DoS Slowhttptest - Attempted 3368\n", 264 | "DoS Slowloris - Attempted 1847\n", 265 | "DoS Slowhttptest 1740\n", 266 | "DoS Hulk - Attempted 581\n", 267 | "DoS GoldenEye - Attempted 80\n", 268 | "Heartbleed 11\n", 269 | "Name: Label, dtype: int64\n", 270 | "Attempted Category count after labelling:\r\n", 271 | " -1 490765\n", 272 | " 0 2927\n", 273 | " 6 2804\n", 274 | " 5 138\n", 275 | " 4 4\n", 276 | " 2 3\n", 277 | "Name: Attempted Category, dtype: int64\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "#----------------------+\n", 283 | "# WEDNESDAY 05-07-2017 |\n", 284 | "#----------------------+\n", 285 | "\n", 286 | "wednesday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"Wednesday-WorkingHours.pcap_Flow.csv\")\n", 287 | "\n", 288 | "# DoS Slowloris\n", 289 | "# -------------\n", 290 | "\n", 291 | "# Accidental early launch of the tool with wrong parameters\n", 292 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258926211817000, 1499258927000000000, [\"172.16.0.1\"],\n", 293 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=5)\n", 294 | "\n", 295 | "# Normal attack\n", 296 | "label_flows(wednesday_df, \"DoS Slowloris\", 1499258934539220000, 1499260278500956000, [\"172.16.0.1\"],\n", 297 | " [\"192.168.10.50\"], dst_port_list=[80], additional_filters=[\n", 298 | " ~(wednesday_df[\"Src Port\"].isin([33358, 33360, 33362, 54114]))\n", 299 | " ])\n", 300 | "\n", 301 | "# port 33358, 33360 and 33362 contain attack teardown flows\n", 302 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258934539220000, 1499260278500956000, [\"172.16.0.1\"],\n", 303 | " [\"192.168.10.50\"], src_port_list=[33358, 33360, 33362], dst_port_list=[80], attempted_category=2)\n", 304 | "\n", 305 | "#Payload filter (order is important, this part needs to come before Attempted category 6)\n", 306 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258934539220000, 1499260278500956000, [\"172.16.0.1\"],\n", 307 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=[\n", 308 | " ~(wednesday_df[\"Src Port\"].isin([33358, 33360, 33362, 54114]))\n", 309 | " ])\n", 310 | "\n", 311 | "#Target unresponsive because of DoS, no payloads in these flows\n", 312 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258934539220000, 1499260278500956000,\n", 313 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=6, additional_filters=[\n", 314 | " ~(wednesday_df[\"Dst Port\"].isin([33358, 33360, 33362, 54114])) & (wednesday_df[\"Total Length of Bwd Packet\"] == 0)\n", 315 | " & (wednesday_df[\"Flow Duration\"] >= 199800)\n", 316 | " ])\n", 317 | "\n", 318 | "# Artefact likely from authors checking the webserver\n", 319 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258934539220000, 1499260278500956000, [\"172.16.0.1\"],\n", 320 | " [\"192.168.10.50\"], src_port_list=[54114], dst_port_list=[80], attempted_category=4)\n", 321 | "\n", 322 | "# DoS Slowhttptest\n", 323 | "# ----------------\n", 324 | "\n", 325 | "label_flows(wednesday_df, \"DoS Slowhttptest\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n", 326 | " [\"192.168.10.50\"], dst_port_list=[80], additional_filters=[\n", 327 | " ~(wednesday_df[\"Src Port\"].isin([33372]))])\n", 328 | "\n", 329 | "\n", 330 | "# Attack startup artefact\n", 331 | "label_flows(wednesday_df, \"DoS Slowhttptest - Attempted\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n", 332 | " [\"192.168.10.50\"], src_port_list=[33372], dst_port_list=[80], attempted_category=2)\n", 333 | "\n", 334 | "# Payload filter\n", 335 | "label_flows(wednesday_df, \"DoS Slowhttptest - Attempted\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n", 336 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=[\n", 337 | " ~(wednesday_df[\"Src Port\"].isin([33372, 37670]))])\n", 338 | "\n", 339 | "# Retransmissions because target web server is brought down\n", 340 | "label_flows(wednesday_df, \"DoS Slowhttptest - Attempted\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n", 341 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=6, additional_filters=[\n", 342 | " ~(wednesday_df[\"Src Port\"].isin([33372, 37670])) & (wednesday_df[\"Total Length of Fwd Packet\"] == 0) &\n", 343 | " (wednesday_df[\"Flow Duration\"] >= 199984) & (wednesday_df[\"Total Bwd packets\"] == 0)\n", 344 | " ]\n", 345 | " )\n", 346 | "\n", 347 | "# Artefact from authors likely checking the webserver\n", 348 | "label_flows(wednesday_df, \"DoS Slowhttptest - Attempted\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n", 349 | " [\"192.168.10.50\"], src_port_list=[37670], dst_port_list=[80], attempted_category=4)\n", 350 | "\n", 351 | "\n", 352 | "# DoS Hulk\n", 353 | "# --------\n", 354 | "\n", 355 | "# Note that ports 48678 and 43664 have a benign flow launched by attacker IP while attack is already ongoing,\n", 356 | "# containing benign HTTP request. This will be labelled as Attack artefact\n", 357 | "label_flows(wednesday_df, \"DoS Hulk\", 1499262203194704000, 1499262299999999999, [\"172.16.0.1\"],\n", 358 | " [\"192.168.10.50\"], dst_port_list=[80], additional_filters=[\n", 359 | " ~(wednesday_df[\"Src Port\"].isin([48678 , 43664]))\n", 360 | " ])\n", 361 | "\n", 362 | "#Attack artefact - likely authors checking webserver mid-attack.\n", 363 | "label_flows(wednesday_df, \"DoS Hulk - Attempted\", 1499262203194704000, 1499262299999999999, [\"172.16.0.1\"],\n", 364 | " [\"192.168.10.50\"], src_port_list=[48678 , 43664], dst_port_list=[80], attempted_category=4)\n", 365 | "\n", 366 | "# Normal DoS Hulk\n", 367 | "label_flows(wednesday_df, \"DoS Hulk\", 1499262300000000000, 1499263641326171000, [\"172.16.0.1\"],\n", 368 | " [\"192.168.10.50\"], dst_port_list=[80])\n", 369 | "\n", 370 | "# Payload filter\n", 371 | "label_flows(wednesday_df, \"DoS Hulk - Attempted\", 1499262203194704000, 1499263641326171000, [\"172.16.0.1\"],\n", 372 | " [\"192.168.10.50\"], dst_port_list=[80], payload_filter=True, attempted_category=0, additional_filters=[\n", 373 | " ~(wednesday_df[\"Src Port\"].isin([48678 , 43664]))])\n", 374 | "\n", 375 | "# Artefacts caused by either attack tool or non-empty TCP appendices. Reasoning is that 282 is minimum size of malicious payload\n", 376 | "label_flows(wednesday_df, \"DoS Hulk - Attempted\", 1499262203194704000, 1499263641326171000, [\"172.16.0.1\"],\n", 377 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=3, additional_filters=[\n", 378 | " ~(wednesday_df[\"Src Port\"].isin([48678 , 43664])) & (wednesday_df[\"Total Length of Fwd Packet\"] > 0)\n", 379 | " & (wednesday_df[\"Total Length of Fwd Packet\"] < 282)\n", 380 | " ])\n", 381 | "\n", 382 | "# DoS GoldenEye\n", 383 | "# -------------\n", 384 | "\n", 385 | "label_flows(wednesday_df, \"DoS GoldenEye\", 1499263803231753000, 1499264408915718000, [\"172.16.0.1\"],\n", 386 | " [\"192.168.10.50\"], dst_port_list=[80])\n", 387 | "\n", 388 | "label_flows(wednesday_df, \"DoS GoldenEye - Attempted\", 1499263803231753000, 1499264408915718000, [\"172.16.0.1\"],\n", 389 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0, payload_filter=True)\n", 390 | "\n", 391 | "# Heartbleed\n", 392 | "# ----------\n", 393 | "\n", 394 | "label_flows(wednesday_df, \"Heartbleed\", 1499278335650811000, 1499279563294455000, [\"172.16.0.1\"],\n", 395 | " [\"192.168.10.51\"], dst_port_list=[444], additional_filters=[\n", 396 | " (wednesday_df[\"Src Port\"] == 45022)\n", 397 | " ])\n", 398 | "\n", 399 | "label_flows(wednesday_df, \"Heartbleed - Attempted\", 1499278335650811000, 1499279563294455000, [\"172.16.0.1\"],\n", 400 | " [\"192.168.10.51\"], dst_port_list=[444], attempted_category=0, payload_filter=True, additional_filters=[\n", 401 | " (wednesday_df[\"Src Port\"] == 45022)])\n", 402 | "\n", 403 | "label_rest_as_benign_and_write_csv(wednesday_df, OUTPUT_PATH + \"wednesday.csv\")\n", 404 | "\n", 405 | "wednesday_df = None" 406 | ], 407 | "metadata": { 408 | "collapsed": false 409 | } 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 6, 414 | "outputs": [ 415 | { 416 | "name": "stdout", 417 | "output_type": "stream", 418 | "text": [ 419 | "labels after pre-processing: NeedManualLabel 362076\n", 420 | "Name: Label, dtype: int64\n", 421 | "label count after labelling:\r\n", 422 | " BENIGN 288172\n", 423 | "Infiltration - Portscan 71767\n", 424 | "Web Attack - Brute Force - Attempted 1292\n", 425 | "Web Attack - XSS - Attempted 655\n", 426 | "Web Attack - Brute Force 73\n", 427 | "Infiltration - Attempted 45\n", 428 | "Infiltration 36\n", 429 | "Web Attack - XSS 18\n", 430 | "Web Attack - SQL Injection 13\n", 431 | "Web Attack - SQL Injection - Attempted 5\n", 432 | "Name: Label, dtype: int64\n", 433 | "Attempted Category count after labelling:\r\n", 434 | " -1 360079\n", 435 | " 0 1908\n", 436 | " 4 71\n", 437 | " 2 18\n", 438 | "Name: Attempted Category, dtype: int64\n" 439 | ] 440 | } 441 | ], 442 | "source": [ 443 | "#---------------------+\n", 444 | "# THURSDAY 06-07-2017 |\n", 445 | "#---------------------+\n", 446 | "\n", 447 | "thursday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"Thursday-WorkingHours.pcap_Flow.csv\")\n", 448 | "\n", 449 | "# Web Attack - Brute Force\n", 450 | "# ------------------------\n", 451 | "\n", 452 | "label_flows(thursday_df, \"Web Attack - Brute Force - Attempted\", 1499343354880049000, 1499343531179279000,\n", 453 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], attempted_category=2)\n", 454 | "\n", 455 | "label_flows(thursday_df, \"Web Attack - Brute Force\", 1499343567660566000, 1499346011622209000,\n", 456 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], additional_filters=\n", 457 | " [\n", 458 | " (thursday_df[\"Total Fwd Packet\"] > 20) | (thursday_df[\"Src Port\"] == 44464)\n", 459 | " ])\n", 460 | "\n", 461 | "label_flows(thursday_df, \"Web Attack - Brute Force - Attempted\", 1499343567660566000, 1499346011622209000,\n", 462 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], payload_filter=True, attempted_category=0,\n", 463 | " additional_filters=\n", 464 | " [~((thursday_df[\"Total Fwd Packet\"] > 20) | (thursday_df[\"Src Port\"] == 44464))])\n", 465 | "\n", 466 | "label_flows(thursday_df, \"Web Attack - Brute Force - Attempted\", 1499343567660566000, 1499346011622209000,\n", 467 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], attempted_category=4,\n", 468 | " additional_filters=\n", 469 | " [\n", 470 | " (thursday_df[\"Total Length of Fwd Packet\"] > 0) & ~(thursday_df[\"Src Port\"] == 44464) &\n", 471 | " (thursday_df[\"Total Fwd Packet\"] == 5) & (thursday_df[\"Total Bwd packets\"] == 5)\n", 472 | " ])\n", 473 | "\n", 474 | "# Web Attack - XSS\n", 475 | "# ----------------\n", 476 | "\n", 477 | "label_flows(thursday_df, \"Web Attack - XSS\", 1499346935283859000, 1499348121341704000, [\"172.16.0.1\"],\n", 478 | " [\"192.168.10.50\"], dst_port_list=[80], additional_filters=\n", 479 | " [\n", 480 | " ~(thursday_df[\"Src Port\"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &\n", 481 | " (thursday_df[\"Total Fwd Packet\"] >= 150)\n", 482 | " ])\n", 483 | "\n", 484 | "label_flows(thursday_df, \"Web Attack - XSS - Attempted\", 1499346935283859000, 1499348121341704000, [\"172.16.0.1\"],\n", 485 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=\n", 486 | " [~(thursday_df[\"Src Port\"].isin([36180, 36182, 36184, 36186, 36188, 36190]))])\n", 487 | "\n", 488 | "label_flows(thursday_df, \"Web Attack - XSS - Attempted\", 1499346935283859000, 1499348121341704000, [\"172.16.0.1\"],\n", 489 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=2, additional_filters=\n", 490 | " [\n", 491 | " ~(thursday_df[\"Src Port\"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &\n", 492 | " (thursday_df[\"Total Length of Fwd Packet\"] > 0) & (thursday_df[\"Total Fwd Packet\"] < 150)\n", 493 | " ])\n", 494 | "\n", 495 | "# Web Attack - SQL Injection\n", 496 | "# --------------------------\n", 497 | "\n", 498 | "label_flows(thursday_df, \"Web Attack - SQL Injection - Attempted\", 1499348127852814000, 1499348145720612000,\n", 499 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], attempted_category=2,\n", 500 | " additional_filters=[\n", 501 | " thursday_df[\"Src Port\"].isin([36180, 36182, 36184, 36186, 36188])\n", 502 | " ])\n", 503 | "\n", 504 | "label_flows(thursday_df, \"Web Attack - SQL Injection\", 1499348145732950000, 1499348575320284000,\n", 505 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80],\n", 506 | " additional_filters=[\n", 507 | " ~(thursday_df[\"Src Port\"].isin([36180, 36182, 36184, 36186, 36188]))\n", 508 | " ])\n", 509 | "\n", 510 | "label_flows(thursday_df, \"Web Attack - SQL Injection - Attempted\", 1499348127852814000, 1499348145720612000,\n", 511 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0,\n", 512 | " payload_filter=True)\n", 513 | "\n", 514 | "\n", 515 | "# Infiltration\n", 516 | "# 5.1 Dropbox Download\n", 517 | "# ------------\n", 518 | "\n", 519 | "\n", 520 | "label_flows(thursday_df, \"Infiltration\", 1499361542547210000, 1499366769364731000, [\"192.168.10.8\"], [\"205.174.165.73\"])\n", 521 | "\n", 522 | "label_flows(thursday_df, \"Infiltration - Attempted\", 1499361542547210000, 1499366769364731000, [\"192.168.10.8\"],\n", 523 | " [\"205.174.165.73\"], attempted_category=0, payload_filter=True)\n", 524 | "\n", 525 | "label_flows(thursday_df, \"Infiltration - Attempted\", 1499361228830533000, 1499361301251276000 , [\"192.168.10.9\"],\n", 526 | " [\"205.174.165.73\"], attempted_category=2)\n", 527 | "\n", 528 | "# 5.2 Cooldisk Mac\n", 529 | "\n", 530 | "label_flows(thursday_df, \"Infiltration\", 1499363616453990000, 1499371339347892000, [\"192.168.10.25\"], [\"205.174.165.73\"])\n", 531 | "\n", 532 | "label_flows(thursday_df, \"Infiltration - Attempted\", 1499363616453990000, 1499371339347892000, [\"192.168.10.25\"],\n", 533 | " [\"205.174.165.73\"], attempted_category=0, payload_filter=True)\n", 534 | "\n", 535 | "# 5.3 NMAP + Portscan\n", 536 | "\n", 537 | "# Round 1\n", 538 | "\n", 539 | "label_flows(thursday_df, \"Infiltration - Portscan\", 1499360431706755000, 1499360445728887000, [\"172.16.0.1\"],\n", 540 | " [\"192.168.10.51\"], additional_filters=[\n", 541 | " (thursday_df[\"Src Port\"] == 50122) | (thursday_df[\"Src Port\"] == 50133)\n", 542 | " ])\n", 543 | "\n", 544 | "# Round 2\n", 545 | "\n", 546 | "label_flows(thursday_df, \"Infiltration - Portscan\", 1499362410884008000, 1499362444285175000, [\"192.168.10.8\"],\n", 547 | " [\"192.168.10.5\"])\n", 548 | "\n", 549 | "# Round 3\n", 550 | "\n", 551 | "label_flows(thursday_df, \"Infiltration - Portscan\", 1499364314425162000, 1499366764331875000, [\"192.168.10.8\"],\n", 552 | " [\"192.168.10.5\", \"192.168.10.9\", \"192.168.10.12\", \"192.168.10.14\", \"192.168.10.15\", \"192.168.10.16\",\n", 553 | " \"192.168.10.17\", \"192.168.10.19\", \"192.168.10.25\", \"192.168.10.50\", \"192.168.10.51\"], additional_filters= [\n", 554 | " ~((thursday_df[\"Fwd Packet Length Max\"] == 408) & (thursday_df[\"Dst IP\"] == \"192.168.10.50\")) &\n", 555 | " ~((thursday_df[\"Total Length of Fwd Packet\"].isin([176, 20514])) & (thursday_df[\"Dst IP\"] == \"192.168.10.50\"))\n", 556 | " ]\n", 557 | ")\n", 558 | "\n", 559 | "label_rest_as_benign_and_write_csv(thursday_df, OUTPUT_PATH + \"thursday.csv\")\n", 560 | "\n", 561 | "thursday_df = None" 562 | ], 563 | "metadata": { 564 | "collapsed": false 565 | } 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": 7, 570 | "outputs": [ 571 | { 572 | "name": "stdout", 573 | "output_type": "stream", 574 | "text": [ 575 | "labels after pre-processing: NeedManualLabel 547557\n", 576 | "Name: Label, dtype: int64\n", 577 | "label count after labelling:\r\n", 578 | " BENIGN 288544\n", 579 | "Portscan 159066\n", 580 | "DDoS 95144\n", 581 | "Botnet - Attempted 4067\n", 582 | "Botnet 736\n", 583 | "Name: Label, dtype: int64\n", 584 | "Attempted Category count after labelling:\r\n", 585 | " -1 543490\n", 586 | " 1 4067\n", 587 | "Name: Attempted Category, dtype: int64\n" 588 | ] 589 | } 590 | ], 591 | "source": [ 592 | "#---------------------+\n", 593 | "# FRIDAY 07-07-2017 |\n", 594 | "#---------------------+\n", 595 | "\n", 596 | "friday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"Friday-WorkingHours.pcap_Flow.csv\")\n", 597 | "\n", 598 | "# Portscan\n", 599 | "# --------\n", 600 | "\n", 601 | "#First round\n", 602 | "label_flows(friday_df, \"Portscan\", 1499446532117090000, 1499447948582083000, [\"172.16.0.1\"], [\"192.168.10.50\"])\n", 603 | "\n", 604 | "\n", 605 | "#Second round\n", 606 | "label_flows(friday_df, \"Portscan\", 1499449905450532000, 1499451841699238000, [\"172.16.0.1\"], [\"192.168.10.50\"])\n", 607 | "\n", 608 | "# Botnet\n", 609 | "# ------\n", 610 | "\n", 611 | "label_flows(friday_df, \"Botnet\", 1499432653990571000, 1499436122903736000, [\"192.168.10.15\", \"192.168.10.9\",\n", 612 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"])\n", 613 | "\n", 614 | "label_flows(friday_df, \"Botnet - Attempted\", 1499432653990571000, 1499436122903736000, [\"192.168.10.15\", \"192.168.10.9\",\n", 615 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], attempted_category=0,\n", 616 | " payload_filter=True)\n", 617 | "\n", 618 | "label_flows(friday_df, \"Botnet - Attempted\", 1499436180000000000, 1499457684606663000, [\"192.168.10.15\", \"192.168.10.9\",\n", 619 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], attempted_category=1)\n", 620 | "\n", 621 | "\n", 622 | "# DDoS\n", 623 | "# ----\n", 624 | "\n", 625 | "label_flows(friday_df, \"DDoS\", 1499453791796937000, 1499454972216560000, [\"172.16.0.1\"], [\"192.168.10.50\"])\n", 626 | "\n", 627 | "label_flows(friday_df, \"DDoS - Attempted\", 1499453791796937000, 1499454972216560000, [\"172.16.0.1\"], [\"192.168.10.50\"],\n", 628 | " attempted_category=0, payload_filter=True)\n", 629 | "\n", 630 | "label_rest_as_benign_and_write_csv(friday_df, OUTPUT_PATH + \"friday.csv\")\n", 631 | "\n", 632 | "friday_df = None" 633 | ], 634 | "metadata": { 635 | "collapsed": false 636 | } 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": 16, 641 | "outputs": [], 642 | "source": [], 643 | "metadata": { 644 | "collapsed": false 645 | } 646 | } 647 | ], 648 | "metadata": { 649 | "kernelspec": { 650 | "display_name": "Python 3", 651 | "language": "python", 652 | "name": "python3" 653 | }, 654 | "language_info": { 655 | "codemirror_mode": { 656 | "name": "ipython", 657 | "version": 2 658 | }, 659 | "file_extension": ".py", 660 | "mimetype": "text/x-python", 661 | "name": "python", 662 | "nbconvert_exporter": "python", 663 | "pygments_lexer": "ipython2", 664 | "version": "2.7.6" 665 | } 666 | }, 667 | "nbformat": 4, 668 | "nbformat_minor": 0 669 | } 670 | -------------------------------------------------------------------------------- /Labelling/CICIDS2017_original_version_labelling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import glob\n", 14 | "import os\n", 15 | "from sys import platform\n", 16 | "import datetime\n", 17 | "\n", 18 | "# THIS LABELLING SCRIPT IS USED TO LABEL THE OLD VERSION OF CIC-IDS-2017. THIS VERSION SHOULD ONLY BE USED IF YOU\n", 19 | "# WISH TO RECREATE OUR RESULTS AS REPORTED IN OUR PAPER: https://intrusion-detection.distrinet-research.be/CNS2022/index.html\n", 20 | "\n", 21 | "# THIS SCRIPT ACCEPTS AS INPUT THE ORIGINAL CSVs AS RELEASED BY THE DATASET AUTHORS: https://www.unb.ca/cic/datasets/ids-2017.html\n", 22 | "\n", 23 | "pd.set_option('display.max_rows', 100)\n", 24 | "\n", 25 | "\n", 26 | "DATASET_PATH = \"\"\n", 27 | "OUTPUT_PATH = \"\"\n", 28 | "\n", 29 | "# unset to remove line index (to refer to line numbers when writing final csv)\n", 30 | "print_index = True" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "outputs": [], 37 | "source": [ 38 | "def format_csv_for_labelling(df):\n", 39 | " # strip leading whitespaces in column names\n", 40 | " df.columns = df.columns.str.lstrip(\" \")\n", 41 | "\n", 42 | " print(\"labels before pre-processing:\", df[\"Label\"].value_counts())\n", 43 | "\n", 44 | " # Since CICIDS 2017 authors used 12-hour format but removed AM/PM, we need to reconstruct it\n", 45 | " # We do this based on the knowledge they collected traffic from 9:00 AM to 5:00 PM.\n", 46 | " df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M')\n", 47 | " df['Timestamp'] = df['Timestamp'].apply(lambda x: x + pd.DateOffset(hours=12) if x.hour < 7 else x)\n", 48 | "\n", 49 | " # Convert to UTC from New Brunswick summer timezone (UTC-3)\n", 50 | " df['Timestamp'] = df['Timestamp'] + pd.DateOffset(hours=3)\n", 51 | "\n", 52 | " for column in df.columns:\n", 53 | " if column not in ['Flow ID' , 'Timestamp', 'Source IP', 'Destination IP', 'Label']:\n", 54 | " df[column] = pd.to_numeric(df[column])\n", 55 | "\n", 56 | " # Add attempted category column and initialise to -1\n", 57 | " df[\"Attempted Category\"] = -1\n", 58 | "\n", 59 | " # CICIDS 2017 author-released version comes prelabelled. This makes sure previous labels don't interfere\n", 60 | " df[\"Label\"] = \"NeedManualLabel\"\n", 61 | "\n", 62 | " print(\"labels after pre-processing:\", df[\"Label\"].value_counts())\n", 63 | "\n", 64 | " return df\n", 65 | "\n", 66 | "def read_csvs_from_path_and_reformat(path):\n", 67 | " df= pd.read_csv(path, encoding='cp1252')\n", 68 | "\n", 69 | " df = format_csv_for_labelling(df)\n", 70 | "\n", 71 | " return df\n", 72 | "\n", 73 | "def label_flows(df, label, attack_start_time_nanoseconds, attack_end_time_nanoseconds, src_ip_list=None,\n", 74 | " dst_ip_list= None, src_port_list=None, dst_port_list=None, attempted_category = -1, additional_filters=[],\n", 75 | " also_flip_flow_direction=False, payload_filter=False):\n", 76 | " # Note that labelling happens inplace on the 'df' parameter, and so this function doesn't return anything\n", 77 | "\n", 78 | " # Create initial mask with all values set to True. Squeeze is necessary to remove second axis (with value 1)\n", 79 | " # The reason is that a df of shape (X,) gets converted to (1,X) if you '&' it with a df of shape (X,1)\n", 80 | " custom_mask = pd.DataFrame(True, index=df.index, columns=[df.columns[0]]).squeeze()\n", 81 | "\n", 82 | " # Need to round the start time down to the nearest minute because otherwise some flows at the start of the attack\n", 83 | " # are labelled as benign\n", 84 | " attack_start_datetime = pd.to_datetime(attack_start_time_nanoseconds, unit='ns').floor(freq='T')\n", 85 | " attack_end_datetime = pd.to_datetime(attack_end_time_nanoseconds, unit='ns')\n", 86 | "\n", 87 | " custom_mask &= (df[\"Timestamp\"] >= attack_start_datetime)\n", 88 | " custom_mask &= (df[\"Timestamp\"] <= attack_end_datetime)\n", 89 | "\n", 90 | " if src_ip_list is not None:\n", 91 | " custom_mask &= (df[\"Source IP\"].isin(src_ip_list))\n", 92 | " if dst_ip_list is not None:\n", 93 | " custom_mask &= (df[\"Destination IP\"].isin(dst_ip_list))\n", 94 | "\n", 95 | " if src_port_list is not None:\n", 96 | " custom_mask &= (df[\"Source Port\"].isin(src_port_list))\n", 97 | " if dst_port_list is not None:\n", 98 | " custom_mask &= (df[\"Destination Port\"].isin(dst_port_list))\n", 99 | "\n", 100 | " # IMPORTANT NOTE: If you decide to add TotLen Fwd Pkt == 6 for catching RST packets, you still have to manually alter some additional_filters for flipped flows where\n", 101 | " # you couldn't use payload_filter boolean function input value\n", 102 | " if payload_filter:\n", 103 | " custom_mask &= (df[\"Total Length of Fwd Packets\"] == 0)\n", 104 | "\n", 105 | " for filter in additional_filters:\n", 106 | " custom_mask &= filter\n", 107 | "\n", 108 | " df[\"Label\"].mask(custom_mask, label, inplace=True)\n", 109 | " df[\"Attempted Category\"].mask(custom_mask, attempted_category, inplace=True)\n", 110 | "\n", 111 | " if also_flip_flow_direction:\n", 112 | " if additional_filters:\n", 113 | " raise AttributeError(\"Cannot set also_flip_flow_direction to True when additional_filters is not empty\")\n", 114 | "\n", 115 | " custom_mask = pd.DataFrame(True, index=df.index, columns=[df.columns[0]]).squeeze()\n", 116 | "\n", 117 | " custom_mask &= (df[\"Timestamp\"] >= attack_start_datetime)\n", 118 | " custom_mask &= (df[\"Timestamp\"] <= attack_end_datetime)\n", 119 | "\n", 120 | " if src_ip_list is not None:\n", 121 | " custom_mask &= (df[\"Destination IP\"].isin(src_ip_list))\n", 122 | " if dst_ip_list is not None:\n", 123 | " custom_mask &= (df[\"Source IP\"].isin(dst_ip_list))\n", 124 | "\n", 125 | " if src_port_list is not None:\n", 126 | " custom_mask &= (df[\"Destination Port\"].isin(src_port_list))\n", 127 | " if dst_port_list is not None:\n", 128 | " custom_mask &= (df[\"Source Port\"].isin(dst_port_list))\n", 129 | "\n", 130 | " if payload_filter:\n", 131 | " custom_mask &= (df[\"Total Length of Bwd Packets\"] == 0)\n", 132 | "\n", 133 | " for filter in additional_filters:\n", 134 | " custom_mask &= filter\n", 135 | "\n", 136 | " df[\"Label\"].mask(custom_mask, label, inplace=True)\n", 137 | " df[\"Attempted Category\"].mask(custom_mask, attempted_category, inplace=True)\n", 138 | "\n", 139 | "def label_rest_as_benign_and_write_csv(df, file_to_write):\n", 140 | " df[\"Label\"].mask(df[\"Label\"] == \"NeedManualLabel\", \"BENIGN\", inplace=True)\n", 141 | "\n", 142 | " # Relabel artefact flows with [Flow Id] = '8.0.6.4-8.6.0.1-0-0-0' to label = 0\n", 143 | " df[\"Label\"].mask(df[\"Flow ID\"] == '8.0.6.4-8.6.0.1-0-0-0', \"BENIGN\", inplace=True)\n", 144 | "\n", 145 | " print(\"label count after labelling:\\r\\n\", df[\"Label\"].value_counts())\n", 146 | " print(\"Attempted Category count after labelling:\\r\\n\", df[\"Attempted Category\"].value_counts())\n", 147 | "\n", 148 | " if print_index:\n", 149 | " df.reset_index(inplace=True, drop=True)\n", 150 | " df.index += 1\n", 151 | " df.index.name = 'id'\n", 152 | " df.to_csv(file_to_write)\n", 153 | " else:\n", 154 | " df.to_csv(file_to_write, index=False)\n" 155 | ], 156 | "metadata": { 157 | "collapsed": false 158 | } 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 3, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "labels before pre-processing: BENIGN 432074\n", 169 | "FTP-Patator 7938\n", 170 | "SSH-Patator 5897\n", 171 | "Name: Label, dtype: int64\n", 172 | "labels after pre-processing: NeedManualLabel 445909\n", 173 | "Name: Label, dtype: int64\n", 174 | "label count after labelling:\r\n", 175 | " BENIGN 430465\n", 176 | "FTP-Patator - Attempted 5489\n", 177 | "FTP-Patator 3991\n", 178 | "SSH-Patator - Attempted 3003\n", 179 | "SSH-Patator 2961\n", 180 | "Name: Label, dtype: int64\n", 181 | "Attempted Category count after labelling:\r\n", 182 | " -1 437417\n", 183 | " 3 6918\n", 184 | " 0 1571\n", 185 | " 2 3\n", 186 | "Name: Attempted Category, dtype: int64\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "#--------------------+\n", 192 | "# TUESDAY 04-07-2017 |\n", 193 | "#--------------------+\n", 194 | "\n", 195 | "tuesday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"tuesday/Tuesday-WorkingHours.pcap_ISCX.csv\")\n", 196 | "\n", 197 | "# FTP-PATATOR\n", 198 | "# -----------\n", 199 | "\n", 200 | "label_flows(tuesday_df, \"FTP-Patator\", 1499170672838272000, 1499174416931403000, [\"172.16.0.1\"],\n", 201 | " [\"192.168.10.50\"], dst_port_list=[21], also_flip_flow_direction=True)\n", 202 | "\n", 203 | "# Default payload filter\n", 204 | "label_flows(tuesday_df, \"FTP-Patator - Attempted\", 1499170672838272000, 1499174416931403000, [\"172.16.0.1\"],\n", 205 | " [\"192.168.10.50\"], dst_port_list=[21], payload_filter=True, also_flip_flow_direction=True, attempted_category=0)\n", 206 | "\n", 207 | "label_flows(tuesday_df, \"FTP-Patator - Attempted\", 1499170672838272000, 1499174416931403000, [\"172.16.0.1\"],\n", 208 | " [\"192.168.10.50\"], dst_port_list=[21], src_port_list=[52108], attempted_category=2)\n", 209 | "\n", 210 | "# Flows with RSTs that are technically TCP appendices, but not picked up by payload filter because of non-zero payload\n", 211 | "label_flows(tuesday_df, \"FTP-Patator - Attempted\", 1499170672838272000, 1499174416931403000, [\"172.16.0.1\"],\n", 212 | " [\"192.168.10.50\"], dst_port_list=[21], additional_filters=\n", 213 | " [\n", 214 | " (tuesday_df[\"Source Port\"] != 52108) & (tuesday_df[\"Total Length of Bwd Packets\"] == 0) &\n", 215 | " (tuesday_df[\"Total Length of Fwd Packets\"] > 0)\n", 216 | " ], attempted_category=3)\n", 217 | "\n", 218 | "label_flows(tuesday_df, \"FTP-Patator - Attempted\", 1499170672838272000, 1499174416931403000, [\"192.168.10.50\"],\n", 219 | " [\"172.16.0.1\"], src_port_list=[21], additional_filters=\n", 220 | " [\n", 221 | " (tuesday_df[\"Destination Port\"] != 52108) & (tuesday_df[\"Total Length of Fwd Packets\"] == 0) &\n", 222 | " (tuesday_df[\"Total Length of Bwd Packets\"] > 0)\n", 223 | " ], attempted_category=3)\n", 224 | "\n", 225 | "\n", 226 | "# SSH-Patator\n", 227 | "# -----------\n", 228 | "\n", 229 | "label_flows(tuesday_df, \"SSH-Patator\", 1499188141049616000, 1499195059018486000, [\"172.16.0.1\"],\n", 230 | " [\"192.168.10.50\"], dst_port_list=[22], also_flip_flow_direction=True)\n", 231 | "\n", 232 | "#Payload filter\n", 233 | "label_flows(tuesday_df, \"SSH-Patator - Attempted\", 1499188141049616000, 1499195059018486000, [\"172.16.0.1\"],\n", 234 | " [\"192.168.10.50\"], dst_port_list=[22], payload_filter=True, also_flip_flow_direction=True, attempted_category=0)\n", 235 | "\n", 236 | "label_flows(tuesday_df, \"SSH-Patator - Attempted\", 1499188141049616000, 1499195059018486000, [\"172.16.0.1\"],\n", 237 | " [\"192.168.10.50\"], dst_port_list=[22], additional_filters=\n", 238 | " [\n", 239 | " (tuesday_df[\"Total Length of Fwd Packets\"] <= 32) & (tuesday_df[\"Total Length of Bwd Packets\"] == 0)\n", 240 | " ], attempted_category=3)\n", 241 | "\n", 242 | "label_rest_as_benign_and_write_csv(tuesday_df, OUTPUT_PATH + \"Tuesday-WorkingHours.pcap_ISCX.csv\")\n", 243 | "\n", 244 | "tuesday_df = None" 245 | ], 246 | "metadata": { 247 | "collapsed": false 248 | } 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 3, 253 | "outputs": [ 254 | { 255 | "ename": "FileNotFoundError", 256 | "evalue": "[Errno 2] No such file or directory: '/media/farodin/AEAA59A1AA59673D/CICIDS2017/CSV_newest_CICFlowMeter_20220728/Unlabelled/wednesday/Wednesday-workingHours.pcap_ISCX.csv'", 257 | "output_type": "error", 258 | "traceback": [ 259 | "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", 260 | "\u001B[0;31mFileNotFoundError\u001B[0m Traceback (most recent call last)", 261 | "Input \u001B[0;32mIn [3]\u001B[0m, in \u001B[0;36m\u001B[0;34m()\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[38;5;66;03m#----------------------+\u001B[39;00m\n\u001B[1;32m 2\u001B[0m \u001B[38;5;66;03m# WEDNESDAY 05-07-2017 |\u001B[39;00m\n\u001B[1;32m 3\u001B[0m \u001B[38;5;66;03m#----------------------+\u001B[39;00m\n\u001B[0;32m----> 5\u001B[0m wednesday_df \u001B[38;5;241m=\u001B[39m \u001B[43mread_csvs_from_path_and_reformat\u001B[49m\u001B[43m(\u001B[49m\u001B[43mDATASET_PATH\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m+\u001B[39;49m\u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mwednesday/Wednesday-workingHours.pcap_ISCX.csv\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[1;32m 7\u001B[0m \u001B[38;5;66;03m# DoS Slowloris\u001B[39;00m\n\u001B[1;32m 8\u001B[0m \u001B[38;5;66;03m# -------------\u001B[39;00m\n\u001B[1;32m 9\u001B[0m \n\u001B[1;32m 10\u001B[0m \u001B[38;5;66;03m# Accidental early launch of the tool with wrong parameters\u001B[39;00m\n\u001B[1;32m 11\u001B[0m label_flows(wednesday_df, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mDoS Slowloris - Attempted\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;241m1499258926211817000\u001B[39m, \u001B[38;5;241m1499258927000000000\u001B[39m, [\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m172.16.0.1\u001B[39m\u001B[38;5;124m\"\u001B[39m],\n\u001B[1;32m 12\u001B[0m [\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m192.168.10.50\u001B[39m\u001B[38;5;124m\"\u001B[39m], dst_port_list\u001B[38;5;241m=\u001B[39m[\u001B[38;5;241m80\u001B[39m], attempted_category\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m5\u001B[39m)\n", 262 | "Input \u001B[0;32mIn [2]\u001B[0m, in \u001B[0;36mread_csvs_from_path_and_reformat\u001B[0;34m(path)\u001B[0m\n\u001B[1;32m 29\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mread_csvs_from_path_and_reformat\u001B[39m(path):\n\u001B[0;32m---> 30\u001B[0m df\u001B[38;5;241m=\u001B[39m \u001B[43mpd\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mread_csv\u001B[49m\u001B[43m(\u001B[49m\u001B[43mpath\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mencoding\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mcp1252\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[1;32m 32\u001B[0m df \u001B[38;5;241m=\u001B[39m format_csv_for_labelling(df)\n\u001B[1;32m 34\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m df\n", 263 | "File \u001B[0;32m~/anaconda3/envs/pytorch/lib/python3.8/site-packages/pandas/util/_decorators.py:311\u001B[0m, in \u001B[0;36mdeprecate_nonkeyword_arguments..decorate..wrapper\u001B[0;34m(*args, **kwargs)\u001B[0m\n\u001B[1;32m 305\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(args) \u001B[38;5;241m>\u001B[39m num_allow_args:\n\u001B[1;32m 306\u001B[0m warnings\u001B[38;5;241m.\u001B[39mwarn(\n\u001B[1;32m 307\u001B[0m msg\u001B[38;5;241m.\u001B[39mformat(arguments\u001B[38;5;241m=\u001B[39marguments),\n\u001B[1;32m 308\u001B[0m \u001B[38;5;167;01mFutureWarning\u001B[39;00m,\n\u001B[1;32m 309\u001B[0m stacklevel\u001B[38;5;241m=\u001B[39mstacklevel,\n\u001B[1;32m 310\u001B[0m )\n\u001B[0;32m--> 311\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mfunc\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n", 264 | "File \u001B[0;32m~/anaconda3/envs/pytorch/lib/python3.8/site-packages/pandas/io/parsers/readers.py:586\u001B[0m, in \u001B[0;36mread_csv\u001B[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001B[0m\n\u001B[1;32m 571\u001B[0m kwds_defaults \u001B[38;5;241m=\u001B[39m _refine_defaults_read(\n\u001B[1;32m 572\u001B[0m dialect,\n\u001B[1;32m 573\u001B[0m delimiter,\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 582\u001B[0m defaults\u001B[38;5;241m=\u001B[39m{\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mdelimiter\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m,\u001B[39m\u001B[38;5;124m\"\u001B[39m},\n\u001B[1;32m 583\u001B[0m )\n\u001B[1;32m 584\u001B[0m kwds\u001B[38;5;241m.\u001B[39mupdate(kwds_defaults)\n\u001B[0;32m--> 586\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43m_read\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfilepath_or_buffer\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mkwds\u001B[49m\u001B[43m)\u001B[49m\n", 265 | "File \u001B[0;32m~/anaconda3/envs/pytorch/lib/python3.8/site-packages/pandas/io/parsers/readers.py:482\u001B[0m, in \u001B[0;36m_read\u001B[0;34m(filepath_or_buffer, kwds)\u001B[0m\n\u001B[1;32m 479\u001B[0m _validate_names(kwds\u001B[38;5;241m.\u001B[39mget(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mnames\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;28;01mNone\u001B[39;00m))\n\u001B[1;32m 481\u001B[0m \u001B[38;5;66;03m# Create the parser.\u001B[39;00m\n\u001B[0;32m--> 482\u001B[0m parser \u001B[38;5;241m=\u001B[39m \u001B[43mTextFileReader\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfilepath_or_buffer\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwds\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 484\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m chunksize \u001B[38;5;129;01mor\u001B[39;00m iterator:\n\u001B[1;32m 485\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m parser\n", 266 | "File \u001B[0;32m~/anaconda3/envs/pytorch/lib/python3.8/site-packages/pandas/io/parsers/readers.py:811\u001B[0m, in \u001B[0;36mTextFileReader.__init__\u001B[0;34m(self, f, engine, **kwds)\u001B[0m\n\u001B[1;32m 808\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mhas_index_names\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;129;01min\u001B[39;00m kwds:\n\u001B[1;32m 809\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39moptions[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mhas_index_names\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m=\u001B[39m kwds[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mhas_index_names\u001B[39m\u001B[38;5;124m\"\u001B[39m]\n\u001B[0;32m--> 811\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_engine \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_make_engine\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mengine\u001B[49m\u001B[43m)\u001B[49m\n", 267 | "File \u001B[0;32m~/anaconda3/envs/pytorch/lib/python3.8/site-packages/pandas/io/parsers/readers.py:1040\u001B[0m, in \u001B[0;36mTextFileReader._make_engine\u001B[0;34m(self, engine)\u001B[0m\n\u001B[1;32m 1036\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[1;32m 1037\u001B[0m \u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mUnknown engine: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mengine\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m (valid options are \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mmapping\u001B[38;5;241m.\u001B[39mkeys()\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m)\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m 1038\u001B[0m )\n\u001B[1;32m 1039\u001B[0m \u001B[38;5;66;03m# error: Too many arguments for \"ParserBase\"\u001B[39;00m\n\u001B[0;32m-> 1040\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mmapping\u001B[49m\u001B[43m[\u001B[49m\u001B[43mengine\u001B[49m\u001B[43m]\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mf\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43moptions\u001B[49m\u001B[43m)\u001B[49m\n", 268 | "File \u001B[0;32m~/anaconda3/envs/pytorch/lib/python3.8/site-packages/pandas/io/parsers/c_parser_wrapper.py:51\u001B[0m, in \u001B[0;36mCParserWrapper.__init__\u001B[0;34m(self, src, **kwds)\u001B[0m\n\u001B[1;32m 48\u001B[0m kwds[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124musecols\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39musecols\n\u001B[1;32m 50\u001B[0m \u001B[38;5;66;03m# open handles\u001B[39;00m\n\u001B[0;32m---> 51\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_open_handles\u001B[49m\u001B[43m(\u001B[49m\u001B[43msrc\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mkwds\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 52\u001B[0m \u001B[38;5;28;01massert\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mhandles \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[1;32m 54\u001B[0m \u001B[38;5;66;03m# Have to pass int, would break tests using TextReader directly otherwise :(\u001B[39;00m\n", 269 | "File \u001B[0;32m~/anaconda3/envs/pytorch/lib/python3.8/site-packages/pandas/io/parsers/base_parser.py:222\u001B[0m, in \u001B[0;36mParserBase._open_handles\u001B[0;34m(self, src, kwds)\u001B[0m\n\u001B[1;32m 218\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m_open_handles\u001B[39m(\u001B[38;5;28mself\u001B[39m, src: FilePathOrBuffer, kwds: \u001B[38;5;28mdict\u001B[39m[\u001B[38;5;28mstr\u001B[39m, Any]) \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m>\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[1;32m 219\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[1;32m 220\u001B[0m \u001B[38;5;124;03m Let the readers open IOHandles after they are done with their potential raises.\u001B[39;00m\n\u001B[1;32m 221\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[0;32m--> 222\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mhandles \u001B[38;5;241m=\u001B[39m \u001B[43mget_handle\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m 223\u001B[0m \u001B[43m \u001B[49m\u001B[43msrc\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 224\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mr\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[1;32m 225\u001B[0m \u001B[43m \u001B[49m\u001B[43mencoding\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mkwds\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mencoding\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mNone\u001B[39;49;00m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 226\u001B[0m \u001B[43m \u001B[49m\u001B[43mcompression\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mkwds\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mcompression\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mNone\u001B[39;49;00m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 227\u001B[0m \u001B[43m \u001B[49m\u001B[43mmemory_map\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mkwds\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mmemory_map\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mFalse\u001B[39;49;00m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 228\u001B[0m \u001B[43m \u001B[49m\u001B[43mstorage_options\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mkwds\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mstorage_options\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mNone\u001B[39;49;00m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 229\u001B[0m \u001B[43m \u001B[49m\u001B[43merrors\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mkwds\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mencoding_errors\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mstrict\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 230\u001B[0m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n", 270 | "File \u001B[0;32m~/anaconda3/envs/pytorch/lib/python3.8/site-packages/pandas/io/common.py:701\u001B[0m, in \u001B[0;36mget_handle\u001B[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001B[0m\n\u001B[1;32m 696\u001B[0m \u001B[38;5;28;01melif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(handle, \u001B[38;5;28mstr\u001B[39m):\n\u001B[1;32m 697\u001B[0m \u001B[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001B[39;00m\n\u001B[1;32m 698\u001B[0m \u001B[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001B[39;00m\n\u001B[1;32m 699\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m ioargs\u001B[38;5;241m.\u001B[39mencoding \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mb\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m ioargs\u001B[38;5;241m.\u001B[39mmode:\n\u001B[1;32m 700\u001B[0m \u001B[38;5;66;03m# Encoding\u001B[39;00m\n\u001B[0;32m--> 701\u001B[0m handle \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mopen\u001B[39;49m\u001B[43m(\u001B[49m\n\u001B[1;32m 702\u001B[0m \u001B[43m \u001B[49m\u001B[43mhandle\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 703\u001B[0m \u001B[43m \u001B[49m\u001B[43mioargs\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mmode\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 704\u001B[0m \u001B[43m \u001B[49m\u001B[43mencoding\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mioargs\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mencoding\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 705\u001B[0m \u001B[43m \u001B[49m\u001B[43merrors\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43merrors\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 706\u001B[0m \u001B[43m \u001B[49m\u001B[43mnewline\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[1;32m 707\u001B[0m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 708\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m 709\u001B[0m \u001B[38;5;66;03m# Binary mode\u001B[39;00m\n\u001B[1;32m 710\u001B[0m handle \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mopen\u001B[39m(handle, ioargs\u001B[38;5;241m.\u001B[39mmode)\n", 271 | "\u001B[0;31mFileNotFoundError\u001B[0m: [Errno 2] No such file or directory: '/media/farodin/AEAA59A1AA59673D/CICIDS2017/CSV_newest_CICFlowMeter_20220728/Unlabelled/wednesday/Wednesday-workingHours.pcap_ISCX.csv'" 272 | ] 273 | } 274 | ], 275 | "source": [ 276 | "#----------------------+\n", 277 | "# WEDNESDAY 05-07-2017 |\n", 278 | "#----------------------+\n", 279 | "\n", 280 | "wednesday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"wednesday/Wednesday-workingHours.pcap_ISCX.csv\")\n", 281 | "\n", 282 | "# DoS Slowloris\n", 283 | "# -------------\n", 284 | "\n", 285 | "# Accidental early launch of the tool with wrong parameters\n", 286 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258926211817000, 1499258927000000000, [\"172.16.0.1\"],\n", 287 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=5)\n", 288 | "\n", 289 | "label_flows(wednesday_df, \"DoS Slowloris\", 1499258934539220000, 1499260278500956000, [\"172.16.0.1\"],\n", 290 | " [\"192.168.10.50\"], dst_port_list=[80], additional_filters=[\n", 291 | " ~(wednesday_df[\"Source Port\"].isin([33358, 33360, 33362, 54114]))\n", 292 | " ])\n", 293 | "\n", 294 | "label_flows(wednesday_df, \"DoS Slowloris\", 1499258934539220000, 1499260278500956000,\n", 295 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], additional_filters=[\n", 296 | " ~(wednesday_df[\"Destination Port\"].isin([33358, 33360, 33362, 54114]))\n", 297 | " ])\n", 298 | "\n", 299 | "# port 33358, 33360 and 33362 contain attack teardown flows\n", 300 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258934539220000, 1499260278500956000, [\"172.16.0.1\"],\n", 301 | " [\"192.168.10.50\"], src_port_list=[33358, 33360, 33362], dst_port_list=[80], attempted_category=2)\n", 302 | "\n", 303 | "#Payload filter (order is important, this part needs to come before Attempted category 6) (can't flip with boolean function input because of additional filters)\n", 304 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258934539220000, 1499260278500956000, [\"172.16.0.1\"],\n", 305 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=[\n", 306 | " ~(wednesday_df[\"Source Port\"].isin([33358, 33360, 33362, 54114]))])\n", 307 | "\n", 308 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258934539220000, 1499260278500956000,\n", 309 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=0, additional_filters=[\n", 310 | " ~(wednesday_df[\"Destination Port\"].isin([33358, 33360, 33362, 54114])) & (wednesday_df[\"Total Length of Bwd Packets\"] == 0)\n", 311 | " ])\n", 312 | "\n", 313 | "#Target unresponsive because of DoS, no payloads in these flows\n", 314 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258934539220000, 1499260278500956000,\n", 315 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=6, additional_filters=[\n", 316 | " ~(wednesday_df[\"Destination Port\"].isin([33358, 33360, 33362, 54114])) & (wednesday_df[\"Total Length of Bwd Packets\"] == 0)\n", 317 | " & (wednesday_df[\"Flow Duration\"] >= 199800)\n", 318 | " ])\n", 319 | "\n", 320 | "# Artefact likely from authors checking the webserver\n", 321 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258934539220000, 1499260278500956000, [\"172.16.0.1\"],\n", 322 | " [\"192.168.10.50\"], src_port_list=[54114], dst_port_list=[80], attempted_category=4)\n", 323 | "\n", 324 | "# DoS Slowhttptest\n", 325 | "# ----------------\n", 326 | "\n", 327 | "label_flows(wednesday_df, \"DoS Slowhttptest\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n", 328 | " [\"192.168.10.50\"], dst_port_list=[80], additional_filters=[\n", 329 | " ~(wednesday_df[\"Source Port\"].isin([33372]))\n", 330 | " ]\n", 331 | " )\n", 332 | "\n", 333 | "label_flows(wednesday_df, \"DoS Slowhttptest\", 1499260537936810000, 1499261869331517000,\n", 334 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], additional_filters=[\n", 335 | " ~(wednesday_df[\"Destination Port\"].isin([33372]))\n", 336 | " ]\n", 337 | " )\n", 338 | "\n", 339 | "# Attack startup artefact\n", 340 | "label_flows(wednesday_df, \"DoS Slowhttptest - Attempted\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n", 341 | " [\"192.168.10.50\"], src_port_list=[33372], dst_port_list=[80], attempted_category=2)\n", 342 | "\n", 343 | "#Payload filter (order of this is important, before attempted category 6) (can't flip with boolean function input because of additional filters)\n", 344 | "label_flows(wednesday_df, \"DoS Slowhttptest - Attempted\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n", 345 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=[\n", 346 | " ~(wednesday_df[\"Source Port\"].isin([33372, 37670]))])\n", 347 | "\n", 348 | "label_flows(wednesday_df, \"DoS Slowhttptest - Attempted\", 1499260537936810000, 1499261869331517000,\n", 349 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=0, additional_filters=[\n", 350 | " ~(wednesday_df[\"Destination Port\"].isin([33372, 37670])) & (wednesday_df[\"Total Length of Bwd Packets\"] == 0)\n", 351 | " ]\n", 352 | " )\n", 353 | "\n", 354 | "# Retransmissions because target web server is brought down (No need to flip direction, I double-checked)\n", 355 | "label_flows(wednesday_df, \"DoS Slowhttptest - Attempted\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n", 356 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=6, additional_filters=[\n", 357 | " ~(wednesday_df[\"Source Port\"].isin([33372])) & (wednesday_df[\"Total Length of Fwd Packets\"] == 0) &\n", 358 | " (wednesday_df[\"Flow Duration\"] >= 199984) & (wednesday_df[\"Total Backward Packets\"] == 0)\n", 359 | " ]\n", 360 | " )\n", 361 | "\n", 362 | "# Artefact from authors likely checking the webserver\n", 363 | "label_flows(wednesday_df, \"DoS Slowhttptest - Attempted\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n", 364 | " [\"192.168.10.50\"], src_port_list=[37670], dst_port_list=[80], attempted_category=4)\n", 365 | "\n", 366 | "\n", 367 | "# DoS Hulk\n", 368 | "# --------\n", 369 | "\n", 370 | "# Note that ports 48678 and 43664 have a benign flow launched by attacker IP while attack is already ongoing,\n", 371 | "# containing benign HTTP request. This will be labelled as Attack artefact\n", 372 | "label_flows(wednesday_df, \"DoS Hulk\", 1499262203194704000, 1499262299999999999, [\"172.16.0.1\"],\n", 373 | " [\"192.168.10.50\"], dst_port_list=[80], additional_filters=[\n", 374 | " ~(wednesday_df[\"Source Port\"].isin([48678 , 43664]))\n", 375 | " ])\n", 376 | "\n", 377 | "label_flows(wednesday_df, \"DoS Hulk\", 1499262203194704000, 1499262299999999999,\n", 378 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], additional_filters=[\n", 379 | " ~(wednesday_df[\"Destination Port\"].isin([48678 , 43664]))\n", 380 | " ])\n", 381 | "\n", 382 | "label_flows(wednesday_df, \"DoS Hulk\", 1499262300000000000, 1499263641326171000, [\"172.16.0.1\"],\n", 383 | " [\"192.168.10.50\"], dst_port_list=[80], also_flip_flow_direction=True)\n", 384 | "\n", 385 | "#Attack artefact - likely authors checking webserver mid-attack.\n", 386 | "label_flows(wednesday_df, \"DoS Hulk - Attempted\", 1499262203194704000, 1499262299999999999, [\"172.16.0.1\"],\n", 387 | " [\"192.168.10.50\"], src_port_list=[48678 , 43664], dst_port_list=[80], attempted_category=4)\n", 388 | "\n", 389 | "#Payload filter (can't flip with boolean function input because of additional filters)\n", 390 | "label_flows(wednesday_df, \"DoS Hulk - Attempted\", 1499262203194704000, 1499263641326171000, [\"172.16.0.1\"],\n", 391 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=[\n", 392 | " ~(wednesday_df[\"Source Port\"].isin([48678 , 43664]))])\n", 393 | "\n", 394 | "label_flows(wednesday_df, \"DoS Hulk - Attempted\", 1499262203194704000, 1499263641326171000,\n", 395 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=0, additional_filters=[\n", 396 | " ~(wednesday_df[\"Destination Port\"].isin([48678 , 43664])) & (wednesday_df[\"Total Length of Bwd Packets\"] == 0)\n", 397 | " ])\n", 398 | "\n", 399 | "# Artefacts caused by either attack tool or non-empty TCP appendices. Reasoning is that 282 is minimum size of malicious payload\n", 400 | "label_flows(wednesday_df, \"DoS Hulk - Attempted\", 1499262203194704000, 1499263641326171000, [\"172.16.0.1\"],\n", 401 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=3, additional_filters=[\n", 402 | " ~(wednesday_df[\"Source Port\"].isin([48678 , 43664])) & (wednesday_df[\"Total Length of Fwd Packets\"] > 0)\n", 403 | " & (wednesday_df[\"Total Length of Fwd Packets\"] < 282)\n", 404 | " ])\n", 405 | "\n", 406 | "label_flows(wednesday_df, \"DoS Hulk - Attempted\", 1499262203194704000, 1499263641326171000,\n", 407 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=3, additional_filters=[\n", 408 | " ~(wednesday_df[\"Destination Port\"].isin([48678 , 43664])) & (wednesday_df[\"Total Length of Bwd Packets\"] > 0)\n", 409 | " & (wednesday_df[\"Total Length of Bwd Packets\"] <282)\n", 410 | " ])\n", 411 | "\n", 412 | "# DoS GoldenEye\n", 413 | "# -------------\n", 414 | "\n", 415 | "label_flows(wednesday_df, \"DoS GoldenEye\", 1499263803231753000, 1499264408915718000, [\"172.16.0.1\"],\n", 416 | " [\"192.168.10.50\"], dst_port_list=[80], also_flip_flow_direction=True)\n", 417 | "\n", 418 | "#Payload filter\n", 419 | "label_flows(wednesday_df, \"DoS GoldenEye - Attempted\", 1499263803231753000, 1499264408915718000, [\"172.16.0.1\"],\n", 420 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n", 421 | "\n", 422 | "# Heartbleed\n", 423 | "# ----------\n", 424 | "\n", 425 | "label_flows(wednesday_df, \"Heartbleed\", 1499278335650811000, 1499279563294455000, [\"172.16.0.1\"],\n", 426 | " [\"192.168.10.51\"], dst_port_list=[444], src_port_list=[45022], also_flip_flow_direction=True)\n", 427 | "\n", 428 | "#Payload filter\n", 429 | "label_flows(wednesday_df, \"Heartbleed - Attempted\", 1499278335650811000, 1499279563294455000, [\"172.16.0.1\"],\n", 430 | " [\"192.168.10.51\"], dst_port_list=[444], src_port_list=[45022], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n", 431 | "\n", 432 | "label_rest_as_benign_and_write_csv(wednesday_df, OUTPUT_PATH + \"Wednesday-workingHours.pcap_ISCX.csv\")\n", 433 | "\n", 434 | "wednesday_df = None\n" 435 | ], 436 | "metadata": { 437 | "collapsed": false 438 | } 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 5, 443 | "outputs": [ 444 | { 445 | "name": "stderr", 446 | "output_type": "stream", 447 | "text": [ 448 | "/tmp/ipykernel_52950/4104559264.py:5: DtypeWarning: Columns (0,1,3,6,84) have mixed types.Specify dtype option on import or set low_memory=False.\n", 449 | " thursday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"thursday/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv\")\n" 450 | ] 451 | }, 452 | { 453 | "name": "stdout", 454 | "output_type": "stream", 455 | "text": [ 456 | "labels before pre-processing: BENIGN 168186\n", 457 | "Web Attack – Brute Force 1507\n", 458 | "Web Attack – XSS 652\n", 459 | "Web Attack – Sql Injection 21\n", 460 | "Name: Label, dtype: int64\n", 461 | "labels after pre-processing: NeedManualLabel 458968\n", 462 | "Name: Label, dtype: int64\n", 463 | "label count after labelling:\r\n", 464 | " BENIGN 455536\n", 465 | "Web Attack - Brute Force - Attempted 2660\n", 466 | "Web Attack - XSS - Attempted 616\n", 467 | "Web Attack - Brute Force 74\n", 468 | "Web Attack - SQL Injection - Attempted 39\n", 469 | "Web Attack - SQL Injection 25\n", 470 | "Web Attack - XSS 18\n", 471 | "Name: Label, dtype: int64\n", 472 | "Attempted Category count after labelling:\r\n", 473 | " -1 455653\n", 474 | " 0 3222\n", 475 | " 4 71\n", 476 | " 2 22\n", 477 | "Name: Attempted Category, dtype: int64\n", 478 | "labels before pre-processing: BENIGN 288566\n", 479 | "Infiltration 36\n", 480 | "Name: Label, dtype: int64\n", 481 | "labels after pre-processing: NeedManualLabel 288602\n", 482 | "Name: Label, dtype: int64\n", 483 | "label count after labelling:\r\n", 484 | " BENIGN 227426\n", 485 | "Infiltration - Portscan 61106\n", 486 | "Infiltration 39\n", 487 | "Infiltration - Attempted 31\n", 488 | "Name: Label, dtype: int64\n", 489 | "Attempted Category count after labelling:\r\n", 490 | " -1 288571\n", 491 | " 0 28\n", 492 | " 2 3\n", 493 | "Name: Attempted Category, dtype: int64\n" 494 | ] 495 | } 496 | ], 497 | "source": [ 498 | "#---------------------+\n", 499 | "# THURSDAY 06-07-2017 |\n", 500 | "#---------------------+\n", 501 | "\n", 502 | "thursday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"thursday/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv\")\n", 503 | "\n", 504 | "# Web Attack - Brute Force\n", 505 | "# ------------------------\n", 506 | "\n", 507 | "label_flows(thursday_df, \"Web Attack - Brute Force - Attempted\", 1499343354880049000, 1499343531179279000,\n", 508 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], attempted_category=2, also_flip_flow_direction=True)\n", 509 | "\n", 510 | "label_flows(thursday_df, \"Web Attack - Brute Force\", 1499343567660566000, 1499346011622209000,\n", 511 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], additional_filters=\n", 512 | " [\n", 513 | " (thursday_df[\"Total Fwd Packets\"] > 20) | (thursday_df[\"Source Port\"] == 44464)\n", 514 | " ])\n", 515 | "#Flip\n", 516 | "label_flows(thursday_df, \"Web Attack - Brute Force\", 1499343567660566000, 1499346011622209000,\n", 517 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], additional_filters=\n", 518 | " [\n", 519 | " (thursday_df[\"Total Backward Packets\"] > 20) | (thursday_df[\"Destination Port\"] == 44464)\n", 520 | " ])\n", 521 | "\n", 522 | "#Payload filter (can't use switch_flow_direction because there are additional_filters)\n", 523 | "label_flows(thursday_df, \"Web Attack - Brute Force - Attempted\", 1499343567660566000, 1499346011622209000,\n", 524 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], payload_filter=True, attempted_category=0,\n", 525 | " additional_filters=\n", 526 | " [~((thursday_df[\"Total Fwd Packets\"] > 20) | (thursday_df[\"Source Port\"] == 44464))])\n", 527 | "\n", 528 | "label_flows(thursday_df, \"Web Attack - Brute Force - Attempted\", 1499343567660566000, 1499346011622209000,\n", 529 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=0,\n", 530 | " additional_filters=\n", 531 | " [\n", 532 | " ~((thursday_df[\"Total Backward Packets\"] > 20) | (thursday_df[\"Destination Port\"] == 44464))\n", 533 | " & (thursday_df[\"Total Length of Bwd Packets\"] == 0)\n", 534 | " ])\n", 535 | "\n", 536 | "label_flows(thursday_df, \"Web Attack - Brute Force - Attempted\", 1499343567660566000, 1499346011622209000,\n", 537 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], attempted_category=4,\n", 538 | " additional_filters=\n", 539 | " [\n", 540 | " (thursday_df[\"Total Length of Fwd Packets\"] > 0) & ~(thursday_df[\"Source Port\"] == 44464) &\n", 541 | " (thursday_df[\"Total Fwd Packets\"] == 4) & (thursday_df[\"Total Backward Packets\"] == 4)\n", 542 | " ])\n", 543 | "\n", 544 | "label_flows(thursday_df, \"Web Attack - Brute Force - Attempted\", 1499343567660566000, 1499346011622209000,\n", 545 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=4,\n", 546 | " additional_filters=\n", 547 | " [\n", 548 | " (thursday_df[\"Total Length of Bwd Packets\"] > 0) & ~(thursday_df[\"Destination Port\"] == 44464) &\n", 549 | " (thursday_df[\"Total Backward Packets\"] == 4) & (thursday_df[\"Total Fwd Packets\"] == 4)\n", 550 | " ])\n", 551 | "\n", 552 | "# Web Attack - XSS\n", 553 | "# ----------------\n", 554 | "\n", 555 | "label_flows(thursday_df, \"Web Attack - XSS\", 1499346935283859000, 1499348121341704000, [\"172.16.0.1\"],\n", 556 | " [\"192.168.10.50\"], dst_port_list=[80], additional_filters=\n", 557 | " [\n", 558 | " ~(thursday_df[\"Source Port\"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &\n", 559 | " (thursday_df[\"Total Fwd Packets\"] >= 150)\n", 560 | " ])\n", 561 | "#Flip\n", 562 | "label_flows(thursday_df, \"Web Attack - XSS\", 1499346935283859000, 1499348121341704000,\n", 563 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], additional_filters=\n", 564 | " [\n", 565 | " ~(thursday_df[\"Destination Port\"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &\n", 566 | " (thursday_df[\"Total Backward Packets\"] >= 150)\n", 567 | " ])\n", 568 | "\n", 569 | "label_flows(thursday_df, \"Web Attack - XSS - Attempted\", 1499346935283859000, 1499348121341704000, [\"172.16.0.1\"],\n", 570 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=\n", 571 | " [\n", 572 | " ~(thursday_df[\"Source Port\"].isin([36180, 36182, 36184, 36186, 36188, 36190]))])\n", 573 | "#Flip\n", 574 | "label_flows(thursday_df, \"Web Attack - XSS - Attempted\", 1499346935283859000, 1499348121341704000, [\"172.16.0.1\"],\n", 575 | " [\"192.168.10.50\"], src_port_list=[80], attempted_category=0, additional_filters=\n", 576 | " [\n", 577 | " ~(thursday_df[\"Destination Port\"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &\n", 578 | " (thursday_df[\"Total Length of Bwd Packets\"] == 0)\n", 579 | " ])\n", 580 | "\n", 581 | "label_flows(thursday_df, \"Web Attack - XSS - Attempted\", 1499346935283859000, 1499348121341704000, [\"172.16.0.1\"],\n", 582 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=2, additional_filters=\n", 583 | " [\n", 584 | " ~(thursday_df[\"Source Port\"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &\n", 585 | " (thursday_df[\"Total Length of Fwd Packets\"] > 0) & (thursday_df[\"Total Fwd Packets\"] < 150)\n", 586 | " ])\n", 587 | "\n", 588 | "#Flip\n", 589 | "label_flows(thursday_df, \"Web Attack - XSS - Attempted\", 1499346935283859000, 1499348121341704000,\n", 590 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=2, additional_filters=\n", 591 | " [\n", 592 | " ~(thursday_df[\"Destination Port\"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &\n", 593 | " (thursday_df[\"Total Length of Bwd Packets\"] > 0) & (thursday_df[\"Total Backward Packets\"] < 150)\n", 594 | " ])\n", 595 | "\n", 596 | "# Web Attack - SQL Injection\n", 597 | "# --------------------------\n", 598 | "\n", 599 | "label_flows(thursday_df, \"Web Attack - SQL Injection - Attempted\", 1499348127852814000, 1499348145720612000,\n", 600 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], attempted_category=2,\n", 601 | " additional_filters=[\n", 602 | " thursday_df[\"Source Port\"].isin([36180, 36182, 36184, 36186, 36188])\n", 603 | " ])\n", 604 | "\n", 605 | "#Flip\n", 606 | "label_flows(thursday_df, \"Web Attack - SQL Injection - Attempted\", 1499348127852814000, 1499348145720612000,\n", 607 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=2,\n", 608 | " additional_filters=[\n", 609 | " thursday_df[\"Destination Port\"].isin([36180, 36182, 36184, 36186, 36188])\n", 610 | " ])\n", 611 | "\n", 612 | "label_flows(thursday_df, \"Web Attack - SQL Injection\", 1499348145732950000, 1499348575320284000,\n", 613 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80],\n", 614 | " additional_filters=[\n", 615 | " ~(thursday_df[\"Source Port\"].isin([36180, 36182, 36184, 36186, 36188]))\n", 616 | " ])\n", 617 | "\n", 618 | "#Flip\n", 619 | "label_flows(thursday_df, \"Web Attack - SQL Injection\", 1499348145732950000, 1499348575320284000,\n", 620 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80],\n", 621 | " additional_filters=[\n", 622 | " ~(thursday_df[\"Destination Port\"].isin([36180, 36182, 36184, 36186, 36188]))\n", 623 | " ])\n", 624 | "#Payload filter\n", 625 | "label_flows(thursday_df, \"Web Attack - SQL Injection - Attempted\", 1499348127852814000, 1499348145720612000,\n", 626 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0,\n", 627 | " payload_filter=True, also_flip_flow_direction=True)\n", 628 | "\n", 629 | "\n", 630 | "label_rest_as_benign_and_write_csv(thursday_df,\n", 631 | " OUTPUT_PATH + \"Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv\")\n", 632 | "\n", 633 | "# Infiltration\n", 634 | "# 5.1 Dropbox Download\n", 635 | "# ------------\n", 636 | "thursday_df = read_csvs_from_path_and_reformat(DATASET_PATH +\n", 637 | " \"thursday/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv\")\n", 638 | "\n", 639 | "label_flows(thursday_df, \"Infiltration\", 1499361542547210000, 1499366769364731000, [\"192.168.10.8\"], [\"205.174.165.73\"],\n", 640 | " also_flip_flow_direction=True)\n", 641 | "\n", 642 | "#Payload filter\n", 643 | "label_flows(thursday_df, \"Infiltration - Attempted\", 1499361542547210000, 1499366769364731000, [\"192.168.10.8\"],\n", 644 | " [\"205.174.165.73\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n", 645 | "\n", 646 | "\n", 647 | "label_flows(thursday_df, \"Infiltration - Attempted\", 1499361228830533000, 1499361301251276000 , [\"192.168.10.9\"],\n", 648 | " [\"205.174.165.73\"], attempted_category=2, also_flip_flow_direction=True)\n", 649 | "\n", 650 | "# 5.2 Cooldisk Mac\n", 651 | "\n", 652 | "label_flows(thursday_df, \"Infiltration\", 1499363616453990000, 1499371339347892000, [\"192.168.10.25\"], [\"205.174.165.73\"],\n", 653 | " also_flip_flow_direction=True)\n", 654 | "\n", 655 | "#Payload filter\n", 656 | "label_flows(thursday_df, \"Infiltration - Attempted\", 1499363616453990000, 1499371339347892000, [\"192.168.10.25\"],\n", 657 | " [\"205.174.165.73\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n", 658 | "\n", 659 | "\n", 660 | "# 5.3 NMAP + Portscan\n", 661 | "\n", 662 | "# Round 1\n", 663 | "\n", 664 | "label_flows(thursday_df, \"Infiltration - Portscan\", 1499360400000000000, 1499360460000000000, [\"172.16.0.1\"],\n", 665 | " [\"192.168.10.51\"], additional_filters=[\n", 666 | " (thursday_df[\"Source Port\"] == 50122)\n", 667 | " ])\n", 668 | "\n", 669 | "# Round 2\n", 670 | "\n", 671 | "label_flows(thursday_df, \"Infiltration - Portscan\", 1499362410884008000, 1499362444285175000, [\"192.168.10.8\"],\n", 672 | " [\"192.168.10.5\"])\n", 673 | "\n", 674 | "# Round 3\n", 675 | "\n", 676 | "label_flows(thursday_df, \"Infiltration - Portscan\", 1499364314425162000, 1499366764331875000, [\"192.168.10.8\"],\n", 677 | " [\"192.168.10.5\", \"192.168.10.9\", \"192.168.10.12\", \"192.168.10.14\", \"192.168.10.15\", \"192.168.10.16\",\n", 678 | " \"192.168.10.17\", \"192.168.10.19\", \"192.168.10.25\", \"192.168.10.50\", \"192.168.10.51\"], additional_filters= [\n", 679 | " ~((thursday_df[\"Fwd Packet Length Max\"] == 408) & (thursday_df[\"Destination IP\"] == \"192.168.10.50\")) &\n", 680 | " ~((thursday_df[\"Total Length of Fwd Packets\"].isin([176, 20514])) & (thursday_df[\"Destination IP\"] == \"192.168.10.50\"))\n", 681 | " ]\n", 682 | ")\n", 683 | "\n", 684 | "label_rest_as_benign_and_write_csv(thursday_df,\n", 685 | " OUTPUT_PATH + \"Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv\")\n", 686 | "\n", 687 | "thursday_df = None" 688 | ], 689 | "metadata": { 690 | "collapsed": false 691 | } 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 6, 696 | "outputs": [ 697 | { 698 | "name": "stdout", 699 | "output_type": "stream", 700 | "text": [ 701 | "labels before pre-processing: PortScan 158930\n", 702 | "BENIGN 127537\n", 703 | "Name: Label, dtype: int64\n", 704 | "labels after pre-processing: NeedManualLabel 286467\n", 705 | "Name: Label, dtype: int64\n", 706 | "label count after labelling:\r\n", 707 | " Portscan 158939\n", 708 | "BENIGN 126905\n", 709 | "Botnet - Attempted 623\n", 710 | "Name: Label, dtype: int64\n", 711 | "Attempted Category count after labelling:\r\n", 712 | " -1 285844\n", 713 | " 1 623\n", 714 | "Name: Attempted Category, dtype: int64\n", 715 | "labels before pre-processing: BENIGN 189067\n", 716 | "Bot 1966\n", 717 | "Name: Label, dtype: int64\n", 718 | "labels after pre-processing: NeedManualLabel 191033\n", 719 | "Name: Label, dtype: int64\n", 720 | "label count after labelling:\r\n", 721 | " BENIGN 189071\n", 722 | "Botnet 1472\n", 723 | "Botnet - Attempted 490\n", 724 | "Name: Label, dtype: int64\n", 725 | "Attempted Category count after labelling:\r\n", 726 | " -1 190543\n", 727 | " 1 490\n", 728 | "Name: Attempted Category, dtype: int64\n", 729 | "labels before pre-processing: DDoS 128027\n", 730 | "BENIGN 97718\n", 731 | "Name: Label, dtype: int64\n", 732 | "labels after pre-processing: NeedManualLabel 225745\n", 733 | "Name: Label, dtype: int64\n", 734 | "label count after labelling:\r\n", 735 | " DDoS 159366\n", 736 | "BENIGN 66028\n", 737 | "Botnet - Attempted 350\n", 738 | "DDoS - Attempted 1\n", 739 | "Name: Label, dtype: int64\n", 740 | "Attempted Category count after labelling:\r\n", 741 | " -1 225394\n", 742 | " 1 350\n", 743 | " 0 1\n", 744 | "Name: Attempted Category, dtype: int64\n" 745 | ] 746 | } 747 | ], 748 | "source": [ 749 | "#-------------------+\n", 750 | "# FRIDAY 07-07-2017 |\n", 751 | "#-------------------+\n", 752 | "\n", 753 | "friday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"friday/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv\")\n", 754 | "\n", 755 | "# Portscan\n", 756 | "# --------\n", 757 | "\n", 758 | "#First round\n", 759 | "label_flows(friday_df, \"Portscan\", 1499446532117090000, 1499447948582083000, [\"172.16.0.1\"], [\"192.168.10.50\"],\n", 760 | " also_flip_flow_direction=True)\n", 761 | "\n", 762 | "\n", 763 | "#Second round\n", 764 | "label_flows(friday_df, \"Portscan\", 1499449860000000000, 1499449919000000000, [\"172.16.0.1\"], [\"192.168.10.50\"],\n", 765 | " additional_filters=[\n", 766 | " ~(friday_df[\"Source Port\"].isin([0, 35952, 35954, 35956, 35958]))\n", 767 | " ]\n", 768 | ")\n", 769 | "\n", 770 | "label_flows(friday_df, \"Portscan\", 1499449920000000000, 1499451841699238000, [\"172.16.0.1\"], [\"192.168.10.50\"])\n", 771 | "\n", 772 | "#Putting Bot labelling in here too because Bot occurs throughout the day\n", 773 | "label_flows(friday_df, \"Botnet\", 1499432653990571000, 1499436122903736000, [\"192.168.10.15\", \"192.168.10.9\",\n", 774 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], also_flip_flow_direction=True)\n", 775 | "\n", 776 | "#Payload filter\n", 777 | "label_flows(friday_df, \"Botnet - Attempted\", 1499432653990571000, 1499436122903736000, [\"192.168.10.15\", \"192.168.10.9\",\n", 778 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], attempted_category=0,\n", 779 | " payload_filter=True, also_flip_flow_direction=True)\n", 780 | "\n", 781 | "\n", 782 | "label_flows(friday_df, \"Botnet - Attempted\", 1499436180000000000, 1499457684606663000, [\"192.168.10.15\", \"192.168.10.9\",\n", 783 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], attempted_category=1, also_flip_flow_direction=True)\n", 784 | "\n", 785 | "label_rest_as_benign_and_write_csv(friday_df,\n", 786 | " OUTPUT_PATH + \"Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv\")\n", 787 | "\n", 788 | "# Botnet\n", 789 | "# ------\n", 790 | "\n", 791 | "friday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"friday/Friday-WorkingHours-Morning.pcap_ISCX.csv\")\n", 792 | "\n", 793 | "label_flows(friday_df, \"Botnet\", 1499432653990571000, 1499436122903736000, [\"192.168.10.15\", \"192.168.10.9\",\n", 794 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], also_flip_flow_direction=True)\n", 795 | "\n", 796 | "#Payload filter\n", 797 | "label_flows(friday_df, \"Botnet - Attempted\", 1499432653990571000, 1499436122903736000, [\"192.168.10.15\", \"192.168.10.9\",\n", 798 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], attempted_category=0,\n", 799 | " payload_filter=True, also_flip_flow_direction=True)\n", 800 | "\n", 801 | "label_flows(friday_df, \"Botnet - Attempted\", 1499436180000000000, 1499457684606663000, [\"192.168.10.15\", \"192.168.10.9\",\n", 802 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], attempted_category=1, also_flip_flow_direction=True)\n", 803 | "\n", 804 | "label_rest_as_benign_and_write_csv(friday_df,\n", 805 | " OUTPUT_PATH + \"Friday-WorkingHours-Morning.pcap_ISCX.csv\")\n", 806 | "\n", 807 | "# DDoS\n", 808 | "# ----\n", 809 | "\n", 810 | "friday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"friday/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv\")\n", 811 | "\n", 812 | "label_flows(friday_df, \"DDoS\", 1499453791796937000, 1499454972216560000, [\"172.16.0.1\"], [\"192.168.10.50\"],\n", 813 | " also_flip_flow_direction=True)\n", 814 | "\n", 815 | "# Payload filter\n", 816 | "label_flows(friday_df, \"DDoS - Attempted\", 1499453791796937000, 1499454972216560000, [\"172.16.0.1\"], [\"192.168.10.50\"],\n", 817 | " attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n", 818 | "\n", 819 | "label_flows(friday_df, \"DDoS - Attempted\", 1499453791796937000, 1499454972216560000, [\"192.168.10.50\"], [\"172.16.0.1\"],\n", 820 | " attempted_category=0, additional_filters=[\n", 821 | " (friday_df[\"Total Length of Bwd Packets\"] == 0)\n", 822 | " ])\n", 823 | "\n", 824 | "# Putting Bot labelling in here too because Bot occurs throughout the day\n", 825 | "label_flows(friday_df, \"Botnet\", 1499432653990571000, 1499436122903736000, [\"192.168.10.15\", \"192.168.10.9\",\n", 826 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], also_flip_flow_direction=True)\n", 827 | "\n", 828 | "#Payload filter\n", 829 | "label_flows(friday_df, \"Botnet - Attempted\", 1499432653990571000, 1499436122903736000, [\"192.168.10.15\", \"192.168.10.9\",\n", 830 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], attempted_category=0,\n", 831 | " payload_filter=True, also_flip_flow_direction=True)\n", 832 | "\n", 833 | "label_flows(friday_df, \"Botnet - Attempted\", 1499436180000000000, 1499457684606663000, [\"192.168.10.15\", \"192.168.10.9\",\n", 834 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], attempted_category=1, also_flip_flow_direction=True)\n", 835 | "\n", 836 | "\n", 837 | "label_rest_as_benign_and_write_csv(friday_df, OUTPUT_PATH + \"Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv\")" 838 | ], 839 | "metadata": { 840 | "collapsed": false 841 | } 842 | }, 843 | { 844 | "cell_type": "code", 845 | "execution_count": 19, 846 | "outputs": [], 847 | "source": [ 848 | "\n", 849 | "\n", 850 | "\n", 851 | "\n" 852 | ], 853 | "metadata": { 854 | "collapsed": false 855 | } 856 | } 857 | ], 858 | "metadata": { 859 | "kernelspec": { 860 | "display_name": "Python 3", 861 | "language": "python", 862 | "name": "python3" 863 | }, 864 | "language_info": { 865 | "codemirror_mode": { 866 | "name": "ipython", 867 | "version": 2 868 | }, 869 | "file_extension": ".py", 870 | "mimetype": "text/x-python", 871 | "name": "python", 872 | "nbconvert_exporter": "python", 873 | "pygments_lexer": "ipython2", 874 | "version": "2.7.6" 875 | } 876 | }, 877 | "nbformat": 4, 878 | "nbformat_minor": 0 879 | } 880 | -------------------------------------------------------------------------------- /Labelling/CICIDS2018_labelling_fixed_CICFlowMeter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "outputs": [], 7 | "source": [ 8 | "import pandas as pd\n", 9 | "import numpy as np\n", 10 | "import glob\n", 11 | "import os\n", 12 | "from sys import platform\n", 13 | "\n", 14 | "# THIS LABELLING SCRIPT IS USED TO LABEL THE CORRECTED VERSION OF CSE-CIC-IDS-2018.\n", 15 | "# FOR DETAILS CONSULT OUR WEBSITE:\n", 16 | "# https://intrusion-detection.distrinet-research.be/CNS2022/index.html\n", 17 | "\n", 18 | "\n", 19 | "pd.set_option('display.max_rows', 100)\n", 20 | "\n", 21 | "# Enter the path that contains the CSV files that were generated by the CICFlowMeter tool. The directory structure should\n", 22 | "# be the following:\n", 23 | "# The dataset path should contain separate subdirectories for each day (e.g. \"Wednesday-14-02-2018\"). In each\n", 24 | "# of these directories, there should be a directory called \"csv\" which contains the CSV files as generated by the\n", 25 | "# CICFlowMeter tool.\n", 26 | "DATASET_PATH = \"\"\n", 27 | "\n", 28 | "# If set to true, a column is added at the front of the CSV with line numbers\n", 29 | "print_index = True" 30 | ], 31 | "metadata": { 32 | "collapsed": false 33 | } 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "outputs": [], 39 | "source": [ 40 | "# Basic preprocessing before getting started on labelling.\n", 41 | "# Deletes rows with \"Infinity\" and NaNs, converts \"Timestamp\" to Pandas Datetime, and converts all necessary columns to\n", 42 | "# numeric values\n", 43 | "def format_csv_for_labelling(df):\n", 44 | " df = df.replace('Infinity', np.nan)\n", 45 | " df['Timestamp'] = pd.to_datetime(df['Timestamp'])\n", 46 | " for column in df.columns:\n", 47 | " if column not in ['Flow ID' , 'Timestamp', 'Src IP', 'Dst IP', 'Label']:\n", 48 | " df[column] = pd.to_numeric(df[column], errors='coerce')\n", 49 | " return df.dropna()\n", 50 | "\n", 51 | "# Reads all csvs of one day and concatenates them into one dataframe\n", 52 | "def read_csvs_from_path_and_reformat(path):\n", 53 | " csv_dataframes = []\n", 54 | "\n", 55 | " all_files = glob.glob(path + \"/*.csv\")\n", 56 | " for file in all_files:\n", 57 | " csv_dataframes.extend([pd.read_csv(file)])\n", 58 | " df = pd.concat(csv_dataframes, ignore_index=True)\n", 59 | "\n", 60 | " print(\"labels before pre-processing:\", df[\"Label\"].value_counts())\n", 61 | " df = format_csv_for_labelling(df)\n", 62 | " print(\"labels after pre-processing:\", df[\"Label\"].value_counts())\n", 63 | "\n", 64 | " df[\"Attempted Category\"] = -1\n", 65 | "\n", 66 | " int64_columns = [\"Total TCP Flow Time\"]\n", 67 | "\n", 68 | " int32_columns = [\"Src Port\", \"Dst Port\", \"Flow Duration\", \"Total Fwd Packet\", \"Total Bwd packets\", \"Total Length of Fwd Packet\", \"Total Length of Bwd Packet\", \"Fwd Packet Length Max\",\n", 69 | " \"Fwd Packet Length Min\", \"Bwd Packet Length Max\", \"Bwd Packet Length Min\", \"Flow IAT Max\", \"Flow IAT Min\", \"Fwd IAT Total\", \"Fwd IAT Max\", \"Fwd IAT Min\", \"Bwd IAT Total\",\n", 70 | " \"Bwd IAT Max\", \"Bwd IAT Min\", \"Fwd PSH Flags\", \"Bwd PSH Flags\", \"Fwd URG Flags\", \"Bwd URG Flags\", \"Packet Length Min\", \"Packet Length Max\", \"FIN Flag Count\", \"SYN Flag Count\", \"RST Flag Count\", \"PSH Flag Count\",\n", 71 | " \"ACK Flag Count\", \"URG Flag Count\", \"CWR Flag Count\", \"ECE Flag Count\", \"Subflow Fwd Packets\", \"Subflow Fwd Bytes\",\n", 72 | " \"Subflow Bwd Packets\", \"Subflow Bwd Bytes\", \"FWD Init Win Bytes\", \"Bwd Init Win Bytes\", \"Fwd Act Data Pkts\", \"Fwd Seg Size Min\", \"Active Max\",\n", 73 | " \"Active Min\", \"Idle Max\", \"Idle Min\"]\n", 74 | "\n", 75 | " int16_columns = [\"Fwd Header Length\", \"Bwd Header Length\", \"ICMP Code\", \"ICMP Type\"]\n", 76 | "\n", 77 | " for column in int64_columns:\n", 78 | " df[column] = df[column].astype('int64')\n", 79 | "\n", 80 | " for column in int32_columns:\n", 81 | " df[column] = df[column].astype('int32')\n", 82 | "\n", 83 | " for column in int16_columns:\n", 84 | " df[column] = df[column].astype('int16')\n", 85 | "\n", 86 | " return df\n", 87 | "\n", 88 | "\n", 89 | "# Main labelling function. Only used for labelling Malicious and Malicious - Attempted flows.\n", 90 | "# Timestamps are in NANOSECONDS (!) Unix time. Note that the CSV files are in the UTC timezone.\n", 91 | "# df = dataframe with flows. Note that labelling happens inplace on the 'df' parameter, and so this function doesn't return anything\n", 92 | "# label = the label that will be given to flows matching the criteria specified in the function\n", 93 | "# additional_filters = add any additional constraints that cannot be covered by the already provided function arguments\n", 94 | "# see examples in the actual labelling logic for correct syntax\n", 95 | "# attempted_category = please consult our website (https://intrusion-detection.distrinet-research.be/CNS2022/Tools_Documentation.html)\n", 96 | "# for details on how the \"Attempted\" categories are defined.\n", 97 | "# payload_filter = When set to true, this will automatically add a constraint [\"Total Length of Fwd Packet\"] == 0. Note that\n", 98 | "# the Attempted label and category still need to be specified manually\n", 99 | "def label_flows(df, label, attack_start_time_nanoseconds, attack_end_time_nanoseconds, src_ip_list=None,\n", 100 | " dst_ip_list=None, dst_port_list=None, attempted_category=-1, additional_filters=[], payload_filter = False):\n", 101 | "\n", 102 | " # Create initial mask with all values set to True. Squeeze is necessary to remove second axis (of size 1)\n", 103 | " # The reason is that a df of shape (X,), if you '&' it with a df of shape (X,1), gets converted to (1,X)\n", 104 | " custom_mask = pd.DataFrame(True, index=df.index, columns=[df.columns[0]]).squeeze()\n", 105 | "\n", 106 | " attack_start_datetime = pd.to_datetime(attack_start_time_nanoseconds, unit='ns')\n", 107 | " attack_end_datetime = pd.to_datetime(attack_end_time_nanoseconds, unit='ns')\n", 108 | "\n", 109 | " custom_mask &= (df[\"Timestamp\"] >= attack_start_datetime)\n", 110 | " custom_mask &= (df[\"Timestamp\"] <= attack_end_datetime)\n", 111 | "\n", 112 | " if src_ip_list is not None:\n", 113 | " custom_mask &= (df[\"Src IP\"].isin(src_ip_list))\n", 114 | " if dst_ip_list is not None:\n", 115 | " custom_mask &= (df[\"Dst IP\"].isin(dst_ip_list))\n", 116 | "\n", 117 | " if dst_port_list is not None:\n", 118 | " custom_mask &= (df[\"Dst Port\"].isin(dst_port_list))\n", 119 | "\n", 120 | " if payload_filter:\n", 121 | " custom_mask &= (df[\"Total Length of Fwd Packet\"] == 0)\n", 122 | "\n", 123 | " for filter in additional_filters:\n", 124 | " custom_mask &= filter\n", 125 | "\n", 126 | " df[\"Label\"].mask(custom_mask, label, inplace=True)\n", 127 | " df[\"Attempted Category\"].mask(custom_mask, attempted_category, inplace=True)\n", 128 | "\n", 129 | "# This function is called when all labelling of malicious flows is completed. Anything that has not yet received a label\n", 130 | "# so far is labelled as Benign.\n", 131 | "def label_rest_as_benign_and_write_csv(df, file_to_write):\n", 132 | " df[\"Label\"].mask(df[\"Label\"] == \"NeedManualLabel\", \"BENIGN\", inplace=True)\n", 133 | "\n", 134 | " # Relabel artefact flows with [Flow Id] = '8.0.6.4-8.6.0.1-0-0-0' to label = 0\n", 135 | " df[\"Label\"].mask(df[\"Flow ID\"] == '8.0.6.4-8.6.0.1-0-0-0', \"BENIGN\", inplace=True)\n", 136 | "\n", 137 | " print(\"label count after labelling:\\r\\n\", df[\"Label\"].value_counts())\n", 138 | " print(\"Attempted Category count after labelling:\\r\\n\", df[\"Attempted Category\"].value_counts())\n", 139 | "\n", 140 | " if print_index:\n", 141 | " df.reset_index(inplace=True, drop=True)\n", 142 | " df.index += 1\n", 143 | " df.index.name = 'id'\n", 144 | " df.to_csv(file_to_write)\n", 145 | " else:\n", 146 | " df.to_csv(file_to_write, index=False)" 147 | ], 148 | "metadata": { 149 | "collapsed": false 150 | } 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 3, 155 | "outputs": [ 156 | { 157 | "name": "stdout", 158 | "output_type": "stream", 159 | "text": [ 160 | "labels before pre-processing: NeedManualLabel 6268692\n", 161 | "Name: Label, dtype: int64\n", 162 | "labels after pre-processing: NeedManualLabel 5898350\n", 163 | "Name: Label, dtype: int64\n", 164 | "label count after labelling:\r\n", 165 | " BENIGN 5610799\n", 166 | "FTP-BruteForce - Attempted 193354\n", 167 | "SSH-BruteForce 94197\n", 168 | "Name: Label, dtype: int64\n", 169 | "Attempted Category count after labelling:\r\n", 170 | " -1 5704996\n", 171 | " 1 193324\n", 172 | " 4 30\n", 173 | "Name: Attempted Category, dtype: int64\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "#----------------------+\n", 179 | "# WEDNESDAY 14-02-2018 |\n", 180 | "#----------------------+\n", 181 | "\n", 182 | "dir_name = \"Wednesday-14-02-2018\"\n", 183 | "wednesday_14022018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n", 184 | "\n", 185 | "#-- FTP-BruteForce\n", 186 | "label_flows(wednesday_14022018_df, \"FTP-BruteForce - Attempted\", 1518618806*(10**9),\n", 187 | " 1518624631*(10**9), [\"18.221.219.4\"], [\"172.31.69.25\"], attempted_category=1)\n", 188 | "\n", 189 | "# FTP-BruteForce - Attempted (tool accidentally got launched in FTP bruteforce mode instead of SSH bruteforce mode)\n", 190 | "# Note that, in order to avoid float imprecisions at the micro- and nanosecond level, the UNIX timestamps such as\n", 191 | "# 1518631281.199541000, which is in seconds, needs to be converted to nanoseconds, so that the number is stored\n", 192 | "# in int64 instead of float.\n", 193 | "label_flows(wednesday_14022018_df, \"FTP-BruteForce - Attempted\", 1518631281199541000,\n", 194 | " 1518631281502585000, [\"13.58.98.64\"], [\"172.31.69.25\"], [21], attempted_category=4)\n", 195 | "\n", 196 | "#-- SSH-BruteForce\n", 197 | "label_flows(wednesday_14022018_df, \"SSH-BruteForce\", 1518631310*(10**9),\n", 198 | " 1518636750*(10**9), [\"13.58.98.64\"], [\"172.31.69.25\"], [22])\n", 199 | "# Payload filter\n", 200 | "label_flows(wednesday_14022018_df, \"SSH-BruteForce - Attempted\", 1518631310*(10**9),\n", 201 | " 1518636750*(10**9), [\"13.58.98.64\"], [\"172.31.69.25\"], [22], attempted_category=0, payload_filter=True)\n", 202 | "\n", 203 | "label_rest_as_benign_and_write_csv(wednesday_14022018_df, DATASET_PATH + dir_name + \".csv\")" 204 | ], 205 | "metadata": { 206 | "collapsed": false 207 | } 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 9, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "labels before pre-processing: NeedManualLabel 5762777\n", 218 | "Name: Label, dtype: int64\n", 219 | "labels after pre-processing: NeedManualLabel 5410102\n", 220 | "Name: Label, dtype: int64\n", 221 | "label count after labelling:\r\n", 222 | " BENIGN 5372471\n", 223 | "DoS GoldenEye 22560\n", 224 | "DoS Slowloris 8490\n", 225 | "DoS GoldenEye - Attempted 4301\n", 226 | "DoS Slowloris - Attempted 2280\n", 227 | "Name: Label, dtype: int64\n", 228 | "Attempted Category count after labelling:\r\n", 229 | " -1 5403521\n", 230 | " 4 4248\n", 231 | " 0 2280\n", 232 | " 6 53\n", 233 | "Name: Attempted Category, dtype: int64\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "#---------------------+\n", 239 | "# THURSDAY 15-02-2018 |\n", 240 | "#---------------------+\n", 241 | "\n", 242 | "dir_name=\"Thursday-15-02-2018\"\n", 243 | "thursday_15022018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n", 244 | "\n", 245 | "#-- DoS GoldenEye\n", 246 | "label_flows(thursday_15022018_df, \"DoS GoldenEye\", 1518701262*(10**9), 1518703905*(10**9), [\"18.219.211.138\"],\n", 247 | " [\"172.31.69.25\"], additional_filters=\n", 248 | " [(thursday_15022018_df[\"Fwd RST Flags\"] == 0) |\n", 249 | " (thursday_15022018_df[\"Flow Duration\"] >= 5050000)])\n", 250 | "\n", 251 | "#-- DoS GoldenEye - Attempted\n", 252 | "label_flows(thursday_15022018_df, \"DoS GoldenEye - Attempted\", 1518701262*(10**9), 1518703905*(10**9), [\"18.219.211.138\"],\n", 253 | " [\"172.31.69.25\"], attempted_category=4, additional_filters=\n", 254 | " [thursday_15022018_df[\"Fwd RST Flags\"] > 0,\n", 255 | " thursday_15022018_df[\"Flow Duration\"] < 5050000])\n", 256 | "\n", 257 | "#-- DoS GoldenEye - Attempted\n", 258 | "label_flows(thursday_15022018_df, \"DoS GoldenEye - Attempted\", 1518701262*(10**9), 1518703905*(10**9), [\"18.219.211.138\"],\n", 259 | " [\"172.31.69.25\"], attempted_category=6, additional_filters=\n", 260 | " [thursday_15022018_df[\"Bwd RST Flags\"] == 1,\n", 261 | " thursday_15022018_df[\"Total Length of Bwd Packet\"] == 0,\n", 262 | " thursday_15022018_df[\"Flow Duration\"] > 100000000])\n", 263 | "\n", 264 | "# Payload filter\n", 265 | "label_flows(thursday_15022018_df, \"DoS GoldenEye - Attempted\", 1518701262*(10**9), 1518703905*(10**9), [\"18.219.211.138\"],\n", 266 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True)\n", 267 | "\n", 268 | "#-- DoS Slowloris\n", 269 | "label_flows(thursday_15022018_df, \"DoS Slowloris\", 1518706812*(10**9), 1518709321*(10**9), [\"18.217.165.70\"],\n", 270 | " [\"172.31.69.25\"])\n", 271 | "\n", 272 | "# Payload filter\n", 273 | "label_flows(thursday_15022018_df, \"DoS Slowloris - Attempted\", 1518706812*(10**9), 1518709321*(10**9), [\"18.217.165.70\"],\n", 274 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True)\n", 275 | "\n", 276 | "label_rest_as_benign_and_write_csv(thursday_15022018_df, DATASET_PATH + dir_name + \".csv\")" 277 | ], 278 | "metadata": { 279 | "collapsed": false 280 | } 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 5, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "labels before pre-processing: NeedManualLabel 7719001\n", 291 | "Name: Label, dtype: int64\n", 292 | "labels after pre-processing: NeedManualLabel 7390266\n", 293 | "Name: Label, dtype: int64\n", 294 | "label count after labelling:\r\n", 295 | " BENIGN 5481500\n", 296 | "DoS Hulk 1803160\n", 297 | "FTP-BruteForce - Attempted 105520\n", 298 | "DoS Hulk - Attempted 86\n", 299 | "Name: Label, dtype: int64\n", 300 | "Attempted Category count after labelling:\r\n", 301 | " -1 7284660\n", 302 | " 1 105520\n", 303 | " 0 86\n", 304 | "Name: Attempted Category, dtype: int64\n" 305 | ] 306 | } 307 | ], 308 | "source": [ 309 | "#-------------------+\n", 310 | "# FRIDAY 16-02-2018 |\n", 311 | "#-------------------+\n", 312 | "\n", 313 | "dir_name=\"Friday-16-02-2018\"\n", 314 | "friday_16022018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n", 315 | "\n", 316 | "#-- FTP-Patator - Attempted\n", 317 | "label_flows(friday_16022018_df, \"FTP-BruteForce - Attempted\", 1518790334*(10**9), 1518793513*(10**9), [\"13.59.126.31\"],\n", 318 | " [\"172.31.69.25\"], attempted_category=1)\n", 319 | "\n", 320 | "#-- DoS Hulk\n", 321 | "label_flows(friday_16022018_df, \"DoS Hulk\", 1518803127*(10**9), 1518803903*(10**9), [\"18.219.193.20\"], [\"172.31.69.25\"])\n", 322 | "\n", 323 | "# Payload filter\n", 324 | "label_flows(friday_16022018_df, \"DoS Hulk - Attempted\", 1518803127*(10**9), 1518803903*(10**9), [\"18.219.193.20\"],\n", 325 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True)\n", 326 | "\n", 327 | "#-- Dos Slowhttptest: No actual DoS Slowloris flows are present on this day in this dataset!\n", 328 | "# Instead we only find failed FTP-Patator traffic, which is exactly what is covered earlier in this cell\n", 329 | "\n", 330 | "label_rest_as_benign_and_write_csv(friday_16022018_df, DATASET_PATH + dir_name + \".csv\")" 331 | ], 332 | "metadata": { 333 | "collapsed": false 334 | } 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 6, 339 | "outputs": [ 340 | { 341 | "name": "stdout", 342 | "output_type": "stream", 343 | "text": [ 344 | "labels before pre-processing: NeedManualLabel 6411771\n", 345 | "Name: Label, dtype: int64\n", 346 | "labels after pre-processing: NeedManualLabel 6054702\n", 347 | "Name: Label, dtype: int64\n", 348 | "label count after labelling:\r\n", 349 | " BENIGN 5764497\n", 350 | "DDoS-LOIC-HTTP 289328\n", 351 | "DDoS-LOIC-UDP 797\n", 352 | "DDoS-LOIC-UDP - Attempted 80\n", 353 | "Name: Label, dtype: int64\n", 354 | "Attempted Category count after labelling:\r\n", 355 | " -1 6054622\n", 356 | " 6 80\n", 357 | "Name: Attempted Category, dtype: int64\n" 358 | ] 359 | } 360 | ], 361 | "source": [ 362 | "#--------------------+\n", 363 | "# TUESDAY 20-02-2018 |\n", 364 | "#--------------------+\n", 365 | "\n", 366 | "dir_name=\"Tuesday-20-02-2018\"\n", 367 | "tuesday_20022018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n", 368 | "\n", 369 | "#-- DDoS LOIC HTTP\n", 370 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-HTTP\", 1519136034*(10**9), 1519139809*(10**9),\n", 371 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 372 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 373 | " [\"172.31.69.25\"], additional_filters=[\n", 374 | " tuesday_20022018_df[\"Protocol\"] == 6\n", 375 | " ])\n", 376 | "\n", 377 | "# Payload filter\n", 378 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-HTTP - Attempted\", 1519136034*(10**9), 1519139809*(10**9),\n", 379 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 380 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 381 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True, additional_filters=[tuesday_20022018_df[\"Protocol\"] == 6])\n", 382 | "\n", 383 | "#-- DDoS LOIC UDP\n", 384 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-UDP\", 1519146857*(10**9), 1519147756*(10**9),\n", 385 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 386 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 387 | " [\"172.31.69.25\"], additional_filters=[\n", 388 | " tuesday_20022018_df[\"Protocol\"] == 17])\n", 389 | "\n", 390 | "# Payload filter\n", 391 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-UDP - Attempted\", 1519146857*(10**9), 1519147756*(10**9),\n", 392 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 393 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 394 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True, additional_filters=[tuesday_20022018_df[\"Protocol\"] == 17])\n", 395 | "\n", 396 | "# Attempted - Target unresponsive (the ICMP destination unreachable answers to the attack - using protocol = 1 for ICMP)\n", 397 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-UDP - Attempted\", 1519146857*(10**9), 1519147756*(10**9),\n", 398 | " [\"172.31.69.25\"], [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 399 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 400 | " attempted_category=6, additional_filters=[(tuesday_20022018_df[\"Protocol\"] == 1)])\n", 401 | "\n", 402 | "label_rest_as_benign_and_write_csv(tuesday_20022018_df, DATASET_PATH + dir_name + \".csv\")" 403 | ], 404 | "metadata": { 405 | "collapsed": false 406 | } 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 7, 411 | "outputs": [ 412 | { 413 | "name": "stdout", 414 | "output_type": "stream", 415 | "text": [ 416 | "labels before pre-processing: NeedManualLabel 7295839\n", 417 | "Name: Label, dtype: int64\n", 418 | "labels after pre-processing: NeedManualLabel 6962593\n", 419 | "Name: Label, dtype: int64\n", 420 | "label count after labelling:\r\n", 421 | " BENIGN 5878399\n", 422 | "DDoS-HOIC 1082293\n", 423 | "DDoS-LOIC-UDP 1730\n", 424 | "DDoS-LOIC-UDP - Attempted 171\n", 425 | "Name: Label, dtype: int64\n", 426 | "Attempted Category count after labelling:\r\n", 427 | " -1 6962422\n", 428 | " 6 171\n", 429 | "Name: Attempted Category, dtype: int64\n" 430 | ] 431 | } 432 | ], 433 | "source": [ 434 | "#----------------------+\n", 435 | "# WEDNESDAY 21-02-2018 |\n", 436 | "#----------------------+\n", 437 | "\n", 438 | "dir_name = \"Wednesday-21-02-2018\"\n", 439 | "wednesday_21022018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n", 440 | "\n", 441 | "#-- DDoS LOIC UDP\n", 442 | "label_flows(wednesday_21022018_df, \"DDoS-LOIC-UDP\", 1519222131*(10**9), 1519224219*(10**9),\n", 443 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 444 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 445 | " [\"172.31.69.28\"], additional_filters=[\n", 446 | " wednesday_21022018_df[\"Protocol\"] == 17\n", 447 | " ])\n", 448 | "\n", 449 | "# Payload filter\n", 450 | "label_flows(wednesday_21022018_df, \"DDoS-LOIC-UDP - Attempted\", 1519222131*(10**9), 1519224219*(10**9),\n", 451 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 452 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 453 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True, additional_filters=[wednesday_21022018_df[\"Protocol\"] == 17])\n", 454 | "\n", 455 | "# Attempted - Target unresponsive (the ICMP destination unreachable answers to the attack - using protocol = 1 for ICMP)\n", 456 | "label_flows(wednesday_21022018_df, \"DDoS-LOIC-UDP - Attempted\", 1519222131*(10**9), 1519224219*(10**9),\n", 457 | " [\"172.31.69.28\"], [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 458 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 459 | " attempted_category=6, additional_filters=[(wednesday_21022018_df[\"Protocol\"] == 1)])\n", 460 | "\n", 461 | "#-- DDoS HOIC\n", 462 | "label_flows(wednesday_21022018_df, \"DDoS-HOIC\", 1519236668*(10**9), 1519239955*(10**9),\n", 463 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 464 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 465 | " [\"172.31.69.28\"], additional_filters=[\n", 466 | " wednesday_21022018_df[\"Protocol\"] == 6\n", 467 | " ])\n", 468 | "\n", 469 | "# Payload filter\n", 470 | "label_flows(wednesday_21022018_df, \"DDoS-HOIC - Attempted\", 1519236668*(10**9), 1519239955*(10**9),\n", 471 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 472 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 473 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True, additional_filters=[wednesday_21022018_df[\"Protocol\"] == 6])\n", 474 | "\n", 475 | "label_rest_as_benign_and_write_csv(wednesday_21022018_df, DATASET_PATH + dir_name + \".csv\")" 476 | ], 477 | "metadata": { 478 | "collapsed": false 479 | } 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 8, 484 | "outputs": [ 485 | { 486 | "name": "stdout", 487 | "output_type": "stream", 488 | "text": [ 489 | "labels before pre-processing: NeedManualLabel 6483351\n", 490 | "Name: Label, dtype: int64\n", 491 | "labels after pre-processing: NeedManualLabel 6071153\n", 492 | "Name: Label, dtype: int64\n", 493 | "label count after labelling:\r\n", 494 | " BENIGN 6070945\n", 495 | "Web Attack - Brute Force - Attempted 76\n", 496 | "Web Attack - Brute Force 69\n", 497 | "Web Attack - XSS 40\n", 498 | "Web Attack - SQL 16\n", 499 | "Web Attack - SQL - Attempted 4\n", 500 | "Web Attack - XSS - Attempted 3\n", 501 | "Name: Label, dtype: int64\n", 502 | "Attempted Category count after labelling:\r\n", 503 | " -1 6071070\n", 504 | " 5 66\n", 505 | " 2 12\n", 506 | " 0 4\n", 507 | " 3 1\n", 508 | "Name: Attempted Category, dtype: int64\n" 509 | ] 510 | } 511 | ], 512 | "source": [ 513 | "#---------------------+\n", 514 | "# THURSDAY 22-02-2018 |\n", 515 | "#---------------------+\n", 516 | "\n", 517 | "dir_name = \"Thursday-22-02-2018\"\n", 518 | "thursday_22022018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n", 519 | "\n", 520 | "#-- Web Attack SQL\n", 521 | "label_flows(thursday_22022018_df, \"Web Attack - SQL\", 1519330590418906000, 1519331276022793000, [\"18.218.115.60\"],\n", 522 | " [\"172.31.69.28\"], additional_filters=\n", 523 | " [thursday_22022018_df[\"Total Length of Fwd Packet\"] > 0,\n", 524 | " thursday_22022018_df[\"Total Length of Bwd Packet\"] > 0])\n", 525 | "\n", 526 | "# Attack startup artefact\n", 527 | "label_flows(thursday_22022018_df, \"Web Attack - SQL - Attempted\", 1519330470169342000, 1519330498599986000, [\"18.218.115.60\"],\n", 528 | " [\"172.31.69.28\"], attempted_category=2)\n", 529 | "\n", 530 | "# Payload filter\n", 531 | "label_flows(thursday_22022018_df, \"Web Attack - SQL - Attempted\", 1519330590418906000, 1519331276022793000, [\"18.218.115.60\"],\n", 532 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True)\n", 533 | "\n", 534 | "#-- Web Attack XSS\n", 535 | "# Port 63782 is attack setup (navigating to website)\n", 536 | "label_flows(thursday_22022018_df, \"Web Attack - XSS\", 1519321899783923000, 1519324181827037000, [\"18.218.115.60\"],\n", 537 | " [\"172.31.69.28\"], additional_filters=\n", 538 | " [~(thursday_22022018_df[\"Src Port\"].isin([63782, 64144]))])\n", 539 | "\n", 540 | "# Attempted attack setup\n", 541 | "label_flows(thursday_22022018_df, \"Web Attack - XSS - Attempted\", 1519321899783923000, 1519324181827037000, [\"18.218.115.60\"],\n", 542 | " [\"172.31.69.28\"], attempted_category=2, additional_filters=\n", 543 | " [thursday_22022018_df[\"Src Port\"] == 63782])\n", 544 | "\n", 545 | "label_flows(thursday_22022018_df, \"Web Attack - XSS - Attempted\", 1519321899783923000, 1519324181827037000, [\"18.218.115.60\"],\n", 546 | " [\"172.31.69.28\"], attempted_category=3, additional_filters=\n", 547 | " [thursday_22022018_df[\"Src Port\"] == 64144])\n", 548 | "\n", 549 | "# Payload filter\n", 550 | "label_flows(thursday_22022018_df, \"Web Attack - XSS - Attempted\", 1519321899783923000, 1519324181827037000, [\"18.218.115.60\"],\n", 551 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True, additional_filters=\n", 552 | " [~(thursday_22022018_df[\"Src Port\"].isin([63782, 64144]))])\n", 553 | "\n", 554 | "#-- Web Attack Brute Force & Attempted\n", 555 | "\n", 556 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force\", 1519309071336902000, 1519313039858533000, [\"18.218.115.60\"],\n", 557 | " [\"172.31.69.28\"], additional_filters=\n", 558 | " [thursday_22022018_df[\"Total Fwd Packet\"] > 20])\n", 559 | "\n", 560 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force - Attempted\", 1519309071336902000, 1519313039858533000,\n", 561 | " [\"18.218.115.60\"], [\"172.31.69.28\"], attempted_category=5, additional_filters=\n", 562 | " [(thursday_22022018_df[\"Total Fwd Packet\"] <= 20) & (thursday_22022018_df[\"Total Length of Fwd Packet\"] > 0)])\n", 563 | "\n", 564 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force - Attempted\", 1519308824965705000, 1519308947920399000, [\"18.218.115.60\"],\n", 565 | " [\"172.31.69.28\"], attempted_category=2)\n", 566 | "\n", 567 | "# Payload filter\n", 568 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force - Attempted\", 1519309071336902000, 1519313039858533000,\n", 569 | " [\"18.218.115.60\"], [\"172.31.69.28\"], attempted_category=0, payload_filter=True)\n", 570 | "\n", 571 | "label_rest_as_benign_and_write_csv(thursday_22022018_df, DATASET_PATH + dir_name + \".csv\")\n" 572 | ], 573 | "metadata": { 574 | "collapsed": false 575 | } 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": 9, 580 | "outputs": [ 581 | { 582 | "name": "stdout", 583 | "output_type": "stream", 584 | "text": [ 585 | "labels before pre-processing: NeedManualLabel 6313169\n", 586 | "Name: Label, dtype: int64\n", 587 | "labels after pre-processing: NeedManualLabel 5976481\n", 588 | "Name: Label, dtype: int64\n", 589 | "label count after labelling:\r\n", 590 | " BENIGN 5976251\n", 591 | "Web Attack - XSS 73\n", 592 | "Web Attack - Brute Force 62\n", 593 | "Web Attack - Brute Force - Attempted 61\n", 594 | "Web Attack - SQL 23\n", 595 | "Web Attack - SQL - Attempted 10\n", 596 | "Web Attack - XSS - Attempted 1\n", 597 | "Name: Label, dtype: int64\n", 598 | "Attempted Category count after labelling:\r\n", 599 | " -1 5976409\n", 600 | " 5 60\n", 601 | " 0 6\n", 602 | " 2 6\n", 603 | "Name: Attempted Category, dtype: int64\n" 604 | ] 605 | } 606 | ], 607 | "source": [ 608 | "#-------------------+\n", 609 | "# FRIDAY 23-02-2018 |\n", 610 | "#-------------------+\n", 611 | "\n", 612 | "dir_name = \"Friday-23-02-2018\"\n", 613 | "friday_23022018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n", 614 | "\n", 615 | "#-- Web Attack SQL\n", 616 | "label_flows(friday_23022018_df, \"Web Attack - SQL\", 1519412792126122000, 1519413444947957000 , [\"18.218.115.60\"],\n", 617 | " [\"172.31.69.28\"], additional_filters=\n", 618 | " [friday_23022018_df[\"Total Length of Fwd Packet\"] > 0,\n", 619 | " friday_23022018_df[\"Total Length of Bwd Packet\"] > 0])\n", 620 | "\n", 621 | "# Attack startup artefact\n", 622 | "label_flows(friday_23022018_df, \"Web Attack - SQL - Attempted\", 1519412722675686000, 1519412787879296000, [\"18.218.115.60\"],\n", 623 | " [\"172.31.69.28\"], attempted_category=2)\n", 624 | "\n", 625 | "# Payload filter\n", 626 | "label_flows(friday_23022018_df, \"Web Attack - SQL - Attempted\", 1519412792126122000, 1519413444947957000 , [\"18.218.115.60\"],\n", 627 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True)\n", 628 | "\n", 629 | "#-- Web Attack XSS\n", 630 | "label_flows(friday_23022018_df, \"Web Attack - XSS\", 1519405264559707000, 1519409428237472000, [\"18.218.115.60\"],\n", 631 | " [\"172.31.69.28\"], additional_filters=\n", 632 | " [~(friday_23022018_df[\"Src Port\"].isin([59173]))])\n", 633 | "\n", 634 | "label_flows(friday_23022018_df, \"Web Attack - XSS - Attempted\", 1519405264559707000, 1519409428237472000, [\"18.218.115.60\"],\n", 635 | " [\"172.31.69.28\"], attempted_category=2, additional_filters=\n", 636 | " [(friday_23022018_df[\"Src Port\"].isin([59173]))])\n", 637 | "\n", 638 | "# Payload filter\n", 639 | "label_flows(friday_23022018_df, \"Web Attack - XSS - Attempted\", 1519405264559707000, 1519409428237472000, [\"18.218.115.60\"],\n", 640 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True)\n", 641 | "\n", 642 | "#-- Web Attack Brute Force & Attempted\n", 643 | "label_flows(friday_23022018_df, \"Web Attack - Brute Force\", 1519394670193975000, 1519398186406294000, [\"18.218.115.60\"],\n", 644 | " [\"172.31.69.28\"], additional_filters=\n", 645 | " [friday_23022018_df[\"Total Fwd Packet\"] > 20])\n", 646 | "\n", 647 | "label_flows(friday_23022018_df, \"Web Attack - Brute Force - Attempted\", 1519394670193975000, 1519398186406294000,\n", 648 | " [\"18.218.115.60\"], [\"172.31.69.28\"], attempted_category=5, additional_filters=\n", 649 | " [(friday_23022018_df[\"Total Fwd Packet\"] <= 20) & (friday_23022018_df[\"Total Length of Fwd Packet\"] > 0)])\n", 650 | "\n", 651 | "# Payload filter:\n", 652 | "label_flows(friday_23022018_df, \"Web Attack - Brute Force - Attempted\", 1519394670193975000, 1519398186406294000,\n", 653 | " [\"18.218.115.60\"], [\"172.31.69.28\"], attempted_category=0, payload_filter=True)\n", 654 | "\n", 655 | "label_rest_as_benign_and_write_csv(friday_23022018_df, DATASET_PATH + dir_name + \".csv\")" 656 | ], 657 | "metadata": { 658 | "collapsed": false 659 | } 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 10, 664 | "outputs": [ 665 | { 666 | "name": "stdout", 667 | "output_type": "stream", 668 | "text": [ 669 | "labels before pre-processing: NeedManualLabel 7173690\n", 670 | "Name: Label, dtype: int64\n", 671 | "labels after pre-processing: NeedManualLabel 6568726\n", 672 | "Name: Label, dtype: int64\n", 673 | "label count after labelling:\r\n", 674 | " BENIGN 6518882\n", 675 | "Infiltration - NMAP Portscan 49740\n", 676 | "Infiltration - Dropbox Download 46\n", 677 | "Infiltration - Communication Victim Attacker 43\n", 678 | "Infiltration - Dropbox Download - Attempted 15\n", 679 | "Name: Label, dtype: int64\n", 680 | "Attempted Category count after labelling:\r\n", 681 | " -1 6568711\n", 682 | " 4 15\n", 683 | "Name: Attempted Category, dtype: int64\n" 684 | ] 685 | } 686 | ], 687 | "source": [ 688 | "#----------------------+\n", 689 | "# WEDNESDAY 28-02-2018 |\n", 690 | "#----------------------+\n", 691 | "\n", 692 | "dir_name = \"Wednesday-28-02-2018\"\n", 693 | "wednesday_28022018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n", 694 | "\n", 695 | "#-- Infiltration - Dropbox Download\n", 696 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download\", 1519828404*(10**9), 1519829172*(10**9),\n", 697 | " [\"172.31.69.24\"],\n", 698 | " [\"162.125.3.1\", \"162.125.3.5\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"])\n", 699 | "\n", 700 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download\", 1519839771*(10**9), 1519839824*(10**9),\n", 701 | " [\"172.31.69.24\"],\n", 702 | " [\"162.125.3.1\", \"162.125.3.5\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"])\n", 703 | "\n", 704 | "# Payload filter\n", 705 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download - Attempted\", 1519828404*(10**9), 1519829172*(10**9),\n", 706 | " [\"172.31.69.24\"],\n", 707 | " [\"162.125.3.1\", \"162.125.3.5\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"],\n", 708 | " attempted_category=0, payload_filter=True)\n", 709 | "\n", 710 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download - Attempted\", 1519839771*(10**9), 1519839824*(10**9),\n", 711 | " [\"172.31.69.24\"],\n", 712 | " [\"162.125.3.1\", \"162.125.3.5\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"],\n", 713 | " attempted_category=0, payload_filter=True)\n", 714 | "\n", 715 | "# Attempted - Attack artefact\n", 716 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download - Attempted\", 1519828404*(10**9), 1519829172*(10**9),\n", 717 | " [\"172.31.69.24\"],\n", 718 | " [\"104.16.100.29\", \"104.16.99.29\", \"52.84.128.3\", \"52.85.101.236\", \"52.85.131.81\", \"52.85.95.206\"], attempted_category=4)\n", 719 | "\n", 720 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download - Attempted\", 1519839771*(10**9), 1519839824*(10**9),\n", 721 | " [\"172.31.69.24\"],\n", 722 | " [\"104.16.100.29\", \"104.16.99.29\", \"52.84.128.3\", \"52.85.101.236\", \"52.85.131.81\", \"52.85.95.206\"], attempted_category=4)\n", 723 | "\n", 724 | "#-- Infiltration - Communication Victim Attacker\n", 725 | "label_flows(wednesday_28022018_df, \"Infiltration - Communication Victim Attacker\", 1519829140*(10**9),\n", 726 | " 1519834135*(10**9), [\"172.31.69.24\"], [\"13.58.225.34\"])\n", 727 | "\n", 728 | "label_flows(wednesday_28022018_df, \"Infiltration - Communication Victim Attacker\", 1519839839*(10**9),\n", 729 | " 1519843200*(10**9), [\"172.31.69.24\"], [\"13.58.225.34\"])\n", 730 | "\n", 731 | "# Payload filter\n", 732 | "\n", 733 | "label_flows(wednesday_28022018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519829140*(10**9),\n", 734 | " 1519834135*(10**9), [\"172.31.69.24\"], [\"13.58.225.34\"], attempted_category=0, payload_filter=True)\n", 735 | "\n", 736 | "label_flows(wednesday_28022018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519839839*(10**9),\n", 737 | " 1519843200*(10**9), [\"172.31.69.24\"], [\"13.58.225.34\"], attempted_category=0, payload_filter=True)\n", 738 | "\n", 739 | "#-- Infiltration - NMAP Portscan\n", 740 | "label_flows(wednesday_28022018_df, \"Infiltration - NMAP Portscan\", 1519829182*(10**9), 1519843140746247000,\n", 741 | " [\"172.31.69.24\"],\n", 742 | " [\"172.31.69.1\", \"172.31.69.10\", \"172.31.69.11\", \"172.31.69.12\", \"172.31.69.13\", \"172.31.69.14\",\n", 743 | " \"172.31.69.16\", \"172.31.69.17\", \"172.31.69.19\", \"172.31.69.20\", \"172.31.69.23\", \"172.31.69.4\",\n", 744 | " \"172.31.69.5\", \"172.31.69.6\", \"172.31.69.8\", \"172.31.69.9\", \"172.31.69.7\", \"172.31.69.22\",\n", 745 | " \"172.31.69.15\", \"172.31.69.21\", \"172.31.69.18\",], additional_filters=\n", 746 | " [~(wednesday_28022018_df[\"Src Port\"] == 68)])\n", 747 | "\n", 748 | "label_rest_as_benign_and_write_csv(wednesday_28022018_df, DATASET_PATH + dir_name + \".csv\")" 749 | ], 750 | "metadata": { 751 | "collapsed": false 752 | } 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": 11, 757 | "outputs": [ 758 | { 759 | "name": "stdout", 760 | "output_type": "stream", 761 | "text": [ 762 | "labels before pre-processing: NeedManualLabel 7252549\n", 763 | "Name: Label, dtype: int64\n", 764 | "labels after pre-processing: NeedManualLabel 6551401\n", 765 | "Name: Label, dtype: int64\n", 766 | "label count after labelling:\r\n", 767 | " BENIGN 6511554\n", 768 | "Infiltration - NMAP Portscan 39634\n", 769 | "Infiltration - Communication Victim Attacker 161\n", 770 | "Infiltration - Dropbox Download 39\n", 771 | "Infiltration - Dropbox Download - Attempted 13\n", 772 | "Name: Label, dtype: int64\n", 773 | "Attempted Category count after labelling:\r\n", 774 | " -1 6551388\n", 775 | " 4 13\n", 776 | "Name: Attempted Category, dtype: int64\n" 777 | ] 778 | } 779 | ], 780 | "source": [ 781 | "#---------------------+\n", 782 | "# THURSDAY 01-03-2018 |\n", 783 | "#---------------------+\n", 784 | "\n", 785 | "dir_name = \"Thursday-01-03-2018\"\n", 786 | "thursday_01032018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n", 787 | "\n", 788 | "#-- Infiltration - Dropbox Download\n", 789 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download\", 1519912390*(10**9), 1519912760*(10**9),\n", 790 | " [\"172.31.69.13\"], [\"162.125.3.1\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"])\n", 791 | "\n", 792 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download\", 1519913032*(10**9), 1519918454*(10**9),\n", 793 | " [\"172.31.69.13\"], [\"162.125.3.1\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"])\n", 794 | "\n", 795 | "# Payload filter\n", 796 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download - Attempted\", 1519912390*(10**9), 1519912760*(10**9),\n", 797 | " [\"172.31.69.13\"],\n", 798 | " [\"162.125.3.1\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"], attempted_category=0, payload_filter=True)\n", 799 | "\n", 800 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download - Attempted\", 1519913032*(10**9), 1519918454*(10**9),\n", 801 | " [\"172.31.69.13\"],\n", 802 | " [\"162.125.3.1\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"], attempted_category=0, payload_filter=True)\n", 803 | "\n", 804 | "# Attempted - Attack artefact\n", 805 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download - Attempted\", 1519912390*(10**9), 1519912760*(10**9),\n", 806 | " [\"172.31.69.13\"], [\"104.16.100.29\", \"13.32.168.125\", \"52.85.112.72\"], attempted_category=4)\n", 807 | "\n", 808 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download - Attempted\", 1519913032*(10**9), 1519918454*(10**9),\n", 809 | " [\"172.31.69.13\"], [\"104.16.100.29\", \"13.32.168.125\", \"52.85.112.72\"], attempted_category=4)\n", 810 | "\n", 811 | "#-- Infiltration - Communication Victim Attacker\n", 812 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker\", 1519912674*(10**9),\n", 813 | " 1519912745*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"])\n", 814 | "\n", 815 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker\", 1519913075*(10**9),\n", 816 | " 1519928245*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"])\n", 817 | "\n", 818 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker\", 1519928295*(10**9),\n", 819 | " 1519933041*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"])\n", 820 | "\n", 821 | "# Payload filter\n", 822 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519912674*(10**9),\n", 823 | " 1519912745*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"], attempted_category=0, payload_filter=True)\n", 824 | "\n", 825 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519913075*(10**9),\n", 826 | " 1519928245*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"], attempted_category=0, payload_filter=True)\n", 827 | "\n", 828 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519928295*(10**9),\n", 829 | " 1519933041*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"], attempted_category=0, payload_filter=True)\n", 830 | "\n", 831 | "#-- Infiltration - NMAP Portscan (TODO: do we not need to filter out DHCP background traffic on port 68 in NMAP\n", 832 | "# of previous day as well?)\n", 833 | "label_flows(thursday_01032018_df, \"Infiltration - NMAP Portscan\", 1519913388354333000, 1519933092182726000,\n", 834 | " [\"172.31.69.13\"],\n", 835 | " [\"172.31.69.1\", \"172.31.69.11\", \"172.31.69.12\", \"172.31.69.16\", \"172.31.69.8\", \"172.31.69.9\",\n", 836 | " \"172.31.69.10\", \"172.31.69.14\", \"172.31.69.4\", \"172.31.69.5\", \"172.31.69.6\", \"172.31.69.17\",\n", 837 | " \"172.31.69.20\", \"172.31.69.23\", \"172.31.69.24\", \"172.31.69.19\", \"172.31.69.7\", \"172.31.69.15\",\n", 838 | " \"172.31.69.18\", \"172.31.69.22\", \"172.31.69.21\"], additional_filters=\n", 839 | " [thursday_01032018_df[\"Src Port\"] != 68])\n", 840 | "\n", 841 | "label_rest_as_benign_and_write_csv(thursday_01032018_df, DATASET_PATH + dir_name + \".csv\")" 842 | ], 843 | "metadata": { 844 | "collapsed": false 845 | } 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": 3, 850 | "outputs": [ 851 | { 852 | "name": "stdout", 853 | "output_type": "stream", 854 | "text": [ 855 | "labels before pre-processing: NeedManualLabel 6637636\n", 856 | "Name: Label, dtype: int64\n", 857 | "labels after pre-processing: NeedManualLabel 6311371\n", 858 | "Name: Label, dtype: int64\n", 859 | "label count after labelling:\r\n", 860 | " BENIGN 6168188\n", 861 | "Botnet Ares 142921\n", 862 | "Botnet Ares - Attempted 262\n", 863 | "Name: Label, dtype: int64\n", 864 | "Attempted Category count after labelling:\r\n", 865 | " -1 6311109\n", 866 | " 0 258\n", 867 | " 2 4\n", 868 | "Name: Attempted Category, dtype: int64\n" 869 | ] 870 | } 871 | ], 872 | "source": [ 873 | "#-------------------+\n", 874 | "# FRIDAY 02-03-2018 |\n", 875 | "#-------------------+\n", 876 | "\n", 877 | "dir_name = \"Friday-02-03-2018\"\n", 878 | "friday_02032018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n", 879 | "\n", 880 | "#-- Botnet Ares\n", 881 | "label_flows(friday_02032018_df, \"Botnet Ares\", 1520000008*(10**9), 1520020492*(10**9), additional_filters=\n", 882 | " [(friday_02032018_df[\"Src IP\"] == \"18.219.211.138\") | (friday_02032018_df[\"Dst IP\"] == \"18.219.211.138\")])\n", 883 | "\n", 884 | "#-- Botnet Ares - Attempted: Tear-down artefact. Botnet slave has ongoing TCP connection to master which is prematurely terminated by master sending RST packet\n", 885 | "label_flows(friday_02032018_df, \"Botnet Ares - Attempted\", 1520020424*(10**9), 1520020492*(10**9), attempted_category=2, additional_filters=\n", 886 | " [(friday_02032018_df[\"Dst IP\"] == \"18.219.211.138\") &\n", 887 | " (friday_02032018_df[\"Total Length of Fwd Packet\"] > 0) &\n", 888 | " (friday_02032018_df[\"Bwd RST Flags\"] > 0)])\n", 889 | "\n", 890 | "\n", 891 | "# Payload filter\n", 892 | "label_flows(friday_02032018_df, \"Botnet Ares - Attempted\", 1520000008*(10**9), 1520020492*(10**9), attempted_category=0, additional_filters=\n", 893 | " [((friday_02032018_df[\"Src IP\"] == \"18.219.211.138\") | (friday_02032018_df[\"Dst IP\"] == \"18.219.211.138\")) &\n", 894 | " (friday_02032018_df[\"Total Length of Fwd Packet\"] == 0) & (friday_02032018_df[\"Total Length of Bwd Packet\"] == 0)])\n", 895 | "\n", 896 | "label_rest_as_benign_and_write_csv(friday_02032018_df, DATASET_PATH + dir_name + \".csv\")\n", 897 | "\n" 898 | ], 899 | "metadata": { 900 | "collapsed": false 901 | } 902 | }, 903 | { 904 | "cell_type": "code", 905 | "execution_count": 12, 906 | "outputs": [], 907 | "source": [], 908 | "metadata": { 909 | "collapsed": false 910 | } 911 | } 912 | ], 913 | "metadata": { 914 | "kernelspec": { 915 | "display_name": "Python 3", 916 | "language": "python", 917 | "name": "python3" 918 | }, 919 | "language_info": { 920 | "codemirror_mode": { 921 | "name": "ipython", 922 | "version": 2 923 | }, 924 | "file_extension": ".py", 925 | "mimetype": "text/x-python", 926 | "name": "python", 927 | "nbconvert_exporter": "python", 928 | "pygments_lexer": "ipython2", 929 | "version": "2.7.6" 930 | } 931 | }, 932 | "nbformat": 4, 933 | "nbformat_minor": 0 934 | } 935 | -------------------------------------------------------------------------------- /Labelling/CICIDS2018_original_version_labelling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import glob\n", 14 | "import os\n", 15 | "from sys import platform\n", 16 | "\n", 17 | "# THIS LABELLING SCRIPT IS USED TO LABEL THE OLD VERSION OF CSE-CIC-IDS-2018. THIS VERSION SHOULD ONLY BE USED IF YOU\n", 18 | "# WISH TO RECREATE OUR RESULTS AS REPORTED IN OUR PAPER: https://intrusion-detection.distrinet-research.be/CNS2022/index.html\n", 19 | "\n", 20 | "# THIS SCRIPT ACCEPTS AS INPUT THE ORIGINAL CSVs AS RELEASED BY THE DATASET AUTHORS: https://www.unb.ca/cic/datasets/ids-2018.html\n", 21 | "\n", 22 | "\n", 23 | "pd.set_option('display.max_rows', 100)\n", 24 | "\n", 25 | "\n", 26 | "DATASET_PATH = \"\"\n", 27 | "\n", 28 | "# unset to remove line index (to refer to line numbers when writing final csv)\n", 29 | "print_index = True" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "def format_csv_for_labelling(df):\n", 39 | " # strip leading whitespaces in column names\n", 40 | " df.columns = df.columns.str.lstrip(\" \")\n", 41 | "\n", 42 | " print(\"labels before pre-processing:\", df[\"Label\"].value_counts())\n", 43 | "\n", 44 | " # Keep track of header rows that occur in the middle of the flow traces. Drop them\n", 45 | " # temporarily for ease of labeling and dataframe manipulation and then merge them\n", 46 | " # back in at the very end. The intention is to preserve the original published files\n", 47 | " # exactly except with the corrected labelling. This makes lining up mismatches between\n", 48 | " # the original and corrected version easier, using line number as the reference.\n", 49 | " # This is for 2018 version only, as the 2017 version does not contain header rows in\n", 50 | " # the middle of flow traces.\n", 51 | " header_rows = df[(df[\"Timestamp\"] == \"Timestamp\") & (df.index > 0)]\n", 52 | " df = df.drop(header_rows.index)\n", 53 | "\n", 54 | " # Since CICIDS 2018 authors used 12-hour format but removed AM/PM, we need to reconstruct it\n", 55 | " # We do this based on the knowledge they collected traffic from roughly 9:00 AM to 5:00 PM.\n", 56 | " df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S')\n", 57 | " #for i, item in enumerate(df['Timestamp']):\n", 58 | " # try:\n", 59 | " # new_item = pd.to_datetime(item, format='%d/%m/%Y %H:%M:%S')\n", 60 | " # except ValueError:\n", 61 | " # print('ERROR at index {}: {}'.format(i, item))\n", 62 | "\n", 63 | " df['Timestamp'] = df['Timestamp'].apply(lambda x: x + pd.DateOffset(hours=12) if x.hour < 7 else x)\n", 64 | "\n", 65 | " # Convert to UTC from New Brunswick winter timezone (UTC-4)\n", 66 | " df['Timestamp'] = df['Timestamp'] + pd.DateOffset(hours=4)\n", 67 | "\n", 68 | " for column in df.columns:\n", 69 | " if column not in ['Flow ID' , 'Timestamp', 'Src IP', 'Dst IP', 'Label']:\n", 70 | " df[column] = pd.to_numeric(df[column])\n", 71 | "\n", 72 | " # Add attempted category column and initialise to -1\n", 73 | " df[\"Attempted Category\"] = -1\n", 74 | "\n", 75 | " # CICIDS 2018 author-released version comes prelabelled. This makes sure previous labels don't interfere\n", 76 | " df[\"Label\"] = \"NeedManualLabel\"\n", 77 | "\n", 78 | " print(\"labels after pre-processing:\", df[\"Label\"].value_counts())\n", 79 | "\n", 80 | " return df, header_rows\n", 81 | "\n", 82 | "def read_csvs_from_path_and_reformat(path):\n", 83 | " df = pd.read_csv(path + \"/merged.csv\")\n", 84 | "\n", 85 | " df, header_rows = format_csv_for_labelling(df)\n", 86 | "\n", 87 | " return df, header_rows\n", 88 | "\n", 89 | "# Important note: you should not use the also_flip_flow_direction if you set the additional_filters with a \"Fwd\" or \"Bwd\"\n", 90 | "# column filtering\n", 91 | "def label_flows(df, label, attack_start_time_nanoseconds, attack_end_time_nanoseconds, src_ip_list=None,\n", 92 | " dst_ip_list= None, src_port_list=None, dst_port_list=None, attempted_category = -1, additional_filters=[],\n", 93 | " also_flip_flow_direction=False, payload_filter=False):\n", 94 | " # Note that labelling happens inplace on the 'df' parameter, and so this function doesn't return anything\n", 95 | "\n", 96 | " # Create initial mask with all values set to True. Squeeze is necessary to remove second axis (with value 1)\n", 97 | " # The reason is that a df of shape (X,) gets converted to (1,X) if you '&' it with a df of shape (X,1)\n", 98 | " custom_mask = pd.DataFrame(True, index=df.index, columns=[df.columns[0]]).squeeze()\n", 99 | "\n", 100 | " # Need to round the start time down to the nearest second to prevent edge-case issues with flows being mislabelled as benign\n", 101 | " attack_start_datetime = pd.to_datetime(attack_start_time_nanoseconds, unit='ns').floor(freq='S')\n", 102 | " attack_end_datetime = pd.to_datetime(attack_end_time_nanoseconds, unit='ns')\n", 103 | "\n", 104 | " custom_mask &= (df[\"Timestamp\"] >= attack_start_datetime)\n", 105 | " custom_mask &= (df[\"Timestamp\"] <= attack_end_datetime)\n", 106 | "\n", 107 | " if src_ip_list is not None:\n", 108 | " custom_mask &= (df[\"Src IP\"].isin(src_ip_list))\n", 109 | " if dst_ip_list is not None:\n", 110 | " custom_mask &= (df[\"Dst IP\"].isin(dst_ip_list))\n", 111 | "\n", 112 | " if src_port_list is not None:\n", 113 | " custom_mask &= (df[\"Src Port\"].isin(src_port_list))\n", 114 | " if dst_port_list is not None:\n", 115 | " custom_mask &= (df[\"Dst Port\"].isin(dst_port_list))\n", 116 | "\n", 117 | " # IMPORTANT NOTE: If you decide to add TotLen Fwd Pkt == 6 for catching RST packets, you still have to manually alter some additional_filters for flipped flows where\n", 118 | " # you couldn't use payload_filter boolean function input value\n", 119 | " if payload_filter:\n", 120 | " custom_mask &= (df[\"TotLen Fwd Pkts\"] == 0)\n", 121 | "\n", 122 | " for filter in additional_filters:\n", 123 | " custom_mask &= filter\n", 124 | "\n", 125 | " df[\"Label\"].mask(custom_mask, label, inplace=True)\n", 126 | " df[\"Attempted Category\"].mask(custom_mask, attempted_category, inplace=True)\n", 127 | "\n", 128 | " if also_flip_flow_direction:\n", 129 | "\n", 130 | " custom_mask = pd.DataFrame(True, index=df.index, columns=[df.columns[0]]).squeeze()\n", 131 | "\n", 132 | " custom_mask &= (df[\"Timestamp\"] >= attack_start_datetime)\n", 133 | " custom_mask &= (df[\"Timestamp\"] <= attack_end_datetime)\n", 134 | "\n", 135 | " if src_ip_list is not None:\n", 136 | " custom_mask &= (df[\"Dst IP\"].isin(src_ip_list))\n", 137 | " if dst_ip_list is not None:\n", 138 | " custom_mask &= (df[\"Src IP\"].isin(dst_ip_list))\n", 139 | "\n", 140 | " if src_port_list is not None:\n", 141 | " custom_mask &= (df[\"Dst Port\"].isin(src_port_list))\n", 142 | " if dst_port_list is not None:\n", 143 | " custom_mask &= (df[\"Src Port\"].isin(dst_port_list))\n", 144 | "\n", 145 | " if payload_filter:\n", 146 | " custom_mask &= (df[\"TotLen Bwd Pkts\"] == 0)\n", 147 | "\n", 148 | " for filter in additional_filters:\n", 149 | " custom_mask &= filter\n", 150 | "\n", 151 | " df[\"Label\"].mask(custom_mask, label, inplace=True)\n", 152 | " df[\"Attempted Category\"].mask(custom_mask, attempted_category, inplace=True)\n", 153 | "\n", 154 | "\n", 155 | "\n", 156 | "def label_rest_as_benign_and_write_csv(df, header_rows, file_to_write):\n", 157 | " df[\"Label\"].mask(df[\"Label\"] == \"NeedManualLabel\", \"BENIGN\", inplace=True)\n", 158 | "\n", 159 | " # Relabel artefact flows with [Flow Id] = '8.0.6.4-8.6.0.1-0-0-0' to label = 0\n", 160 | " df[\"Label\"].mask(df[\"Flow ID\"] == '8.0.6.4-8.6.0.1-0-0-0', \"BENIGN\", inplace=True)\n", 161 | "\n", 162 | " print(\"label count after labelling:\\r\\n\", df[\"Label\"].value_counts())\n", 163 | " print(\"Attempted Category count after labelling:\\r\\n\", df[\"Attempted Category\"].value_counts())\n", 164 | "\n", 165 | " full_df = pd.concat([df, header_rows], sort=False).sort_index()\n", 166 | "\n", 167 | " if print_index:\n", 168 | " full_df.reset_index(inplace=True, drop=True)\n", 169 | " full_df.index += 1\n", 170 | " full_df.index.name = 'id'\n", 171 | " full_df.to_csv(file_to_write)\n", 172 | " else:\n", 173 | " full_df.to_csv(file_to_write, index=False)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 3, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "labels before pre-processing: Benign 6702133\n", 186 | "FTP-BruteForce 193360\n", 187 | "SSH-Bruteforce 187589\n", 188 | "Name: Label, dtype: int64\n", 189 | "labels after pre-processing: NeedManualLabel 7083082\n", 190 | "Name: Label, dtype: int64\n", 191 | "label count after labelling:\r\n", 192 | " BENIGN 6701304\n", 193 | "FTP-BruteForce - Attempted 193360\n", 194 | "SSH-BruteForce - Attempted 94211\n", 195 | "SSH-BruteForce 94207\n", 196 | "Name: Label, dtype: int64\n", 197 | "Attempted Category count after labelling:\r\n", 198 | " -1 6795511\n", 199 | " 1 193360\n", 200 | " 0 94211\n", 201 | "Name: Attempted Category, dtype: int64\n" 202 | ] 203 | } 204 | ], 205 | "source": [ 206 | "#----------------------+\n", 207 | "# WEDNESDAY 14-02-2018 |\n", 208 | "#----------------------+\n", 209 | "\n", 210 | "dir_name = \"Wednesday-14-02-2018\"\n", 211 | "wednesday_14022018_df, wednesday_14022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n", 212 | "\n", 213 | "#-- FTP-BruteForce\n", 214 | "label_flows(wednesday_14022018_df, \"FTP-BruteForce - Attempted\", 1518618806*(10**9),\n", 215 | " 1518624631*(10**9), [\"18.221.219.4\"], [\"172.31.69.25\"], attempted_category=1, also_flip_flow_direction=True)\n", 216 | "\n", 217 | "# FTP-BruteForce - Attempted (tool accidentally got launched in FTP bruteforce mode instead of SSH bruteforce mode)\n", 218 | "# Note that, in order to avoid float imprecisions at the micro- and nanosecond level, the UNIX timestamps such as\n", 219 | "# 1518631281.199541000, which is in seconds, needs to be converted to nanoseconds, so that the number is stored\n", 220 | "# in int64 instead of float.\n", 221 | "label_flows(wednesday_14022018_df, \"FTP-BruteForce - Attempted\", 1518631281,\n", 222 | " 1518631281, [\"13.58.98.64\"], [\"172.31.69.25\"], dst_port_list=[21], attempted_category=4)\n", 223 | "\n", 224 | "#-- SSH-BruteForce\n", 225 | "label_flows(wednesday_14022018_df, \"SSH-BruteForce\", 1518631310*(10**9),\n", 226 | " 1518636750*(10**9), [\"13.58.98.64\"], [\"172.31.69.25\"], dst_port_list=[22], also_flip_flow_direction=True)\n", 227 | "# Payload filter\n", 228 | "label_flows(wednesday_14022018_df, \"SSH-BruteForce - Attempted\", 1518631310*(10**9),\n", 229 | " 1518636750*(10**9), [\"13.58.98.64\"], [\"172.31.69.25\"], dst_port_list=[22],\n", 230 | " attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n", 231 | "\n", 232 | "label_rest_as_benign_and_write_csv(wednesday_14022018_df, wednesday_14022018_df_header_rows, DATASET_PATH + dir_name + \".csv\")\n", 233 | "\n", 234 | "wednesday_14022018_df = None" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 4, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "name": "stdout", 244 | "output_type": "stream", 245 | "text": [ 246 | "labels before pre-processing: Benign 6565262\n", 247 | "DoS attacks-GoldenEye 41508\n", 248 | "DoS attacks-Slowloris 10990\n", 249 | "Name: Label, dtype: int64\n", 250 | "labels after pre-processing: NeedManualLabel 6617760\n", 251 | "Name: Label, dtype: int64\n", 252 | "label count after labelling:\r\n", 253 | " BENIGN 6564757\n", 254 | "DoS GoldenEye 27719\n", 255 | "DoS GoldenEye - Attempted 13789\n", 256 | "DoS Slowloris 8585\n", 257 | "DoS Slowloris - Attempted 2910\n", 258 | "Name: Label, dtype: int64\n", 259 | "Attempted Category count after labelling:\r\n", 260 | " -1 6601061\n", 261 | " 0 16638\n", 262 | " 6 53\n", 263 | " 4 8\n", 264 | "Name: Attempted Category, dtype: int64\n" 265 | ] 266 | } 267 | ], 268 | "source": [ 269 | "#---------------------+\n", 270 | "# THURSDAY 15-02-2018 |\n", 271 | "#---------------------+\n", 272 | "\n", 273 | "dir_name=\"Thursday-15-02-2018\"\n", 274 | "thursday_15022018_df, thursday_15022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n", 275 | "\n", 276 | "#-- DoS GoldenEye\n", 277 | "label_flows(thursday_15022018_df, \"DoS GoldenEye\", 1518701262*(10**9), 1518703905*(10**9), [\"18.219.211.138\"],\n", 278 | " [\"172.31.69.25\"], also_flip_flow_direction=True)\n", 279 | "\n", 280 | "# Payload filter\n", 281 | "label_flows(thursday_15022018_df, \"DoS GoldenEye - Attempted\", 1518701262*(10**9), 1518703905*(10**9), [\"18.219.211.138\"],\n", 282 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n", 283 | "\n", 284 | "# Target system unresponsive\n", 285 | "label_flows(thursday_15022018_df, \"DoS GoldenEye - Attempted\", 1518701262*(10**9), 1518703905*(10**9), [\"18.219.211.138\"],\n", 286 | " [\"172.31.69.25\"], attempted_category=6, additional_filters=[\n", 287 | " (thursday_15022018_df[\"TotLen Bwd Pkts\"] == 0) & (thursday_15022018_df[\"TotLen Fwd Pkts\"] > 0) &\n", 288 | " (thursday_15022018_df[\"Tot Fwd Pkts\"] > 2) & (thursday_15022018_df[\"Flow Duration\"] > 100000000)\n", 289 | " ])\n", 290 | "\n", 291 | "#-- DoS Slowloris\n", 292 | "label_flows(thursday_15022018_df, \"DoS Slowloris\", 1518706812*(10**9), 1518709321*(10**9), [\"18.217.165.70\"],\n", 293 | " [\"172.31.69.25\"], also_flip_flow_direction=True)\n", 294 | "\n", 295 | "# Payload filter\n", 296 | "label_flows(thursday_15022018_df, \"DoS Slowloris - Attempted\", 1518706812*(10**9), 1518709321*(10**9), [\"18.217.165.70\"],\n", 297 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n", 298 | "\n", 299 | "# Attack artefact (exclusively for original version\n", 300 | "label_flows(thursday_15022018_df, \"DoS Slowloris - Attempted\", 1518706812*(10**9), 1518709321*(10**9), [\"172.31.69.25\"], [\"18.217.165.70\"],\n", 301 | " attempted_category=4, additional_filters=[\n", 302 | " (thursday_15022018_df[\"Tot Fwd Pkts\"] == 1) & (thursday_15022018_df[\"Tot Bwd Pkts\"] == 2) & (thursday_15022018_df[\"TotLen Fwd Pkts\"] == 0) &\n", 303 | " (thursday_15022018_df[\"TotLen Bwd Pkts\"] == 238)\n", 304 | " ])\n", 305 | "\n", 306 | "label_rest_as_benign_and_write_csv(thursday_15022018_df, thursday_15022018_df_header_rows, DATASET_PATH + dir_name + \".csv\")\n", 307 | "\n", 308 | "thursday_15022018_df = None" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 5, 314 | "metadata": {}, 315 | "outputs": [ 316 | { 317 | "name": "stderr", 318 | "output_type": "stream", 319 | "text": [ 320 | "/tmp/ipykernel_65171/2882869393.py:6: DtypeWarning: Columns (2,4,5,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82) have mixed types.Specify dtype option on import or set low_memory=False.\n", 321 | " friday_16022018_df, friday_16022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n" 322 | ] 323 | }, 324 | { 325 | "name": "stdout", 326 | "output_type": "stream", 327 | "text": [ 328 | "labels before pre-processing: Benign 7413958\n", 329 | "DoS attacks-Hulk 923824\n", 330 | "DoS attacks-SlowHTTPTest 182868\n", 331 | "Label 1\n", 332 | "Name: Label, dtype: int64\n", 333 | "labels after pre-processing: NeedManualLabel 8520650\n", 334 | "Name: Label, dtype: int64\n", 335 | "label count after labelling:\r\n", 336 | " BENIGN 6521192\n", 337 | "DoS Hulk 935504\n", 338 | "DoS Hulk - Attempted 881086\n", 339 | "FTP-BruteForce - Attempted 182868\n", 340 | "Name: Label, dtype: int64\n", 341 | "Attempted Category count after labelling:\r\n", 342 | " -1 7456696\n", 343 | " 0 881086\n", 344 | " 1 182868\n", 345 | "Name: Attempted Category, dtype: int64\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "#-------------------+\n", 351 | "# FRIDAY 16-02-2018 |\n", 352 | "#-------------------+\n", 353 | "\n", 354 | "dir_name=\"Friday-16-02-2018\"\n", 355 | "friday_16022018_df, friday_16022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n", 356 | "\n", 357 | "#-- FTP-Patator - Attempted\n", 358 | "label_flows(friday_16022018_df, \"FTP-BruteForce - Attempted\", 1518790334*(10**9), 1518793513*(10**9), [\"13.59.126.31\"],\n", 359 | " [\"172.31.69.25\"], attempted_category=1, also_flip_flow_direction=True)\n", 360 | "\n", 361 | "#-- DoS Hulk\n", 362 | "label_flows(friday_16022018_df, \"DoS Hulk\", 1518803127*(10**9), 1518803903*(10**9), [\"18.219.193.20\"], [\"172.31.69.25\"],\n", 363 | " also_flip_flow_direction=True)\n", 364 | "\n", 365 | "label_flows(friday_16022018_df, \"DoS Hulk - Attempted\", 1518803127*(10**9), 1518803903*(10**9), [\"18.219.193.20\"],\n", 366 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n", 367 | "\n", 368 | "#-- Dos Slowhttptest: No actual DoS Slowloris flows are present on this day in this dataset!\n", 369 | "# Instead we only find failed FTP-Patator traffic, which is exactly what is covered earlier in this cell\n", 370 | "\n", 371 | "label_rest_as_benign_and_write_csv(friday_16022018_df, friday_16022018_df_header_rows, DATASET_PATH + dir_name + \".csv\")\n", 372 | "\n", 373 | "friday_16022018_df = None" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 1, 379 | "metadata": {}, 380 | "outputs": [ 381 | { 382 | "ename": "NameError", 383 | "evalue": "name 'read_csvs_from_path_and_reformat' is not defined", 384 | "output_type": "error", 385 | "traceback": [ 386 | "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", 387 | "\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)", 388 | "Input \u001B[0;32mIn [1]\u001B[0m, in \u001B[0;36m\u001B[0;34m()\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[38;5;66;03m#--------------------+\u001B[39;00m\n\u001B[1;32m 2\u001B[0m \u001B[38;5;66;03m# TUESDAY 20-02-2018 |\u001B[39;00m\n\u001B[1;32m 3\u001B[0m \u001B[38;5;66;03m#--------------------+\u001B[39;00m\n\u001B[1;32m 5\u001B[0m dir_name\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mTuesday-20-02-2018\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m----> 6\u001B[0m tuesday_20022018_df, tuesday_20022018_df_header_rows \u001B[38;5;241m=\u001B[39m \u001B[43mread_csvs_from_path_and_reformat\u001B[49m(DATASET_PATH \u001B[38;5;241m+\u001B[39m dir_name)\n\u001B[1;32m 8\u001B[0m \u001B[38;5;66;03m#-- DDoS LOIC HTTP\u001B[39;00m\n\u001B[1;32m 9\u001B[0m label_flows(tuesday_20022018_df, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mDDoS-LOIC-HTTP\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;241m1519136034\u001B[39m\u001B[38;5;241m*\u001B[39m(\u001B[38;5;241m10\u001B[39m\u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39m\u001B[38;5;241m9\u001B[39m), \u001B[38;5;241m1519139809\u001B[39m\u001B[38;5;241m*\u001B[39m(\u001B[38;5;241m10\u001B[39m\u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39m\u001B[38;5;241m9\u001B[39m),\n\u001B[1;32m 10\u001B[0m [\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m18.218.115.60\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m18.219.9.1\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m18.219.32.43\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m18.218.55.126\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m52.14.136.135\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[1;32m 11\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m18.219.5.43\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m18.216.200.189\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m18.218.229.235\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m18.218.11.51\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m18.216.24.42\u001B[39m\u001B[38;5;124m\"\u001B[39m],\n\u001B[1;32m 12\u001B[0m [\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m172.31.69.25\u001B[39m\u001B[38;5;124m\"\u001B[39m], additional_filters\u001B[38;5;241m=\u001B[39m[\n\u001B[1;32m 13\u001B[0m tuesday_20022018_df[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mProtocol\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m==\u001B[39m \u001B[38;5;241m6\u001B[39m\n\u001B[1;32m 14\u001B[0m ])\n", 389 | "\u001B[0;31mNameError\u001B[0m: name 'read_csvs_from_path_and_reformat' is not defined" 390 | ] 391 | } 392 | ], 393 | "source": [ 394 | "#--------------------+\n", 395 | "# TUESDAY 20-02-2018 |\n", 396 | "#--------------------+\n", 397 | "\n", 398 | "dir_name=\"Tuesday-20-02-2018\"\n", 399 | "tuesday_20022018_df, tuesday_20022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n", 400 | "\n", 401 | "#-- DDoS LOIC HTTP\n", 402 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-HTTP\", 1519136034*(10**9), 1519139809*(10**9),\n", 403 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 404 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 405 | " [\"172.31.69.25\"], additional_filters=[\n", 406 | " tuesday_20022018_df[\"Protocol\"] == 6\n", 407 | " ])\n", 408 | "\n", 409 | "# Payload filter\n", 410 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-HTTP - Attempted\", 1519136034*(10**9), 1519139809*(10**9),\n", 411 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 412 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 413 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True,\n", 414 | " additional_filters=[tuesday_20022018_df[\"Protocol\"] == 6])\n", 415 | "\n", 416 | "#-- DDoS LOIC UDP\n", 417 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-UDP\", 1519146857*(10**9), 1519147756*(10**9),\n", 418 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 419 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 420 | " [\"172.31.69.25\"], additional_filters=[\n", 421 | " tuesday_20022018_df[\"Protocol\"] == 17])\n", 422 | "\n", 423 | "# Payload filter\n", 424 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-UDP - Attempted\", 1519146857*(10**9), 1519147756*(10**9),\n", 425 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 426 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 427 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True,\n", 428 | " additional_filters=[tuesday_20022018_df[\"Protocol\"] == 17])\n", 429 | "\n", 430 | "# Attempted - Target unresponsive (the ICMP destination unreachable answers to the attack - using protocol == 17 (UDP) because original CICFlowMeter does not recognise ICMP)\n", 431 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-UDP - Attempted\", 1519146857*(10**9), 1519147756*(10**9),\n", 432 | " [\"172.31.69.25\"], [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 433 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 434 | " attempted_category=6, additional_filters=[tuesday_20022018_df[\"Protocol\"] == 17])\n", 435 | "\n", 436 | "\n", 437 | "label_rest_as_benign_and_write_csv(tuesday_20022018_df, tuesday_20022018_df_header_rows, DATASET_PATH + dir_name + \".csv\")\n", 438 | "\n", 439 | "tuesday_20022018_df = None" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 7, 445 | "metadata": {}, 446 | "outputs": [ 447 | { 448 | "name": "stdout", 449 | "output_type": "stream", 450 | "text": [ 451 | "labels before pre-processing: Benign 8355458\n", 452 | "DDOS attack-HOIC 1246034\n", 453 | "DDOS attack-LOIC-UDP 1730\n", 454 | "Name: Label, dtype: int64\n", 455 | "labels after pre-processing: NeedManualLabel 9603222\n", 456 | "Name: Label, dtype: int64\n", 457 | "label count after labelling:\r\n", 458 | " BENIGN 7435307\n", 459 | "DDoS-HOIC - Attempted 1082294\n", 460 | "DDoS-HOIC 1082293\n", 461 | "DDoS-LOIC-UDP 1730\n", 462 | "DDoS-LOIC-UDP - Attempted 1598\n", 463 | "Name: Label, dtype: int64\n", 464 | "Attempted Category count after labelling:\r\n", 465 | " -1 8519330\n", 466 | " 0 1082294\n", 467 | " 6 1598\n", 468 | "Name: Attempted Category, dtype: int64\n" 469 | ] 470 | } 471 | ], 472 | "source": [ 473 | "#----------------------+\n", 474 | "# WEDNESDAY 21-02-2018 |\n", 475 | "#----------------------+\n", 476 | "\n", 477 | "dir_name = \"Wednesday-21-02-2018\"\n", 478 | "wednesday_21022018_df, wednesday_21022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n", 479 | "\n", 480 | "#-- DDoS LOIC UDP\n", 481 | "label_flows(wednesday_21022018_df, \"DDoS-LOIC-UDP\", 1519222131*(10**9), 1519224219*(10**9),\n", 482 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 483 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 484 | " [\"172.31.69.28\"], additional_filters=[\n", 485 | " wednesday_21022018_df[\"Protocol\"] == 17\n", 486 | " ])\n", 487 | "\n", 488 | "# Payload filter\n", 489 | "label_flows(wednesday_21022018_df, \"DDoS-LOIC-UDP - Attempted\", 1519222131*(10**9), 1519224219*(10**9),\n", 490 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 491 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 492 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True,\n", 493 | " additional_filters=[wednesday_21022018_df[\"Protocol\"] == 17])\n", 494 | "\n", 495 | "# Attempted - Target unresponsive (the ICMP destination unreachable answers to the attack - using protocol == 17 (UDP) because original CICFlowMeter does not recognise ICMP)\n", 496 | "label_flows(wednesday_21022018_df, \"DDoS-LOIC-UDP - Attempted\", 1519222131*(10**9), 1519224219*(10**9),\n", 497 | " [\"172.31.69.28\"], [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 498 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 499 | " attempted_category=6,\n", 500 | " additional_filters=[wednesday_21022018_df[\"Protocol\"] == 17])\n", 501 | "\n", 502 | "#-- DDoS HOIC\n", 503 | "label_flows(wednesday_21022018_df, \"DDoS-HOIC\", 1519236668*(10**9), 1519239954*(10**9),\n", 504 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 505 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 506 | " [\"172.31.69.28\"], also_flip_flow_direction=True, additional_filters=[\n", 507 | " wednesday_21022018_df[\"Protocol\"] == 6\n", 508 | " ])\n", 509 | "\n", 510 | "# Payload filter\n", 511 | "label_flows(wednesday_21022018_df, \"DDoS-HOIC - Attempted\", 1519236668*(10**9), 1519239954*(10**9),\n", 512 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n", 513 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n", 514 | " [\"172.31.69.28\"], payload_filter=True, also_flip_flow_direction=True,\n", 515 | " attempted_category=0, additional_filters=[wednesday_21022018_df[\"Protocol\"] == 6])\n", 516 | "\n", 517 | "\n", 518 | "label_rest_as_benign_and_write_csv(wednesday_21022018_df, wednesday_21022018_df_header_rows, DATASET_PATH + dir_name + \".csv\")\n", 519 | "\n", 520 | "wednesday_21022018_df = None" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": 8, 526 | "metadata": {}, 527 | "outputs": [ 528 | { 529 | "name": "stdout", 530 | "output_type": "stream", 531 | "text": [ 532 | "labels before pre-processing: Benign 8179253\n", 533 | "Brute Force -Web 249\n", 534 | "Brute Force -XSS 79\n", 535 | "SQL Injection 34\n", 536 | "Name: Label, dtype: int64\n", 537 | "labels after pre-processing: NeedManualLabel 8179615\n", 538 | "Name: Label, dtype: int64\n", 539 | "label count after labelling:\r\n", 540 | " BENIGN 8179201\n", 541 | "Web Attack - Brute Force - Attempted 221\n", 542 | "Web Attack - Brute Force 69\n", 543 | "Web Attack - XSS - Attempted 44\n", 544 | "Web Attack - XSS 40\n", 545 | "Web Attack - SQL - Attempted 24\n", 546 | "Web Attack - SQL 16\n", 547 | "Name: Label, dtype: int64\n", 548 | "Attempted Category count after labelling:\r\n", 549 | " -1 8179326\n", 550 | " 0 197\n", 551 | " 5 66\n", 552 | " 2 24\n", 553 | " 3 2\n", 554 | "Name: Attempted Category, dtype: int64\n" 555 | ] 556 | } 557 | ], 558 | "source": [ 559 | "#---------------------+\n", 560 | "# THURSDAY 22-02-2018 |\n", 561 | "#---------------------+\n", 562 | "\n", 563 | "dir_name = \"Thursday-22-02-2018\"\n", 564 | "thursday_22022018_df, thursday_22022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n", 565 | "\n", 566 | "#-- Web Attack SQL\n", 567 | "label_flows(thursday_22022018_df, \"Web Attack - SQL\", 1519330590418906000, 1519331276022793000, [\"18.218.115.60\"],\n", 568 | " [\"172.31.69.28\"], also_flip_flow_direction=True, additional_filters=\n", 569 | " [thursday_22022018_df[\"TotLen Fwd Pkts\"] > 0,\n", 570 | " thursday_22022018_df[\"TotLen Bwd Pkts\"] > 0])\n", 571 | "\n", 572 | "# Attack startup artefact\n", 573 | "label_flows(thursday_22022018_df, \"Web Attack - SQL - Attempted\", 1519330470*(10**9), 1519330498*(10**9), [\"18.218.115.60\"],\n", 574 | " [\"172.31.69.28\"], attempted_category=2, also_flip_flow_direction=True)\n", 575 | "\n", 576 | "# Payload filter\n", 577 | "label_flows(thursday_22022018_df, \"Web Attack - SQL - Attempted\", 1519330590418906000, 1519331276022793000, [\"18.218.115.60\"],\n", 578 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n", 579 | "\n", 580 | "#-- Web Attack XSS\n", 581 | "# Port 63782 is attack setup (navigating to website)\n", 582 | "label_flows(thursday_22022018_df, \"Web Attack - XSS\", 1519321899783923000, 1519324181827037000, [\"18.218.115.60\"],\n", 583 | " [\"172.31.69.28\"], additional_filters=\n", 584 | " [~(thursday_22022018_df[\"Src Port\"].isin([63782, 64144]))])\n", 585 | "\n", 586 | "#Flip\n", 587 | "label_flows(thursday_22022018_df, \"Web Attack - XSS\", 1519321899783923000, 1519324181827037000,\n", 588 | " [\"172.31.69.28\"], [\"18.218.115.60\"], additional_filters=\n", 589 | " [~(thursday_22022018_df[\"Dst Port\"].isin([63782, 64144]))])\n", 590 | "\n", 591 | "# Attempted attack setup\n", 592 | "label_flows(thursday_22022018_df, \"Web Attack - XSS - Attempted\", 1519321899783923000, 1519324181827037000, [\"18.218.115.60\"],\n", 593 | " [\"172.31.69.28\"], attempted_category=2, additional_filters=\n", 594 | " [thursday_22022018_df[\"Src Port\"] == 63782])\n", 595 | "\n", 596 | "label_flows(thursday_22022018_df, \"Web Attack - XSS - Attempted\", 1519321899783923000, 1519324181827037000, [\"18.218.115.60\"],\n", 597 | " [\"172.31.69.28\"], attempted_category=3, additional_filters=\n", 598 | " [thursday_22022018_df[\"Src Port\"] == 64144])\n", 599 | "\n", 600 | "# Payload filter\n", 601 | "label_flows(thursday_22022018_df, \"Web Attack - XSS - Attempted\", 1519321899783923000, 1519324181827037000, [\"18.218.115.60\"],\n", 602 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True, additional_filters=\n", 603 | " [~(thursday_22022018_df[\"Src Port\"].isin([63782, 64144]))])\n", 604 | "\n", 605 | "#Flip\n", 606 | "label_flows(thursday_22022018_df, \"Web Attack - XSS - Attempted\", 1519321899783923000, 1519324181827037000,\n", 607 | " [\"172.31.69.28\"], [\"18.218.115.60\"], attempted_category=0, additional_filters=\n", 608 | " [~(thursday_22022018_df[\"Dst Port\"].isin([63782, 64144])) &\n", 609 | " (thursday_22022018_df[\"TotLen Bwd Pkts\"] == 0)])\n", 610 | "\n", 611 | "#-- Web Attack Brute Force & Attempted\n", 612 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force\", 1519309071336902000, 1519313039858533000, [\"18.218.115.60\"],\n", 613 | " [\"172.31.69.28\"], additional_filters=\n", 614 | " [thursday_22022018_df[\"Tot Fwd Pkts\"] > 20])\n", 615 | "\n", 616 | "#Flip\n", 617 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force\", 1519309071336902000, 1519313039858533000,\n", 618 | " [\"172.31.69.28\"], [\"18.218.115.60\"], additional_filters=\n", 619 | " [thursday_22022018_df[\"Tot Bwd Pkts\"] > 20])\n", 620 | "\n", 621 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force - Attempted\", 1519309071336902000, 1519313039858533000,\n", 622 | " [\"18.218.115.60\"], [\"172.31.69.28\"], attempted_category=5, additional_filters=\n", 623 | " [(thursday_22022018_df[\"Tot Fwd Pkts\"] <= 20) & (thursday_22022018_df[\"TotLen Fwd Pkts\"] > 0)])\n", 624 | "\n", 625 | "#Flip\n", 626 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force - Attempted\", 1519309071336902000, 1519313039858533000,\n", 627 | " [\"172.31.69.28\"], [\"18.218.115.60\"], attempted_category=5, additional_filters=\n", 628 | " [(thursday_22022018_df[\"Tot Bwd Pkts\"] <= 20) & (thursday_22022018_df[\"TotLen Bwd Pkts\"] > 0)])\n", 629 | "\n", 630 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force - Attempted\", 1519308824965705000, 1519308947920399000, [\"18.218.115.60\"],\n", 631 | " [\"172.31.69.28\"], attempted_category=2, also_flip_flow_direction=True)\n", 632 | "\n", 633 | "# Payload filter\n", 634 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force - Attempted\", 1519309071336902000, 1519313039858533000,\n", 635 | " [\"18.218.115.60\"], [\"172.31.69.28\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n", 636 | "\n", 637 | "label_rest_as_benign_and_write_csv(thursday_22022018_df, thursday_22022018_df_header_rows, DATASET_PATH + dir_name + \".csv\")\n", 638 | "\n", 639 | "thursday_22022018_df = None\n" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 9, 645 | "metadata": {}, 646 | "outputs": [ 647 | { 648 | "name": "stdout", 649 | "output_type": "stream", 650 | "text": [ 651 | "labels before pre-processing: Benign 7927630\n", 652 | "Brute Force -Web 362\n", 653 | "Brute Force -XSS 151\n", 654 | "SQL Injection 53\n", 655 | "Name: Label, dtype: int64\n", 656 | "labels after pre-processing: NeedManualLabel 7928196\n", 657 | "Name: Label, dtype: int64\n", 658 | "label count after labelling:\r\n", 659 | " BENIGN 7927736\n", 660 | "Web Attack - Brute Force - Attempted 184\n", 661 | "Web Attack - XSS - Attempted 75\n", 662 | "Web Attack - XSS 73\n", 663 | "Web Attack - Brute Force 62\n", 664 | "Web Attack - SQL - Attempted 43\n", 665 | "Web Attack - SQL 23\n", 666 | "Name: Label, dtype: int64\n", 667 | "Attempted Category count after labelling:\r\n", 668 | " -1 7927894\n", 669 | " 0 231\n", 670 | " 5 60\n", 671 | " 2 11\n", 672 | "Name: Attempted Category, dtype: int64\n" 673 | ] 674 | } 675 | ], 676 | "source": [ 677 | "#-------------------+\n", 678 | "# FRIDAY 23-02-2018 |\n", 679 | "#-------------------+\n", 680 | "\n", 681 | "dir_name = \"Friday-23-02-2018\"\n", 682 | "friday_23022018_df, friday_23022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n", 683 | "\n", 684 | "#-- Web Attack SQL\n", 685 | "label_flows(friday_23022018_df, \"Web Attack - SQL\", 1519412792126122000, 1519413444947957000 , [\"18.218.115.60\"],\n", 686 | " [\"172.31.69.28\"], also_flip_flow_direction=True, additional_filters=\n", 687 | " [friday_23022018_df[\"TotLen Fwd Pkts\"] > 0,\n", 688 | " friday_23022018_df[\"TotLen Bwd Pkts\"] > 0])\n", 689 | "\n", 690 | "# Attack startup artefact\n", 691 | "label_flows(friday_23022018_df, \"Web Attack - SQL - Attempted\", 1519412722*(10**9), 1519412787*(10**9) , [\"18.218.115.60\"],\n", 692 | " [\"172.31.69.28\"], attempted_category=2, also_flip_flow_direction=True)\n", 693 | "\n", 694 | "# Payload filter\n", 695 | "label_flows(friday_23022018_df, \"Web Attack - SQL - Attempted\", 1519412792126122000, 1519413444947957000 , [\"18.218.115.60\"],\n", 696 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n", 697 | "\n", 698 | "#-- Web Attack XSS\n", 699 | "label_flows(friday_23022018_df, \"Web Attack - XSS\", 1519405264559707000, 1519409428237472000, [\"18.218.115.60\"],\n", 700 | " [\"172.31.69.28\"], additional_filters=\n", 701 | " [~(friday_23022018_df[\"Src Port\"].isin([59173]))])\n", 702 | "\n", 703 | "#Flip\n", 704 | "label_flows(friday_23022018_df, \"Web Attack - XSS\", 1519405264559707000, 1519409428237472000,\n", 705 | " [\"172.31.69.28\"], [\"18.218.115.60\"], additional_filters=\n", 706 | " [~(friday_23022018_df[\"Dst Port\"].isin([59173]))])\n", 707 | "\n", 708 | "label_flows(friday_23022018_df, \"Web Attack - XSS - Attempted\", 1519405264559707000, 1519409428237472000, [\"18.218.115.60\"],\n", 709 | " [\"172.31.69.28\"], attempted_category=2, src_port_list=[59173], also_flip_flow_direction=True)\n", 710 | "\n", 711 | "# Payload filter\n", 712 | "label_flows(friday_23022018_df, \"Web Attack - XSS - Attempted\", 1519405264559707000, 1519409428237472000, [\"18.218.115.60\"],\n", 713 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n", 714 | "\n", 715 | "#-- Web Attack Brute Force & Attempted\n", 716 | "label_flows(friday_23022018_df, \"Web Attack - Brute Force\", 1519394670193975000, 1519398186406294000, [\"18.218.115.60\"],\n", 717 | " [\"172.31.69.28\"], additional_filters=\n", 718 | " [friday_23022018_df[\"Tot Fwd Pkts\"] > 20])\n", 719 | "\n", 720 | "#Flip\n", 721 | "label_flows(friday_23022018_df, \"Web Attack - Brute Force\", 1519394670193975000, 1519398186406294000,\n", 722 | " [\"172.31.69.28\"], [\"18.218.115.60\"], additional_filters=\n", 723 | " [friday_23022018_df[\"Tot Bwd Pkts\"] > 20])\n", 724 | "\n", 725 | "label_flows(friday_23022018_df, \"Web Attack - Brute Force - Attempted\", 1519394670193975000, 1519398186406294000,\n", 726 | " [\"18.218.115.60\"], [\"172.31.69.28\"], attempted_category=5, additional_filters=\n", 727 | " [(friday_23022018_df[\"Tot Fwd Pkts\"] <= 20) & (friday_23022018_df[\"TotLen Fwd Pkts\"] > 0)])\n", 728 | "\n", 729 | "#Flip\n", 730 | "label_flows(friday_23022018_df, \"Web Attack - Brute Force - Attempted\", 1519394670193975000, 1519398186406294000,\n", 731 | " [\"172.31.69.28\"], [\"18.218.115.60\"], attempted_category=5, additional_filters=\n", 732 | " [(friday_23022018_df[\"Tot Bwd Pkts\"] <= 20) & (friday_23022018_df[\"TotLen Bwd Pkts\"] > 0)])\n", 733 | "\n", 734 | "# Payload filter:\n", 735 | "label_flows(friday_23022018_df, \"Web Attack - Brute Force - Attempted\", 1519394670193975000, 1519398186406294000,\n", 736 | " [\"18.218.115.60\"], [\"172.31.69.28\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n", 737 | "\n", 738 | "label_rest_as_benign_and_write_csv(friday_23022018_df, friday_23022018_df_header_rows, DATASET_PATH + dir_name + \".csv\")\n", 739 | "\n", 740 | "friday_23022018_df = None" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 10, 746 | "metadata": {}, 747 | "outputs": [ 748 | { 749 | "name": "stderr", 750 | "output_type": "stream", 751 | "text": [ 752 | "/tmp/ipykernel_65171/1487319557.py:6: DtypeWarning: Columns (2,4,5,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82) have mixed types.Specify dtype option on import or set low_memory=False.\n", 753 | " wednesday_28022018_df, wednesday_28022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n" 754 | ] 755 | }, 756 | { 757 | "name": "stdout", 758 | "output_type": "stream", 759 | "text": [ 760 | "labels before pre-processing: Benign 544200\n", 761 | "Infilteration 68871\n", 762 | "Label 33\n", 763 | "Name: Label, dtype: int64\n", 764 | "labels after pre-processing: NeedManualLabel 613072\n", 765 | "Name: Label, dtype: int64\n", 766 | "label count after labelling:\r\n", 767 | " BENIGN 553425\n", 768 | "Infiltration - NMAP Portscan 59494\n", 769 | "Infiltration - Dropbox Download - Attempted 63\n", 770 | "Infiltration - Dropbox Download 46\n", 771 | "Infiltration - Communication Victim Attacker 44\n", 772 | "Name: Label, dtype: int64\n", 773 | "Attempted Category count after labelling:\r\n", 774 | " -1 613009\n", 775 | " 0 39\n", 776 | " 4 24\n", 777 | "Name: Attempted Category, dtype: int64\n" 778 | ] 779 | } 780 | ], 781 | "source": [ 782 | "#----------------------+\n", 783 | "# WEDNESDAY 28-02-2018 |\n", 784 | "#----------------------+\n", 785 | "\n", 786 | "dir_name = \"Wednesday-28-02-2018\"\n", 787 | "wednesday_28022018_df, wednesday_28022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n", 788 | "\n", 789 | "#-- Infiltration - Dropbox Download\n", 790 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download\", 1519828404*(10**9), 1519829172*(10**9),\n", 791 | " [\"172.31.69.24\"],\n", 792 | " [\"162.125.3.1\", \"162.125.3.5\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"],\n", 793 | " also_flip_flow_direction=True)\n", 794 | "\n", 795 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download\", 1519839771*(10**9), 1519839824*(10**9),\n", 796 | " [\"172.31.69.24\"],\n", 797 | " [\"162.125.3.1\", \"162.125.3.5\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"],\n", 798 | " also_flip_flow_direction=True)\n", 799 | "\n", 800 | "# Payload filter\n", 801 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download - Attempted\", 1519828404*(10**9), 1519829172*(10**9),\n", 802 | " [\"172.31.69.24\"],\n", 803 | " [\"162.125.3.1\", \"162.125.3.5\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"],\n", 804 | " attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n", 805 | "\n", 806 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download - Attempted\", 1519839771*(10**9), 1519839824*(10**9),\n", 807 | " [\"172.31.69.24\"],\n", 808 | " [\"162.125.3.1\", \"162.125.3.5\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"],\n", 809 | " attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n", 810 | "\n", 811 | "# Attempted - Attack artefact\n", 812 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download - Attempted\", 1519828404*(10**9), 1519829172*(10**9),\n", 813 | " [\"172.31.69.24\"],\n", 814 | " [\"104.16.100.29\", \"104.16.99.29\", \"52.84.128.3\", \"52.85.101.236\", \"52.85.131.81\", \"52.85.95.206\"], attempted_category=4, also_flip_flow_direction=True)\n", 815 | "\n", 816 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download - Attempted\", 1519839771*(10**9), 1519839824*(10**9),\n", 817 | " [\"172.31.69.24\"],\n", 818 | " [\"104.16.100.29\", \"104.16.99.29\", \"52.84.128.3\", \"52.85.101.236\", \"52.85.131.81\", \"52.85.95.206\"], attempted_category=4, also_flip_flow_direction=True)\n", 819 | "\n", 820 | "#-- Infiltration - Communication Victim Attacker\n", 821 | "label_flows(wednesday_28022018_df, \"Infiltration - Communication Victim Attacker\", 1519829140*(10**9),\n", 822 | " 1519834135*(10**9), [\"172.31.69.24\"], [\"13.58.225.34\"], also_flip_flow_direction=True)\n", 823 | "\n", 824 | "label_flows(wednesday_28022018_df, \"Infiltration - Communication Victim Attacker\", 1519839839*(10**9),\n", 825 | " 1519843199*(10**9), [\"172.31.69.24\"], [\"13.58.225.34\"], also_flip_flow_direction=True)\n", 826 | "\n", 827 | "# Payload filter\n", 828 | "\n", 829 | "label_flows(wednesday_28022018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519829140*(10**9),\n", 830 | " 1519834135*(10**9), [\"172.31.69.24\"], [\"13.58.225.34\"], payload_filter=True, also_flip_flow_direction=True)\n", 831 | "\n", 832 | "label_flows(wednesday_28022018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519839839*(10**9),\n", 833 | " 1519843199*(10**9), [\"172.31.69.24\"], [\"13.58.225.34\"], payload_filter=True, also_flip_flow_direction=True)\n", 834 | "\n", 835 | "#-- Infiltration - NMAP Portscan\n", 836 | "label_flows(wednesday_28022018_df, \"Infiltration - NMAP Portscan\", 1519829182*(10**9), 1519843140746247000,\n", 837 | " [\"172.31.69.24\"],\n", 838 | " [\"172.31.69.1\", \"172.31.69.10\", \"172.31.69.11\", \"172.31.69.12\", \"172.31.69.13\", \"172.31.69.14\",\n", 839 | " \"172.31.69.16\", \"172.31.69.17\", \"172.31.69.19\", \"172.31.69.20\", \"172.31.69.23\", \"172.31.69.4\",\n", 840 | " \"172.31.69.5\", \"172.31.69.6\", \"172.31.69.8\", \"172.31.69.9\", \"172.31.69.7\", \"172.31.69.22\",\n", 841 | " \"172.31.69.15\", \"172.31.69.21\", \"172.31.69.18\",], additional_filters=\n", 842 | " [~(wednesday_28022018_df[\"Src Port\"] == 68)])\n", 843 | "\n", 844 | "label_rest_as_benign_and_write_csv(wednesday_28022018_df, wednesday_28022018_df_header_rows, DATASET_PATH + dir_name + \".csv\")\n", 845 | "\n", 846 | "wednesday_28022018_df = None" 847 | ] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": 11, 852 | "metadata": {}, 853 | "outputs": [ 854 | { 855 | "name": "stderr", 856 | "output_type": "stream", 857 | "text": [ 858 | "/tmp/ipykernel_65171/25895083.py:6: DtypeWarning: Columns (2,4,5,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82) have mixed types.Specify dtype option on import or set low_memory=False.\n", 859 | " thursday_01032018_df, thursday_01032018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n" 860 | ] 861 | }, 862 | { 863 | "name": "stdout", 864 | "output_type": "stream", 865 | "text": [ 866 | "labels before pre-processing: Benign 238037\n", 867 | "Infilteration 93063\n", 868 | "Label 25\n", 869 | "Name: Label, dtype: int64\n", 870 | "labels after pre-processing: NeedManualLabel 331101\n", 871 | "Name: Label, dtype: int64\n", 872 | "label count after labelling:\r\n", 873 | " BENIGN 290058\n", 874 | "Infiltration - NMAP Portscan 40804\n", 875 | "Infiltration - Communication Victim Attacker 162\n", 876 | "Infiltration - Dropbox Download 39\n", 877 | "Infiltration - Dropbox Download - Attempted 37\n", 878 | "Infiltration - Communication Victim Attacker - Attempted 1\n", 879 | "Name: Label, dtype: int64\n", 880 | "Attempted Category count after labelling:\r\n", 881 | " -1 331063\n", 882 | " 4 21\n", 883 | " 0 17\n", 884 | "Name: Attempted Category, dtype: int64\n" 885 | ] 886 | } 887 | ], 888 | "source": [ 889 | "#---------------------+\n", 890 | "# THURSDAY 01-03-2018 |\n", 891 | "#---------------------+\n", 892 | "\n", 893 | "dir_name = \"Thursday-01-03-2018\"\n", 894 | "thursday_01032018_df, thursday_01032018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n", 895 | "\n", 896 | "#-- Infiltration - Dropbox Download\n", 897 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download\", 1519912390*(10**9), 1519916360*(10**9),\n", 898 | " [\"172.31.69.13\"], [\"162.125.3.1\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"], also_flip_flow_direction=True)\n", 899 | "\n", 900 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download\", 1519913032*(10**9), 1519918454*(10**9),\n", 901 | " [\"172.31.69.13\"], [\"162.125.3.1\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"], also_flip_flow_direction=True)\n", 902 | "\n", 903 | "# Payload filter\n", 904 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download - Attempted\", 1519912390*(10**9), 1519916360*(10**9),\n", 905 | " [\"172.31.69.13\"],\n", 906 | " [\"162.125.3.1\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n", 907 | "\n", 908 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download - Attempted\", 1519913032*(10**9), 1519918454*(10**9),\n", 909 | " [\"172.31.69.13\"],\n", 910 | " [\"162.125.3.1\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\", \"104.16.100.29\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n", 911 | "\n", 912 | "# Attempted - Attack artefact\n", 913 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download - Attempted\", 1519912390*(10**9), 1519916360*(10**9),\n", 914 | " [\"172.31.69.13\"], [\"104.16.100.29\", \"13.32.168.125\", \"52.85.112.72\"], attempted_category=4, also_flip_flow_direction=True)\n", 915 | "\n", 916 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download - Attempted\", 1519913032*(10**9), 1519918454*(10**9),\n", 917 | " [\"172.31.69.13\"], [\"104.16.100.29\", \"13.32.168.125\", \"52.85.112.72\"], attempted_category=4, also_flip_flow_direction=True)\n", 918 | "\n", 919 | "#-- Infiltration - Communication Victim Attacker\n", 920 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker\", 1519912674*(10**9),\n", 921 | " 1519912745*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"], also_flip_flow_direction=True)\n", 922 | "\n", 923 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker\", 1519913075*(10**9),\n", 924 | " 1519928245*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"], also_flip_flow_direction=True)\n", 925 | "\n", 926 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker\", 1519928295*(10**9),\n", 927 | " 1519933041*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"], also_flip_flow_direction=True)\n", 928 | "\n", 929 | "# Payload filter\n", 930 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519912674*(10**9),\n", 931 | " 1519912745*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"], attempted_category=0,\n", 932 | " payload_filter=True, also_flip_flow_direction=True)\n", 933 | "\n", 934 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519913075*(10**9),\n", 935 | " 1519928245*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"], attempted_category=0,\n", 936 | " payload_filter=True, also_flip_flow_direction=True)\n", 937 | "\n", 938 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519928295*(10**9),\n", 939 | " 1519933041*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"], attempted_category=0,\n", 940 | " payload_filter=True, also_flip_flow_direction=True)\n", 941 | "\n", 942 | "#-- Infiltration - NMAP Portscan\n", 943 | "label_flows(thursday_01032018_df, \"Infiltration - NMAP Portscan\", 1519913388*(10**9), 1519933092182726000,\n", 944 | " [\"172.31.69.13\"],\n", 945 | " [\"172.31.69.1\", \"172.31.69.11\", \"172.31.69.12\", \"172.31.69.16\", \"172.31.69.8\", \"172.31.69.9\",\n", 946 | " \"172.31.69.10\", \"172.31.69.14\", \"172.31.69.4\", \"172.31.69.5\", \"172.31.69.6\", \"172.31.69.17\",\n", 947 | " \"172.31.69.20\", \"172.31.69.23\", \"172.31.69.24\", \"172.31.69.19\", \"172.31.69.7\", \"172.31.69.15\",\n", 948 | " \"172.31.69.18\", \"172.31.69.22\", \"172.31.69.21\"], additional_filters=\n", 949 | " [thursday_01032018_df[\"Src Port\"] != 68])\n", 950 | "\n", 951 | "label_rest_as_benign_and_write_csv(thursday_01032018_df, thursday_01032018_df_header_rows, DATASET_PATH + dir_name + \".csv\")\n", 952 | "\n", 953 | "thursday_01032018_df = None" 954 | ] 955 | }, 956 | { 957 | "cell_type": "code", 958 | "execution_count": 12, 959 | "metadata": {}, 960 | "outputs": [ 961 | { 962 | "name": "stdout", 963 | "output_type": "stream", 964 | "text": [ 965 | "labels before pre-processing: Benign 7931011\n", 966 | "Bot 286191\n", 967 | "Name: Label, dtype: int64\n", 968 | "labels after pre-processing: NeedManualLabel 8217202\n", 969 | "Name: Label, dtype: int64\n", 970 | "label count after labelling:\r\n", 971 | " BENIGN 7931011\n", 972 | "Botnet Ares - Attempted 143263\n", 973 | "Botnet Ares 142928\n", 974 | "Name: Label, dtype: int64\n", 975 | "Attempted Category count after labelling:\r\n", 976 | " -1 8073939\n", 977 | " 0 143263\n", 978 | "Name: Attempted Category, dtype: int64\n" 979 | ] 980 | } 981 | ], 982 | "source": [ 983 | "#-------------------+\n", 984 | "# FRIDAY 02-03-2018 |\n", 985 | "#-------------------+\n", 986 | "\n", 987 | "dir_name = \"Friday-02-03-2018\"\n", 988 | "friday_02032018_df, friday_02032018_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n", 989 | "\n", 990 | "#-- Botnet Ares\n", 991 | "label_flows(friday_02032018_df, \"Botnet Ares\", 1520000008*(10**9), 1520020492*(10**9), also_flip_flow_direction=True,\n", 992 | " additional_filters=\n", 993 | " [(friday_02032018_df[\"Src IP\"] == \"18.219.211.138\") | (friday_02032018_df[\"Dst IP\"] == \"18.219.211.138\")])\n", 994 | "\n", 995 | "# Payload filter\n", 996 | "label_flows(friday_02032018_df, \"Botnet Ares - Attempted\", 1520000008*(10**9), 1520020492*(10**9), attempted_category=0, additional_filters=\n", 997 | " [((friday_02032018_df[\"Src IP\"] == \"18.219.211.138\") | (friday_02032018_df[\"Dst IP\"] == \"18.219.211.138\")) &\n", 998 | " (friday_02032018_df[\"TotLen Fwd Pkts\"] == 0) & (friday_02032018_df[\"TotLen Bwd Pkts\"] == 0)])\n", 999 | "\n", 1000 | "label_rest_as_benign_and_write_csv(friday_02032018_df, friday_02032018_header_rows, DATASET_PATH + dir_name + \".csv\")\n", 1001 | "\n", 1002 | "friday_02032018_df = None" 1003 | ] 1004 | }, 1005 | { 1006 | "cell_type": "code", 1007 | "execution_count": null, 1008 | "metadata": {}, 1009 | "outputs": [], 1010 | "source": [] 1011 | } 1012 | ], 1013 | "metadata": { 1014 | "kernelspec": { 1015 | "display_name": "Python 3 (ipykernel)", 1016 | "language": "python", 1017 | "name": "python3" 1018 | }, 1019 | "language_info": { 1020 | "codemirror_mode": { 1021 | "name": "ipython", 1022 | "version": 3 1023 | }, 1024 | "file_extension": ".py", 1025 | "mimetype": "text/x-python", 1026 | "name": "python", 1027 | "nbconvert_exporter": "python", 1028 | "pygments_lexer": "ipython3", 1029 | "version": "3.8.12" 1030 | } 1031 | }, 1032 | "nbformat": 4, 1033 | "nbformat_minor": 1 1034 | } 1035 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Error Prevalence in NIDS Datasets: A Case Study on CIC-IDS-2017 and CSE-CIC-IDS-2018 2 | 3 | This repository contains the code used for our paper (Link to be added when proceedings are published). 4 | The code performs the labelling and benchmarking for the [CIC-IDS-2017](https://www.unb.ca/cic/datasets/ids-2017.html) 5 | and [CSE-CIC-IDS-2018](https://www.unb.ca/cic/datasets/ids-2018.html) datasets 6 | after it has been processed by [our modified version of the CICFlowMeter tool](https://github.com/GintsEngelen/CICFlowMeter). 7 | 8 | Note that all of this is *research code*. 9 | 10 | If you use the code in this repository, please cite our paper: 11 | 12 | @inproceedings{liu2022error, 13 | title={Error Prevalence in NIDS datasets: A Case Study on CIC-IDS-2017 and CSE-CIC-IDS-2018}, 14 | author={Liu, Lisa and Engelen, Gints and Lynar, Timothy and Essam, Daryl and Joosen, Wouter}, 15 | booktitle={2022 IEEE Conference on Communications and Network Security (CNS)}, 16 | pages={254--262}, 17 | year={2022}, 18 | organization={IEEE} 19 | } 20 | 21 | 22 | An extended documentation of our paper can be found [here](https://intrusion-detection.distrinet-research.be/CNS2022/). 23 | 24 | ## How to use this repository 25 | 26 | First, head over to the website of the dataset (either CIC-IDS-2017 or CSE-CIC-IDS-2018) and download 27 | the raw version of the dataset (PCAP file format). 28 | 29 | Then, navigate to "Original Network Traffic and Log data/Friday-02-03-2018/pcap" and delete the following file: 'capEC2AMAZ-O4EL3NG-172.31.69 - Copy.24' (This file contains traffic from the previous day and thus leads to duplicate flow entries). 30 | 31 | Then, first run [pcapfix](https://github.com/Rup0rt/pcapfix) and then [reordercap](https://www.wireshark.org/docs/man-pages/reordercap.html) 32 | on the PCAP files. 33 | 34 | Then, run [our modified version of the CICFlowMeter tool](https://github.com/GintsEngelen/CICFlowMeter) on the data 35 | obtained in the previous step: 36 | 37 | 1. Start the CICFlowMeter tool 38 | 2. Under the "NetWork" menu option, select "Offline" 39 | 3. Select the directory or directories containing the PCAP files. Note that for CSE-CIC-IDS-2018 you will have to run the 40 | CICFlowMeter tool multiple times, once for each directory (where each directory corresponds to one day) 41 | 5. Keep the default values for the "Flow TimeOut" and "Activity Timeout" parameters (120000000 and 5000000 respectively) 42 | 43 | This will generate the CSV files with the flows extracted from the raw PCAP files. 44 | 45 | For labelling of the CIC-IDS-2017 files, we used the CICIDS2017_labelling_fixed_CICFlowMeter.ipynb script. For labelling of the CSE-CIC-IDS-2018 files, we used the CICIDS2018_labelling_fixed_CICFlowMeter.ipynb script. 46 | 47 | The two scripts with "original_version" in their name were used for our experiments where we determined the impact of 48 | just the labelling errors on classifiers. These scripts should only be used if you wish to reproduce our experimental results 49 | as published in our paper. 50 | --------------------------------------------------------------------------------