├── .gitignore
├── .idea
├── .gitignore
├── CNS2022_Code.iml
├── inspectionProfiles
│ ├── Project_Default.xml
│ └── profiles_settings.xml
├── misc.xml
├── modules.xml
└── vcs.xml
├── Experiments_code
└── rf_feature_importance.py
├── Labelling
├── CICIDS2017_labelling_fixed_CICFlowMeter.ipynb
├── CICIDS2017_original_version_labelling.ipynb
├── CICIDS2018_labelling_fixed_CICFlowMeter.ipynb
└── CICIDS2018_original_version_labelling.ipynb
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Created by https://www.toptal.com/developers/gitignore/api/macos,pycharm,python
3 | # Edit at https://www.toptal.com/developers/gitignore?templates=macos,pycharm,python
4 |
5 | ### macOS ###
6 | # General
7 | .DS_Store
8 | .AppleDouble
9 | .LSOverride
10 |
11 | # Icon must end with two \r
12 | Icon
13 |
14 |
15 | # Thumbnails
16 | ._*
17 |
18 | # Files that might appear in the root of a volume
19 | .DocumentRevisions-V100
20 | .fseventsd
21 | .Spotlight-V100
22 | .TemporaryItems
23 | .Trashes
24 | .VolumeIcon.icns
25 | .com.apple.timemachine.donotpresent
26 |
27 | # Directories potentially created on remote AFP share
28 | .AppleDB
29 | .AppleDesktop
30 | Network Trash Folder
31 | Temporary Items
32 | .apdisk
33 |
34 | ### PyCharm ###
35 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
36 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
37 |
38 | # User-specific stuff
39 | .idea/**/workspace.xml
40 | .idea/**/tasks.xml
41 | .idea/**/usage.statistics.xml
42 | .idea/**/dictionaries
43 | .idea/**/shelf
44 |
45 | # AWS User-specific
46 | .idea/**/aws.xml
47 |
48 | # Generated files
49 | .idea/**/contentModel.xml
50 |
51 | # Sensitive or high-churn files
52 | .idea/**/dataSources/
53 | .idea/**/dataSources.ids
54 | .idea/**/dataSources.local.xml
55 | .idea/**/sqlDataSources.xml
56 | .idea/**/dynamic.xml
57 | .idea/**/uiDesigner.xml
58 | .idea/**/dbnavigator.xml
59 |
60 | # Gradle
61 | .idea/**/gradle.xml
62 | .idea/**/libraries
63 |
64 | # Gradle and Maven with auto-import
65 | # When using Gradle or Maven with auto-import, you should exclude module files,
66 | # since they will be recreated, and may cause churn. Uncomment if using
67 | # auto-import.
68 | # .idea/artifacts
69 | # .idea/compiler.xml
70 | # .idea/jarRepositories.xml
71 | # .idea/modules.xml
72 | # .idea/*.iml
73 | # .idea/modules
74 | # *.iml
75 | # *.ipr
76 |
77 | # CMake
78 | cmake-build-*/
79 |
80 | # Mongo Explorer plugin
81 | .idea/**/mongoSettings.xml
82 |
83 | # File-based project format
84 | *.iws
85 |
86 | # IntelliJ
87 | out/
88 |
89 | # mpeltonen/sbt-idea plugin
90 | .idea_modules/
91 |
92 | # JIRA plugin
93 | atlassian-ide-plugin.xml
94 |
95 | # Cursive Clojure plugin
96 | .idea/replstate.xml
97 |
98 | # SonarLint plugin
99 | .idea/sonarlint/
100 |
101 | # Crashlytics plugin (for Android Studio and IntelliJ)
102 | com_crashlytics_export_strings.xml
103 | crashlytics.properties
104 | crashlytics-build.properties
105 | fabric.properties
106 |
107 | # Editor-based Rest Client
108 | .idea/httpRequests
109 |
110 | # Android studio 3.1+ serialized cache file
111 | .idea/caches/build_file_checksums.ser
112 |
113 | ### PyCharm Patch ###
114 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
115 |
116 | # *.iml
117 | # modules.xml
118 | # .idea/misc.xml
119 | # *.ipr
120 |
121 | # Sonarlint plugin
122 | # https://plugins.jetbrains.com/plugin/7973-sonarlint
123 | .idea/**/sonarlint/
124 |
125 | # SonarQube Plugin
126 | # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
127 | .idea/**/sonarIssues.xml
128 |
129 | # Markdown Navigator plugin
130 | # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
131 | .idea/**/markdown-navigator.xml
132 | .idea/**/markdown-navigator-enh.xml
133 | .idea/**/markdown-navigator/
134 |
135 | # Cache file creation bug
136 | # See https://youtrack.jetbrains.com/issue/JBR-2257
137 | .idea/$CACHE_FILE$
138 |
139 | # CodeStream plugin
140 | # https://plugins.jetbrains.com/plugin/12206-codestream
141 | .idea/codestream.xml
142 |
143 | ### Python ###
144 | # Byte-compiled / optimized / DLL files
145 | __pycache__/
146 | *.py[cod]
147 | *$py.class
148 |
149 | # C extensions
150 | *.so
151 |
152 | # Distribution / packaging
153 | .Python
154 | build/
155 | develop-eggs/
156 | dist/
157 | downloads/
158 | eggs/
159 | .eggs/
160 | lib/
161 | lib64/
162 | parts/
163 | sdist/
164 | var/
165 | wheels/
166 | share/python-wheels/
167 | *.egg-info/
168 | .installed.cfg
169 | *.egg
170 | MANIFEST
171 |
172 | # PyInstaller
173 | # Usually these files are written by a python script from a template
174 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
175 | *.manifest
176 | *.spec
177 |
178 | # Installer logs
179 | pip-log.txt
180 | pip-delete-this-directory.txt
181 |
182 | # Unit test / coverage reports
183 | htmlcov/
184 | .tox/
185 | .nox/
186 | .coverage
187 | .coverage.*
188 | .cache
189 | nosetests.xml
190 | coverage.xml
191 | *.cover
192 | *.py,cover
193 | .hypothesis/
194 | .pytest_cache/
195 | cover/
196 |
197 | # Translations
198 | *.mo
199 | *.pot
200 |
201 | # Django stuff:
202 | *.log
203 | local_settings.py
204 | db.sqlite3
205 | db.sqlite3-journal
206 |
207 | # Flask stuff:
208 | instance/
209 | .webassets-cache
210 |
211 | # Scrapy stuff:
212 | .scrapy
213 |
214 | # Sphinx documentation
215 | docs/_build/
216 |
217 | # PyBuilder
218 | .pybuilder/
219 | target/
220 |
221 | # Jupyter Notebook
222 | .ipynb_checkpoints
223 |
224 | # IPython
225 | profile_default/
226 | ipython_config.py
227 |
228 | # pyenv
229 | # For a library or package, you might want to ignore these files since the code is
230 | # intended to run in multiple environments; otherwise, check them in:
231 | # .python-version
232 |
233 | # pipenv
234 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
235 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
236 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
237 | # install all needed dependencies.
238 | #Pipfile.lock
239 |
240 | # poetry
241 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
242 | # This is especially recommended for binary packages to ensure reproducibility, and is more
243 | # commonly ignored for libraries.
244 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
245 | #poetry.lock
246 |
247 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
248 | __pypackages__/
249 |
250 | # Celery stuff
251 | celerybeat-schedule
252 | celerybeat.pid
253 |
254 | # SageMath parsed files
255 | *.sage.py
256 |
257 | # Environments
258 | .env
259 | .venv
260 | env/
261 | venv/
262 | ENV/
263 | env.bak/
264 | venv.bak/
265 |
266 | # Spyder project settings
267 | .spyderproject
268 | .spyproject
269 |
270 | # Rope project settings
271 | .ropeproject
272 |
273 | # mkdocs documentation
274 | /site
275 |
276 | # mypy
277 | .mypy_cache/
278 | .dmypy.json
279 | dmypy.json
280 |
281 | # Pyre type checker
282 | .pyre/
283 |
284 | # pytype static type analyzer
285 | .pytype/
286 |
287 | # Cython debug symbols
288 | cython_debug/
289 |
290 | # PyCharm
291 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
292 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
293 | # and can be added to the global gitignore or merged into this file. For a more nuclear
294 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
295 | #.idea/
296 |
297 | # End of https://www.toptal.com/developers/gitignore/api/macos,pycharm,python
298 |
299 | Feature Importance/Random Forest/SHAP Figures/*
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 |
--------------------------------------------------------------------------------
/.idea/CNS2022_Code.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/Experiments_code/rf_feature_importance.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | from sklearn.model_selection import train_test_split
4 |
5 | sys.path = sorted(sys.path, key=lambda s:'envs' not in s)
6 | import os
7 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
8 | os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
9 | os.environ["MODIN_ENGINE"] = "dask"
10 |
11 | import time
12 | import random as rn
13 | import numpy as np
14 | from sklearn.metrics import classification_report
15 | import pandas as pd
16 | import json
17 | import cudf
18 | from cuml.dask.ensemble import RandomForestClassifier as cuRF
19 | import collections
20 | import dask_cudf
21 | from dask_cuda import LocalCUDACluster
22 | from dask.distributed import Client
23 | from helpers import load_data_database
24 | from glob import glob
25 | from feature_engine.selection import DropCorrelatedFeatures
26 |
27 |
28 | seed = 1 # seed values used: [1] for 2017, 2018 orig and 2018 fixed; [123] for 2017 fixed
29 | np.random.seed(seed)
30 | rn.seed(seed)
31 | year = 2018 # dataset year; embedded in the names of the JSON output files
32 | old = False # selects the "old" vs "new" infix of the feature-importance output file
33 | ngpus = 2 # number of partitions used when moving the data into dask_cudf
34 |
35 |
def makehash():
    """Build an arbitrarily deep, auto-vivifying dictionary.

    Every missing key yields a fresh ``defaultdict`` whose default factory is
    this function again, so chained assignments such as ``d[a][b][c] = v``
    create the intermediate levels on demand.
    """
    nested = collections.defaultdict(makehash)
    return nested
38 |
def sort_importances(unsorted):
    """Rank features by how much dropping them changed each per-label metric.

    ``unsorted`` maps a feature name to a report dictionary, plus one
    ``"benchmark"`` entry holding the report of the model trained on all
    features. A report maps label keys to dictionaries containing
    ``'precision'``, ``'recall'`` and ``'f1-score'``. Keys that are not
    numeric strings (such as the aggregate entries a classification report
    adds) are skipped.

    For every numeric label and metric, the difference
    ``score_without_feature - benchmark_score`` is stored per feature, and the
    (feature, difference) pairs are sorted in ascending order of difference.
    The result is written to ``"<year>_feature_importance_rf_sorted.json"``
    (``year`` is the module-level constant). Nothing is returned.
    """
    baseline = unsorted["benchmark"]
    metric_names = ('precision', 'recall', 'f1-score')
    deltas = makehash()

    for feature_name, report in unsorted.items():
        if feature_name == 'benchmark':
            continue
        for label, scores in report.items():
            if not label.isnumeric():
                continue
            reference = baseline[label]
            for metric in metric_names:
                deltas[label][metric][feature_name] = scores[metric] - reference[metric]

    # Order the (feature, delta) pairs of every label/metric by their delta.
    ranked = makehash()
    for label, per_metric in deltas.items():
        for metric, per_feature in per_metric.items():
            pairs = list(per_feature.items())
            pairs.sort(key=lambda pair: pair[1])
            ranked[label][metric] = pairs

    out_path = f"{year}_feature_importance_rf_sorted.json"

    with open(out_path, "w") as outfile:
        json.dump(ranked, outfile, indent=4)
77 |
78 |
def drop_col_feat_imp(X_full, y_full):
    """Drop-column feature importance with a multi-GPU random forest.

    Steps:
      1. Cast features to float32 and labels to int32, then remove features
         whose Pearson correlation exceeds 0.9 (``DropCorrelatedFeatures``).
      2. Split into stratified train/test sets (seeded with the module-level
         ``seed``; ``train_test_split``'s default split size is used).
      3. Train a benchmark forest on all remaining features, then retrain one
         forest per feature with that feature removed, storing the
         ``classification_report`` dictionary of every run.
      4. Write the reports to ``<year>_<old|new>_feature_importance_rf.json``
         and the correlation information to
         ``<year>_<old|new>_correlated_features_rf.json``.

    Parameters
    ----------
    X_full : pandas.DataFrame
        Feature matrix (converted with ``cudf.from_pandas`` after the split).
    y_full : pandas.Series
        Integer-encodable class labels aligned with ``X_full``.

    Returns
    -------
    dict
        ``{"benchmark": report, <feature name>: report, ...}`` where each
        report is the ``output_dict`` form of ``classification_report``.
    """
    features_f32 = X_full.astype('float32')
    y_full = y_full.astype('int32')

    tr = DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.9)

    print("Calculating correlated features:")
    X_full = tr.fit_transform(features_f32)
    print(f'Correlated Feature Sets: {str(tr.correlated_feature_sets_)}')
    print(f'Dropped features: {str(tr.features_to_drop_)}')

    X_train, X_test, y_train, y_test = train_test_split(
        X_full, y_full, random_state=seed, stratify=y_full, shuffle=True)

    # The indices are reset before partitioning: the original author observed
    # that otherwise sharding leads to problems with the classifier and the
    # labels not being in order.
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    X_train = dask_cudf.from_cudf(cudf.from_pandas(X_train), npartitions=ngpus).persist()
    X_test = dask_cudf.from_cudf(cudf.from_pandas(X_test), npartitions=ngpus).persist()
    y_train = dask_cudf.from_cudf(cudf.from_pandas(y_train), npartitions=ngpus).persist()
    y_test = dask_cudf.from_cudf(cudf.from_pandas(y_test), npartitions=ngpus).persist()

    # The ground truth is identical for every model, so materialise it once
    # instead of recomputing it inside the per-column loop.
    y_true = y_test.compute().to_numpy()

    # dict for storing the classification report of every model
    importances = {}
    rf = cuRF(max_depth=30, n_estimators=100, random_state=seed, verbose=True, n_streams=25)
    st = time.time()
    print(f"Starting base- {st}")
    rf.fit(X_train, y_train, convert_dtype=True)
    y_pred = rf.predict(X_test)
    importances['benchmark'] = classification_report(y_true, y_pred.compute().to_numpy(), output_dict=True)
    print(f'Elapsed time time: {time.time()-st}')

    # Retrain without each column in turn; the reports are compared against
    # the benchmark later (see sort_importances).
    columns = list(X_train.columns)
    total = len(columns)
    for i, col in enumerate(columns):
        print(f"Doing col: {col} [{i}/{total}]")
        model_clone = cuRF(max_depth=30, n_estimators=100, random_state=seed, verbose=True, n_streams=25)
        model_clone.fit(X_train.drop(col, axis=1), y_train, convert_dtype=True)
        model_pred = model_clone.predict(X_test.drop(col, axis=1))
        importances[col] = classification_report(y_true, model_pred.compute().to_numpy(), output_dict=True)
        print(f'[{year}] - Finished col {col}. Elapsed time time: {time.time() - st}')

    print("Saving importance feature dict")
    variant = 'old' if old else 'new'
    filename = f'{year}_{variant}_feature_importance_rf.json'
    corr_filename = f'{year}_{variant}_correlated_features_rf.json'

    with open(filename, 'w') as fp:
        json.dump(importances, fp, indent=4)

    # The correlation details used to be appended as free text to the file
    # above, which made it invalid JSON. They are stored in their own JSON
    # file instead; sets are turned into sorted lists so they serialise.
    correlation_info = {
        'Correlated Feature Sets': [sorted(str(f) for f in group)
                                    for group in tr.correlated_feature_sets_],
        'Dropped features': sorted(str(f) for f in tr.features_to_drop_),
    }
    with open(corr_filename, 'w') as fp:
        json.dump(correlation_info, fp, indent=4)

    return importances
137 |
138 |
def load_from_local(folder_path):
    """Load every labelled CSV in ``folder_path`` into features and labels.

    For each ``*.csv`` file: leading spaces are stripped from column names,
    the identifier columns (``id``, ``Flow ID``, ``Src IP``, ``Src Port``,
    ``Dst IP``, ``Timestamp``, ``Attempted Category``) are dropped, infinite
    values are turned into NaN, every non-label column is converted to a
    (down-cast) float, and rows containing NaN are removed.

    The mapping from integer code to label name is written to
    ``<folder_path>/label_mapping.txt``.

    Parameters
    ----------
    folder_path : str
        Directory containing the CSV files.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The feature frame (without ``Label``) and the integer category codes
        of ``Label``, in matching row order.

    Raises
    ------
    ValueError
        If ``folder_path`` contains no CSV file.
    KeyError
        If one of the columns to drop is missing from a file.
    """
    # glob returns matches in arbitrary order; sort so the concatenated row
    # order (and with it any seeded split performed later) is reproducible.
    files = sorted(glob(folder_path + "/*.csv"))
    if not files:
        raise ValueError(f"No CSV files found in {folder_path}")

    csv_dataframes = []
    for file in files:
        print(f"-- Reading in {file}")
        df = pd.read_csv(file)
        print(df.columns)
        df.columns = df.columns.str.lstrip(" ")
        df.drop(['id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Timestamp', 'Attempted Category'], axis=1, inplace=True)
        # Treat positive and negative infinity alike, in both textual and
        # numeric form.
        df = df.replace(['Infinity', '-Infinity'], np.nan)
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.dropna()

        for column in df.columns:
            if column != 'Label':
                df[column] = pd.to_numeric(df[column], errors='coerce', downcast="float")

        # errors='coerce' turns unparsable values into NaN *after* the first
        # dropna, so remove those rows as well.
        df = df.dropna()

        csv_dataframes.append(df)

    df = pd.concat(csv_dataframes, ignore_index=True)
    labels = df['Label'].astype('category')
    y = pd.Series(labels.cat.codes)
    train = df.drop(['Label'], axis=1)

    with open(f'{folder_path}/label_mapping.txt', 'w') as f:
        f.write(str(dict(enumerate(labels.cat.categories))))

    return train, y
168 |
169 |
if __name__ == '__main__':
    # The folder holding the labelled CSV files is taken from the command
    # line (the previous hard-coded assignment was left empty and did not
    # parse).
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <folder containing the labelled CSV files>")
    file_path = sys.argv[1]

    # Create a Dask Cluster with one worker per GPU
    cluster = LocalCUDACluster()
    client = Client(cluster)

    try:
        training_featuresdf, labeldf = load_from_local(file_path)

        importances = drop_col_feat_imp(training_featuresdf, labeldf)
        sort_importances(importances)
    finally:
        # Release the workers even when loading or training fails.
        client.close()
        cluster.close()
--------------------------------------------------------------------------------
/Labelling/CICIDS2017_labelling_fixed_CICFlowMeter.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "import glob\n",
14 | "import os\n",
15 | "from sys import platform\n",
16 | "import datetime\n",
17 | "\n",
18 | "# THIS LABELLING SCRIPT IS USED TO LABEL THE CORRECTED VERSION OF CIC-IDS-2017. FOR DETAILS CONSULT OUR WEBSITE:\n",
19 | "# https://intrusion-detection.distrinet-research.be/CNS2022/index.html\n",
20 | "\n",
21 | "pd.set_option('display.max_rows', 100)\n",
22 | "\n",
23 | "# Enter the path that contains the CSV files that were generated by the CICFlowMeter tool. There should be five CSV\n",
24 | "# files in total, one per day.\n",
25 | "DATASET_PATH = \"\"\n",
26 | "\n",
27 | "# Enter the output path for the fully labelled CSV files\n",
28 | "OUTPUT_PATH = \"\"\n",
29 | "\n",
30 | "# If set to true, a column is added at the front of the CSV with line numbers\n",
31 | "print_index = True"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "outputs": [],
38 | "source": [
39 | "# Basic preprocessing before getting started on labelling.\n",
40 | "# Deletes rows with \"Infinity\" and NaNs, converts \"Timestamp\" to Pandas Datetime, and converts all necessary columns to\n",
41 | "# numeric values\n",
42 | "def format_csv_for_labelling(df):\n",
43 | " df = df.replace('Infinity', np.nan)\n",
44 | " df['Timestamp'] = pd.to_datetime(df['Timestamp'])\n",
45 | " for column in df.columns:\n",
46 | " if column not in ['Flow ID' , 'Timestamp', 'Src IP', 'Dst IP', 'Label']:\n",
47 | " df[column] = pd.to_numeric(df[column], errors='coerce')\n",
48 | "\n",
49 | " df.dropna()\n",
50 | "\n",
51 | " return df.dropna()\n",
52 | "\n",
53 | "def read_csvs_from_path_and_reformat(path):\n",
54 | " df = pd.read_csv(path, encoding='cp1252')\n",
55 | "\n",
56 | " df = format_csv_for_labelling(df)\n",
57 | " print(\"labels after pre-processing:\", df[\"Label\"].value_counts())\n",
58 | "\n",
59 | " df[\"Attempted Category\"] = -1\n",
60 | "\n",
61 | " int64_columns = [\"Total TCP Flow Time\"]\n",
62 | "\n",
63 | " int32_columns = [\"Src Port\", \"Dst Port\", \"Flow Duration\", \"Total Fwd Packet\", \"Total Bwd packets\", \"Total Length of Fwd Packet\", \"Total Length of Bwd Packet\", \"Fwd Packet Length Max\",\n",
64 | " \"Fwd Packet Length Min\", \"Bwd Packet Length Max\", \"Bwd Packet Length Min\", \"Flow IAT Max\", \"Flow IAT Min\", \"Fwd IAT Total\", \"Fwd IAT Max\", \"Fwd IAT Min\", \"Bwd IAT Total\",\n",
65 | " \"Bwd IAT Max\", \"Bwd IAT Min\", \"Fwd PSH Flags\", \"Bwd PSH Flags\", \"Fwd URG Flags\", \"Bwd URG Flags\", \"Packet Length Min\", \"Packet Length Max\", \"FIN Flag Count\", \"SYN Flag Count\", \"RST Flag Count\", \"PSH Flag Count\",\n",
66 | " \"ACK Flag Count\", \"URG Flag Count\", \"CWR Flag Count\", \"ECE Flag Count\", \"Subflow Fwd Packets\", \"Subflow Fwd Bytes\",\n",
67 | " \"Subflow Bwd Packets\", \"Subflow Bwd Bytes\", \"FWD Init Win Bytes\", \"Bwd Init Win Bytes\", \"Fwd Act Data Pkts\", \"Fwd Seg Size Min\", \"Active Max\",\n",
68 | " \"Active Min\", \"Idle Max\", \"Idle Min\"]\n",
69 | "\n",
70 | " int16_columns = [\"Fwd Header Length\", \"Bwd Header Length\", \"ICMP Code\", \"ICMP Type\"]\n",
71 | "\n",
72 | " for column in int64_columns:\n",
73 | " df[column] = df[column].astype('int64')\n",
74 | "\n",
75 | " for column in int32_columns:\n",
76 | " df[column] = df[column].astype('int32')\n",
77 | "\n",
78 | " for column in int16_columns:\n",
79 | " df[column] = df[column].astype('int16')\n",
80 | "\n",
81 | " return df\n",
82 | "\n",
83 | "# Main labelling function. Only used for labelling Malicious and Malicious - Attempted flows.\n",
84 | "# Timestamps are in NANOSECONDS (!) Unix time. Note that the CSV files are in the UTC timezone.\n",
85 | "# df = dataframe with flows. Note that labelling happens inplace on the 'df' parameter, and so this function doesn't return anything\n",
86 | "# label = the label that will be given to flows matching the criteria specified in the function\n",
87 | "# additional_filters = add any additional constraints that cannot be covered by the already provided function arguments\n",
88 | "# see examples in the actual labelling logic for correct syntax\n",
89 | "# attempted_category = please consult our website (https://intrusion-detection.distrinet-research.be/CNS2022/Tools_Documentation.html)\n",
90 | "# for details on how the \"Attempted\" categories are defined.\n",
91 | "# payload_filter = When set to true, this will automatically add a constraint [\"Total Length of Fwd Packet\"] == 0. Note that\n",
92 | "# the Attempted label and category still need to be specified manually\n",
93 | "def label_flows(df, label, attack_start_time_nanoseconds, attack_end_time_nanoseconds, src_ip_list=None,\n",
94 | " dst_ip_list= None, src_port_list=None, dst_port_list=None, additional_filters=[], attempted_category=-1, payload_filter=False):\n",
95 | "\n",
96 | "\n",
97 | " # Create initial mask for whole df with all values set to True. Squeeze is necessary to remove second axis (with value 1)\n",
98 | " # The reason is that a df of shape (X,) gets converted to (1,X) if you '&' it with a df of shape (X,1)\n",
99 | " mask = pd.DataFrame(True,index=df.index,columns=[df.columns[0]]).squeeze()\n",
100 | "\n",
101 | " attack_start_datetime = pd.to_datetime(attack_start_time_nanoseconds, unit='ns')\n",
102 | " attack_end_datetime = pd.to_datetime(attack_end_time_nanoseconds, unit='ns')\n",
103 | "\n",
104 | " mask &= (df[\"Timestamp\"] >= attack_start_datetime)\n",
105 | " mask &= (df[\"Timestamp\"] <= attack_end_datetime)\n",
106 | "\n",
107 | " if src_ip_list is not None:\n",
108 | " mask &= (df[\"Src IP\"].isin(src_ip_list))\n",
109 | " if dst_ip_list is not None:\n",
110 | " mask &= (df[\"Dst IP\"].isin(dst_ip_list))\n",
111 | "\n",
112 | " if src_port_list is not None:\n",
113 | " mask &= (df[\"Src Port\"].isin(src_port_list))\n",
114 | " if dst_port_list is not None:\n",
115 | " mask &= (df[\"Dst Port\"].isin(dst_port_list))\n",
116 | "\n",
117 | " if payload_filter:\n",
118 | " mask &= (df[\"Total Length of Fwd Packet\"] == 0)\n",
119 | "\n",
120 | " for filter in additional_filters:\n",
121 | " mask &= filter\n",
122 | "\n",
123 | " df[\"Label\"].mask(mask, label, inplace=True)\n",
124 | " df[\"Attempted Category\"].mask(mask, attempted_category, inplace=True)\n",
125 | "\n",
126 | "# This function is called when all labelling of malicious flows is completed. Anything that has not yet received a label\n",
127 | "# so far is labelled as Benign.\n",
128 | "def label_rest_as_benign_and_write_csv(df, file_to_write):\n",
129 | " df[\"Label\"].mask(df[\"Label\"] == \"NeedManualLabel\", \"BENIGN\", inplace=True)\n",
130 | "\n",
131 | " # Relabel artefact flows with [Flow ID] = '8.0.6.4-8.6.0.1-0-0-0' as BENIGN\n",
132 | " df[\"Label\"].mask(df[\"Flow ID\"] == '8.0.6.4-8.6.0.1-0-0-0', \"BENIGN\", inplace=True)\n",
133 | "\n",
134 | " print(\"label count after labelling:\\r\\n\", df[\"Label\"].value_counts())\n",
135 | " print(\"Attempted Category count after labelling:\\r\\n\", df[\"Attempted Category\"].value_counts())\n",
136 | "\n",
137 | " # Adds line numbers in the first column if print_index is set to true\n",
138 | " if print_index:\n",
139 | " df.reset_index(inplace=True, drop=True)\n",
140 | " df.index += 1\n",
141 | " df.index.name = 'id'\n",
142 | " df.to_csv(file_to_write)\n",
143 | " else:\n",
144 | " df.to_csv(file_to_write, index=False)\n"
145 | ],
146 | "metadata": {
147 | "collapsed": false
148 | }
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": 3,
153 | "outputs": [
154 | {
155 | "name": "stdout",
156 | "output_type": "stream",
157 | "text": [
158 | "labels after pre-processing: NeedManualLabel 371624\n",
159 | "Name: Label, dtype: int64\n",
160 | "label count after labelling:\r\n",
161 | " BENIGN 371624\n",
162 | "Name: Label, dtype: int64\n",
163 | "Attempted Category count after labelling:\r\n",
164 | " -1 371624\n",
165 | "Name: Attempted Category, dtype: int64\n"
166 | ]
167 | }
168 | ],
169 | "source": [
170 | "monday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"Monday-WorkingHours.pcap_Flow.csv\")\n",
171 | "\n",
172 | "label_rest_as_benign_and_write_csv(monday_df, OUTPUT_PATH + \"monday.csv\")"
173 | ],
174 | "metadata": {
175 | "collapsed": false
176 | }
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 4,
181 | "outputs": [
182 | {
183 | "name": "stdout",
184 | "output_type": "stream",
185 | "text": [
186 | "labels after pre-processing: NeedManualLabel 322078\n",
187 | "Name: Label, dtype: int64\n",
188 | "label count after labelling:\r\n",
189 | " BENIGN 315106\n",
190 | "FTP-Patator 3972\n",
191 | "SSH-Patator 2961\n",
192 | "SSH-Patator - Attempted 27\n",
193 | "FTP-Patator - Attempted 12\n",
194 | "Name: Label, dtype: int64\n",
195 | "Attempted Category count after labelling:\r\n",
196 | " -1 322039\n",
197 | " 3 27\n",
198 | " 0 10\n",
199 | " 2 2\n",
200 | "Name: Attempted Category, dtype: int64\n"
201 | ]
202 | }
203 | ],
204 | "source": [
205 | "#--------------------+\n",
206 | "# TUESDAY 04-07-2017 |\n",
207 | "#--------------------+\n",
208 | "\n",
209 | "tuesday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"Tuesday-WorkingHours.pcap_Flow.csv\")\n",
210 | "\n",
211 | "# FTP-PATATOR\n",
212 | "# -----------\n",
213 | "\n",
214 | "label_flows(tuesday_df, \"FTP-Patator\", 1499170672838272000, 1499174416931403000, [\"172.16.0.1\"],\n",
215 | " [\"192.168.10.50\"], dst_port_list=[21])\n",
216 | "\n",
217 | "# Default payload filter\n",
218 | "label_flows(tuesday_df, \"FTP-Patator - Attempted\", 1499170672838272000, 1499174416931403000, [\"172.16.0.1\"],\n",
219 | " [\"192.168.10.50\"], dst_port_list=[21], payload_filter=True, attempted_category=0)\n",
220 | "\n",
221 | "label_flows(tuesday_df, \"FTP-Patator - Attempted\", 1499170672838272000, 1499174416931403000, [\"172.16.0.1\"],\n",
222 | " [\"192.168.10.50\"], dst_port_list=[21], additional_filters=[(tuesday_df[\"Src Port\"] == 52108)],\n",
223 | " attempted_category=2)\n",
224 | "\n",
225 | "# SSH-Patator\n",
226 | "# -----------\n",
227 | "\n",
228 | "label_flows(tuesday_df, \"SSH-Patator\", 1499188141049616000, 1499195059018486000, [\"172.16.0.1\"],\n",
229 | " [\"192.168.10.50\"], dst_port_list=[22])\n",
230 | "\n",
231 | "label_flows(tuesday_df, \"SSH-Patator - Attempted\", 1499188141049616000, 1499195059018486000, [\"172.16.0.1\"],\n",
232 | " [\"192.168.10.50\"], dst_port_list=[22], payload_filter=True, attempted_category=0)\n",
233 | "\n",
234 | "label_flows(tuesday_df, \"SSH-Patator - Attempted\", 1499188141049616000, 1499195059018486000, [\"172.16.0.1\"],\n",
235 | " [\"192.168.10.50\"], dst_port_list=[22], additional_filters=\n",
236 | " [\n",
237 | " (tuesday_df[\"Total Length of Fwd Packet\"] <= 32) & (tuesday_df[\"Total Length of Bwd Packet\"] == 0)\n",
238 | " ], attempted_category=3)\n",
239 | "\n",
240 | "label_rest_as_benign_and_write_csv(tuesday_df, OUTPUT_PATH + \"tuesday.csv\")\n",
241 | "\n",
242 | "tuesday_df = None"
243 | ],
244 | "metadata": {
245 | "collapsed": false
246 | }
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 5,
251 | "outputs": [
252 | {
253 | "name": "stdout",
254 | "output_type": "stream",
255 | "text": [
256 | "labels after pre-processing: NeedManualLabel 496641\n",
257 | "Name: Label, dtype: int64\n",
258 | "label count after labelling:\r\n",
259 | " BENIGN 319120\n",
260 | "DoS Hulk 158468\n",
261 | "DoS GoldenEye 7567\n",
262 | "DoS Slowloris 3859\n",
263 | "DoS Slowhttptest - Attempted 3368\n",
264 | "DoS Slowloris - Attempted 1847\n",
265 | "DoS Slowhttptest 1740\n",
266 | "DoS Hulk - Attempted 581\n",
267 | "DoS GoldenEye - Attempted 80\n",
268 | "Heartbleed 11\n",
269 | "Name: Label, dtype: int64\n",
270 | "Attempted Category count after labelling:\r\n",
271 | " -1 490765\n",
272 | " 0 2927\n",
273 | " 6 2804\n",
274 | " 5 138\n",
275 | " 4 4\n",
276 | " 2 3\n",
277 | "Name: Attempted Category, dtype: int64\n"
278 | ]
279 | }
280 | ],
281 | "source": [
282 | "#----------------------+\n",
283 | "# WEDNESDAY 05-07-2017 |\n",
284 | "#----------------------+\n",
285 | "\n",
286 | "wednesday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"Wednesday-WorkingHours.pcap_Flow.csv\")\n",
287 | "\n",
288 | "# DoS Slowloris\n",
289 | "# -------------\n",
290 | "\n",
291 | "# Accidental early launch of the tool with wrong parameters\n",
292 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258926211817000, 1499258927000000000, [\"172.16.0.1\"],\n",
293 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=5)\n",
294 | "\n",
295 | "# Normal attack\n",
296 | "label_flows(wednesday_df, \"DoS Slowloris\", 1499258934539220000, 1499260278500956000, [\"172.16.0.1\"],\n",
297 | " [\"192.168.10.50\"], dst_port_list=[80], additional_filters=[\n",
298 | " ~(wednesday_df[\"Src Port\"].isin([33358, 33360, 33362, 54114]))\n",
299 | " ])\n",
300 | "\n",
301 | "# port 33358, 33360 and 33362 contain attack teardown flows\n",
302 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258934539220000, 1499260278500956000, [\"172.16.0.1\"],\n",
303 | " [\"192.168.10.50\"], src_port_list=[33358, 33360, 33362], dst_port_list=[80], attempted_category=2)\n",
304 | "\n",
305 | "#Payload filter (order is important, this part needs to come before Attempted category 6)\n",
306 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258934539220000, 1499260278500956000, [\"172.16.0.1\"],\n",
307 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=[\n",
308 | " ~(wednesday_df[\"Src Port\"].isin([33358, 33360, 33362, 54114]))\n",
309 | " ])\n",
310 | "\n",
311 | "#Target unresponsive because of DoS, no payloads in these flows\n",
312 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258934539220000, 1499260278500956000,\n",
313 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=6, additional_filters=[\n",
314 | " ~(wednesday_df[\"Dst Port\"].isin([33358, 33360, 33362, 54114])) & (wednesday_df[\"Total Length of Bwd Packet\"] == 0)\n",
315 | " & (wednesday_df[\"Flow Duration\"] >= 199800)\n",
316 | " ])\n",
317 | "\n",
318 | "# Artefact likely from authors checking the webserver\n",
319 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258934539220000, 1499260278500956000, [\"172.16.0.1\"],\n",
320 | " [\"192.168.10.50\"], src_port_list=[54114], dst_port_list=[80], attempted_category=4)\n",
321 | "\n",
322 | "# DoS Slowhttptest\n",
323 | "# ----------------\n",
324 | "\n",
325 | "label_flows(wednesday_df, \"DoS Slowhttptest\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n",
326 | " [\"192.168.10.50\"], dst_port_list=[80], additional_filters=[\n",
327 | " ~(wednesday_df[\"Src Port\"].isin([33372]))])\n",
328 | "\n",
329 | "\n",
330 | "# Attack startup artefact\n",
331 | "label_flows(wednesday_df, \"DoS Slowhttptest - Attempted\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n",
332 | " [\"192.168.10.50\"], src_port_list=[33372], dst_port_list=[80], attempted_category=2)\n",
333 | "\n",
334 | "# Payload filter\n",
335 | "label_flows(wednesday_df, \"DoS Slowhttptest - Attempted\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n",
336 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=[\n",
337 | " ~(wednesday_df[\"Src Port\"].isin([33372, 37670]))])\n",
338 | "\n",
339 | "# Retransmissions because target web server is brought down\n",
340 | "label_flows(wednesday_df, \"DoS Slowhttptest - Attempted\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n",
341 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=6, additional_filters=[\n",
342 | " ~(wednesday_df[\"Src Port\"].isin([33372, 37670])) & (wednesday_df[\"Total Length of Fwd Packet\"] == 0) &\n",
343 | " (wednesday_df[\"Flow Duration\"] >= 199984) & (wednesday_df[\"Total Bwd packets\"] == 0)\n",
344 | " ]\n",
345 | " )\n",
346 | "\n",
347 | "# Artefact from authors likely checking the webserver\n",
348 | "label_flows(wednesday_df, \"DoS Slowhttptest - Attempted\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n",
349 | " [\"192.168.10.50\"], src_port_list=[37670], dst_port_list=[80], attempted_category=4)\n",
350 | "\n",
351 | "\n",
352 | "# DoS Hulk\n",
353 | "# --------\n",
354 | "\n",
355 | "# Note that source ports 48678 and 43664 each carry a flow launched from the attacker IP while the attack is already\n",
356 | "# ongoing, containing a benign HTTP request. These flows are labelled as attack artefacts (Attempted category 4).\n",
357 | "label_flows(wednesday_df, \"DoS Hulk\", 1499262203194704000, 1499262299999999999, [\"172.16.0.1\"],\n",
358 | " [\"192.168.10.50\"], dst_port_list=[80], additional_filters=[\n",
359 | " ~(wednesday_df[\"Src Port\"].isin([48678 , 43664]))\n",
360 | " ])\n",
361 | "\n",
362 | "#Attack artefact - likely authors checking webserver mid-attack.\n",
363 | "label_flows(wednesday_df, \"DoS Hulk - Attempted\", 1499262203194704000, 1499262299999999999, [\"172.16.0.1\"],\n",
364 | " [\"192.168.10.50\"], src_port_list=[48678 , 43664], dst_port_list=[80], attempted_category=4)\n",
365 | "\n",
366 | "# Normal DoS Hulk\n",
367 | "label_flows(wednesday_df, \"DoS Hulk\", 1499262300000000000, 1499263641326171000, [\"172.16.0.1\"],\n",
368 | " [\"192.168.10.50\"], dst_port_list=[80])\n",
369 | "\n",
370 | "# Payload filter\n",
371 | "label_flows(wednesday_df, \"DoS Hulk - Attempted\", 1499262203194704000, 1499263641326171000, [\"172.16.0.1\"],\n",
372 | " [\"192.168.10.50\"], dst_port_list=[80], payload_filter=True, attempted_category=0, additional_filters=[\n",
373 | " ~(wednesday_df[\"Src Port\"].isin([48678 , 43664]))])\n",
374 | "\n",
375 | "# Artefacts caused by either the attack tool or non-empty TCP appendices. The reasoning is that 282 is the minimum size of a malicious payload, so smaller non-empty fwd payloads are marked as Attempted\n",
376 | "label_flows(wednesday_df, \"DoS Hulk - Attempted\", 1499262203194704000, 1499263641326171000, [\"172.16.0.1\"],\n",
377 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=3, additional_filters=[\n",
378 | " ~(wednesday_df[\"Src Port\"].isin([48678 , 43664])) & (wednesday_df[\"Total Length of Fwd Packet\"] > 0)\n",
379 | " & (wednesday_df[\"Total Length of Fwd Packet\"] < 282)\n",
380 | " ])\n",
381 | "\n",
382 | "# DoS GoldenEye\n",
383 | "# -------------\n",
384 | "\n",
385 | "label_flows(wednesday_df, \"DoS GoldenEye\", 1499263803231753000, 1499264408915718000, [\"172.16.0.1\"],\n",
386 | " [\"192.168.10.50\"], dst_port_list=[80])\n",
387 | "\n",
388 | "label_flows(wednesday_df, \"DoS GoldenEye - Attempted\", 1499263803231753000, 1499264408915718000, [\"172.16.0.1\"],\n",
389 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0, payload_filter=True)\n",
390 | "\n",
391 | "# Heartbleed\n",
392 | "# ----------\n",
393 | "\n",
394 | "label_flows(wednesday_df, \"Heartbleed\", 1499278335650811000, 1499279563294455000, [\"172.16.0.1\"],\n",
395 | " [\"192.168.10.51\"], dst_port_list=[444], additional_filters=[\n",
396 | " (wednesday_df[\"Src Port\"] == 45022)\n",
397 | " ])\n",
398 | "\n",
399 | "label_flows(wednesday_df, \"Heartbleed - Attempted\", 1499278335650811000, 1499279563294455000, [\"172.16.0.1\"],\n",
400 | " [\"192.168.10.51\"], dst_port_list=[444], attempted_category=0, payload_filter=True, additional_filters=[\n",
401 | " (wednesday_df[\"Src Port\"] == 45022)])\n",
402 | "\n",
403 | "label_rest_as_benign_and_write_csv(wednesday_df, OUTPUT_PATH + \"wednesday.csv\")\n",
404 | "\n",
405 | "wednesday_df = None"
406 | ],
407 | "metadata": {
408 | "collapsed": false
409 | }
410 | },
411 | {
412 | "cell_type": "code",
413 | "execution_count": 6,
414 | "outputs": [
415 | {
416 | "name": "stdout",
417 | "output_type": "stream",
418 | "text": [
419 | "labels after pre-processing: NeedManualLabel 362076\n",
420 | "Name: Label, dtype: int64\n",
421 | "label count after labelling:\r\n",
422 | " BENIGN 288172\n",
423 | "Infiltration - Portscan 71767\n",
424 | "Web Attack - Brute Force - Attempted 1292\n",
425 | "Web Attack - XSS - Attempted 655\n",
426 | "Web Attack - Brute Force 73\n",
427 | "Infiltration - Attempted 45\n",
428 | "Infiltration 36\n",
429 | "Web Attack - XSS 18\n",
430 | "Web Attack - SQL Injection 13\n",
431 | "Web Attack - SQL Injection - Attempted 5\n",
432 | "Name: Label, dtype: int64\n",
433 | "Attempted Category count after labelling:\r\n",
434 | " -1 360079\n",
435 | " 0 1908\n",
436 | " 4 71\n",
437 | " 2 18\n",
438 | "Name: Attempted Category, dtype: int64\n"
439 | ]
440 | }
441 | ],
442 | "source": [
443 | "#---------------------+\n",
444 | "# THURSDAY 06-07-2017 |\n",
445 | "#---------------------+\n",
446 | "\n",
447 | "thursday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"Thursday-WorkingHours.pcap_Flow.csv\")\n",
448 | "\n",
449 | "# Web Attack - Brute Force\n",
450 | "# ------------------------\n",
451 | "\n",
452 | "label_flows(thursday_df, \"Web Attack - Brute Force - Attempted\", 1499343354880049000, 1499343531179279000,\n",
453 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], attempted_category=2)\n",
454 | "\n",
455 | "label_flows(thursday_df, \"Web Attack - Brute Force\", 1499343567660566000, 1499346011622209000,\n",
456 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], additional_filters=\n",
457 | " [\n",
458 | " (thursday_df[\"Total Fwd Packet\"] > 20) | (thursday_df[\"Src Port\"] == 44464)\n",
459 | " ])\n",
460 | "\n",
461 | "label_flows(thursday_df, \"Web Attack - Brute Force - Attempted\", 1499343567660566000, 1499346011622209000,\n",
462 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], payload_filter=True, attempted_category=0,\n",
463 | " additional_filters=\n",
464 | " [~((thursday_df[\"Total Fwd Packet\"] > 20) | (thursday_df[\"Src Port\"] == 44464))])\n",
465 | "\n",
466 | "label_flows(thursday_df, \"Web Attack - Brute Force - Attempted\", 1499343567660566000, 1499346011622209000,\n",
467 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], attempted_category=4,\n",
468 | " additional_filters=\n",
469 | " [\n",
470 | " (thursday_df[\"Total Length of Fwd Packet\"] > 0) & ~(thursday_df[\"Src Port\"] == 44464) &\n",
471 | " (thursday_df[\"Total Fwd Packet\"] == 5) & (thursday_df[\"Total Bwd packets\"] == 5)\n",
472 | " ])\n",
473 | "\n",
474 | "# Web Attack - XSS\n",
475 | "# ----------------\n",
476 | "\n",
477 | "label_flows(thursday_df, \"Web Attack - XSS\", 1499346935283859000, 1499348121341704000, [\"172.16.0.1\"],\n",
478 | " [\"192.168.10.50\"], dst_port_list=[80], additional_filters=\n",
479 | " [\n",
480 | " ~(thursday_df[\"Src Port\"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &\n",
481 | " (thursday_df[\"Total Fwd Packet\"] >= 150)\n",
482 | " ])\n",
483 | "\n",
484 | "label_flows(thursday_df, \"Web Attack - XSS - Attempted\", 1499346935283859000, 1499348121341704000, [\"172.16.0.1\"],\n",
485 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=\n",
486 | " [~(thursday_df[\"Src Port\"].isin([36180, 36182, 36184, 36186, 36188, 36190]))])\n",
487 | "\n",
488 | "label_flows(thursday_df, \"Web Attack - XSS - Attempted\", 1499346935283859000, 1499348121341704000, [\"172.16.0.1\"],\n",
489 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=2, additional_filters=\n",
490 | " [\n",
491 | " ~(thursday_df[\"Src Port\"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &\n",
492 | " (thursday_df[\"Total Length of Fwd Packet\"] > 0) & (thursday_df[\"Total Fwd Packet\"] < 150)\n",
493 | " ])\n",
494 | "\n",
495 | "# Web Attack - SQL Injection\n",
496 | "# --------------------------\n",
497 | "\n",
498 | "label_flows(thursday_df, \"Web Attack - SQL Injection - Attempted\", 1499348127852814000, 1499348145720612000,\n",
499 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], attempted_category=2,\n",
500 | " additional_filters=[\n",
501 | " thursday_df[\"Src Port\"].isin([36180, 36182, 36184, 36186, 36188])\n",
502 | " ])\n",
503 | "\n",
504 | "label_flows(thursday_df, \"Web Attack - SQL Injection\", 1499348145732950000, 1499348575320284000,\n",
505 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80],\n",
506 | " additional_filters=[\n",
507 | " ~(thursday_df[\"Src Port\"].isin([36180, 36182, 36184, 36186, 36188]))\n",
508 | " ])\n",
509 | "\n",
510 | "label_flows(thursday_df, \"Web Attack - SQL Injection - Attempted\", 1499348127852814000, 1499348145720612000,\n",
511 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0,\n",
512 | " payload_filter=True)\n",
513 | "\n",
514 | "\n",
515 | "# Infiltration\n",
516 | "# ------------\n",
517 | "# 5.1 Dropbox Download\n",
518 | "\n",
519 | "\n",
520 | "label_flows(thursday_df, \"Infiltration\", 1499361542547210000, 1499366769364731000, [\"192.168.10.8\"], [\"205.174.165.73\"])\n",
521 | "\n",
522 | "label_flows(thursday_df, \"Infiltration - Attempted\", 1499361542547210000, 1499366769364731000, [\"192.168.10.8\"],\n",
523 | " [\"205.174.165.73\"], attempted_category=0, payload_filter=True)\n",
524 | "\n",
525 | "label_flows(thursday_df, \"Infiltration - Attempted\", 1499361228830533000, 1499361301251276000 , [\"192.168.10.9\"],\n",
526 | " [\"205.174.165.73\"], attempted_category=2)\n",
527 | "\n",
528 | "# 5.2 Cooldisk Mac\n",
529 | "\n",
530 | "label_flows(thursday_df, \"Infiltration\", 1499363616453990000, 1499371339347892000, [\"192.168.10.25\"], [\"205.174.165.73\"])\n",
531 | "\n",
532 | "label_flows(thursday_df, \"Infiltration - Attempted\", 1499363616453990000, 1499371339347892000, [\"192.168.10.25\"],\n",
533 | " [\"205.174.165.73\"], attempted_category=0, payload_filter=True)\n",
534 | "\n",
535 | "# 5.3 NMAP + Portscan\n",
536 | "\n",
537 | "# Round 1\n",
538 | "\n",
539 | "label_flows(thursday_df, \"Infiltration - Portscan\", 1499360431706755000, 1499360445728887000, [\"172.16.0.1\"],\n",
540 | " [\"192.168.10.51\"], additional_filters=[\n",
541 | " (thursday_df[\"Src Port\"] == 50122) | (thursday_df[\"Src Port\"] == 50133)\n",
542 | " ])\n",
543 | "\n",
544 | "# Round 2\n",
545 | "\n",
546 | "label_flows(thursday_df, \"Infiltration - Portscan\", 1499362410884008000, 1499362444285175000, [\"192.168.10.8\"],\n",
547 | " [\"192.168.10.5\"])\n",
548 | "\n",
549 | "# Round 3\n",
550 | "\n",
551 | "label_flows(thursday_df, \"Infiltration - Portscan\", 1499364314425162000, 1499366764331875000, [\"192.168.10.8\"],\n",
552 | " [\"192.168.10.5\", \"192.168.10.9\", \"192.168.10.12\", \"192.168.10.14\", \"192.168.10.15\", \"192.168.10.16\",\n",
553 | " \"192.168.10.17\", \"192.168.10.19\", \"192.168.10.25\", \"192.168.10.50\", \"192.168.10.51\"], additional_filters= [\n",
554 | " ~((thursday_df[\"Fwd Packet Length Max\"] == 408) & (thursday_df[\"Dst IP\"] == \"192.168.10.50\")) &\n",
555 | " ~((thursday_df[\"Total Length of Fwd Packet\"].isin([176, 20514])) & (thursday_df[\"Dst IP\"] == \"192.168.10.50\"))\n",
556 | " ]\n",
557 | ")\n",
558 | "\n",
559 | "label_rest_as_benign_and_write_csv(thursday_df, OUTPUT_PATH + \"thursday.csv\")\n",
560 | "\n",
561 | "thursday_df = None"
562 | ],
563 | "metadata": {
564 | "collapsed": false
565 | }
566 | },
567 | {
568 | "cell_type": "code",
569 | "execution_count": 7,
570 | "outputs": [
571 | {
572 | "name": "stdout",
573 | "output_type": "stream",
574 | "text": [
575 | "labels after pre-processing: NeedManualLabel 547557\n",
576 | "Name: Label, dtype: int64\n",
577 | "label count after labelling:\r\n",
578 | " BENIGN 288544\n",
579 | "Portscan 159066\n",
580 | "DDoS 95144\n",
581 | "Botnet - Attempted 4067\n",
582 | "Botnet 736\n",
583 | "Name: Label, dtype: int64\n",
584 | "Attempted Category count after labelling:\r\n",
585 | " -1 543490\n",
586 | " 1 4067\n",
587 | "Name: Attempted Category, dtype: int64\n"
588 | ]
589 | }
590 | ],
591 | "source": [
592 | "#---------------------+\n",
593 | "# FRIDAY 07-07-2017 |\n",
594 | "#---------------------+\n",
595 | "\n",
596 | "friday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"Friday-WorkingHours.pcap_Flow.csv\")\n",
597 | "\n",
598 | "# Portscan\n",
599 | "# --------\n",
600 | "\n",
601 | "#First round\n",
602 | "label_flows(friday_df, \"Portscan\", 1499446532117090000, 1499447948582083000, [\"172.16.0.1\"], [\"192.168.10.50\"])\n",
603 | "\n",
604 | "\n",
605 | "#Second round\n",
606 | "label_flows(friday_df, \"Portscan\", 1499449905450532000, 1499451841699238000, [\"172.16.0.1\"], [\"192.168.10.50\"])\n",
607 | "\n",
608 | "# Botnet\n",
609 | "# ------\n",
610 | "\n",
611 | "label_flows(friday_df, \"Botnet\", 1499432653990571000, 1499436122903736000, [\"192.168.10.15\", \"192.168.10.9\",\n",
612 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"])\n",
613 | "\n",
614 | "label_flows(friday_df, \"Botnet - Attempted\", 1499432653990571000, 1499436122903736000, [\"192.168.10.15\", \"192.168.10.9\",\n",
615 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], attempted_category=0,\n",
616 | " payload_filter=True)\n",
617 | "\n",
618 | "label_flows(friday_df, \"Botnet - Attempted\", 1499436180000000000, 1499457684606663000, [\"192.168.10.15\", \"192.168.10.9\",\n",
619 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], attempted_category=1)\n",
620 | "\n",
621 | "\n",
622 | "# DDoS\n",
623 | "# ----\n",
624 | "\n",
625 | "label_flows(friday_df, \"DDoS\", 1499453791796937000, 1499454972216560000, [\"172.16.0.1\"], [\"192.168.10.50\"])\n",
626 | "\n",
627 | "label_flows(friday_df, \"DDoS - Attempted\", 1499453791796937000, 1499454972216560000, [\"172.16.0.1\"], [\"192.168.10.50\"],\n",
628 | " attempted_category=0, payload_filter=True)\n",
629 | "\n",
630 | "label_rest_as_benign_and_write_csv(friday_df, OUTPUT_PATH + \"friday.csv\")\n",
631 | "\n",
632 | "friday_df = None"
633 | ],
634 | "metadata": {
635 | "collapsed": false
636 | }
637 | },
638 | {
639 | "cell_type": "code",
640 | "execution_count": 16,
641 | "outputs": [],
642 | "source": [],
643 | "metadata": {
644 | "collapsed": false
645 | }
646 | }
647 | ],
648 | "metadata": {
649 | "kernelspec": {
650 | "display_name": "Python 3",
651 | "language": "python",
652 | "name": "python3"
653 | },
654 | "language_info": {
655 | "codemirror_mode": {
656 | "name": "ipython",
657 | "version": 2
658 | },
659 | "file_extension": ".py",
660 | "mimetype": "text/x-python",
661 | "name": "python",
662 | "nbconvert_exporter": "python",
663 | "pygments_lexer": "ipython2",
664 | "version": "2.7.6"
665 | }
666 | },
667 | "nbformat": 4,
668 | "nbformat_minor": 0
669 | }
670 |
--------------------------------------------------------------------------------
/Labelling/CICIDS2017_original_version_labelling.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "import glob\n",
14 | "import os\n",
15 | "from sys import platform\n",
16 | "import datetime\n",
17 | "\n",
18 | "# THIS LABELLING SCRIPT IS USED TO LABEL THE OLD VERSION OF CIC-IDS-2017. THIS VERSION SHOULD ONLY BE USED IF YOU\n",
19 | "# WISH TO RECREATE OUR RESULTS AS REPORTED IN OUR PAPER: https://intrusion-detection.distrinet-research.be/CNS2022/index.html\n",
20 | "\n",
21 | "# THIS SCRIPT ACCEPTS AS INPUT THE ORIGINAL CSVs AS RELEASED BY THE DATASET AUTHORS: https://www.unb.ca/cic/datasets/ids-2017.html\n",
22 | "\n",
23 | "pd.set_option('display.max_rows', 100)\n",
24 | "\n",
25 | "\n",
26 | "DATASET_PATH = \"\"\n",
27 | "OUTPUT_PATH = \"\"\n",
28 | "\n",
29 | "# Set to False to omit the 1-based 'id' index column from the final csv (the index lets you refer to rows by line number)\n",
30 | "print_index = True"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "outputs": [],
37 | "source": [
38 | "def format_csv_for_labelling(df):\n",
39 | " # strip leading whitespaces in column names\n",
40 | " df.columns = df.columns.str.lstrip(\" \")\n",
41 | "\n",
42 | " print(\"labels before pre-processing:\", df[\"Label\"].value_counts())\n",
43 | "\n",
44 | " # Since the CICIDS 2017 authors used a 12-hour format but removed AM/PM, we need to reconstruct it.\n",
45 | " # Traffic was collected from 9:00 AM to 5:00 PM, so any timestamp with an hour below 7 must be PM (+12 hours).\n",
46 | " df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M')\n",
47 | " df['Timestamp'] = df['Timestamp'].apply(lambda x: x + pd.DateOffset(hours=12) if x.hour < 7 else x)\n",
48 | "\n",
49 | " # Convert to UTC from New Brunswick summer timezone (UTC-3)\n",
50 | " df['Timestamp'] = df['Timestamp'] + pd.DateOffset(hours=3)\n",
51 | "\n",
52 | " for column in df.columns:\n",
53 | " if column not in ['Flow ID' , 'Timestamp', 'Source IP', 'Destination IP', 'Label']:\n",
54 | " df[column] = pd.to_numeric(df[column])\n",
55 | "\n",
56 | " # Add attempted category column and initialise to -1\n",
57 | " df[\"Attempted Category\"] = -1\n",
58 | "\n",
59 | " # CICIDS 2017 author-released version comes prelabelled. This makes sure previous labels don't interfere\n",
60 | " df[\"Label\"] = \"NeedManualLabel\"\n",
61 | "\n",
62 | " print(\"labels after pre-processing:\", df[\"Label\"].value_counts())\n",
63 | "\n",
64 | " return df\n",
65 | "\n",
66 | "def read_csvs_from_path_and_reformat(path):\n",
67 | " df= pd.read_csv(path, encoding='cp1252')\n",
68 | "\n",
69 | " df = format_csv_for_labelling(df)\n",
70 | "\n",
71 | " return df\n",
72 | "\n",
73 | "def label_flows(df, label, attack_start_time_nanoseconds, attack_end_time_nanoseconds, src_ip_list=None,\n",
74 | " dst_ip_list= None, src_port_list=None, dst_port_list=None, attempted_category = -1, additional_filters=[],\n",
75 | " also_flip_flow_direction=False, payload_filter=False):\n",
76 | " # Note that labelling happens inplace on the 'df' parameter, and so this function doesn't return anything\n",
77 | "\n",
78 | " # Create initial mask with all values set to True. Squeeze is necessary to remove second axis (with value 1)\n",
79 | " # The reason is that a df of shape (X,) gets converted to (1,X) if you '&' it with a df of shape (X,1)\n",
80 | " custom_mask = pd.DataFrame(True, index=df.index, columns=[df.columns[0]]).squeeze()\n",
81 | "\n",
82 | " # Flow timestamps only have minute resolution, so the start time must be rounded down to the minute;\n",
83 | " # otherwise some flows at the start of the attack would be labelled as benign\n",
84 | " attack_start_datetime = pd.to_datetime(attack_start_time_nanoseconds, unit='ns').floor(freq='T')\n",
85 | " attack_end_datetime = pd.to_datetime(attack_end_time_nanoseconds, unit='ns')\n",
86 | "\n",
87 | " custom_mask &= (df[\"Timestamp\"] >= attack_start_datetime)\n",
88 | " custom_mask &= (df[\"Timestamp\"] <= attack_end_datetime)\n",
89 | "\n",
90 | " if src_ip_list is not None:\n",
91 | " custom_mask &= (df[\"Source IP\"].isin(src_ip_list))\n",
92 | " if dst_ip_list is not None:\n",
93 | " custom_mask &= (df[\"Destination IP\"].isin(dst_ip_list))\n",
94 | "\n",
95 | " if src_port_list is not None:\n",
96 | " custom_mask &= (df[\"Source Port\"].isin(src_port_list))\n",
97 | " if dst_port_list is not None:\n",
98 | " custom_mask &= (df[\"Destination Port\"].isin(dst_port_list))\n",
99 | "\n",
100 | " # IMPORTANT NOTE: If you decide to also match 'Total Length of Fwd Packets' == 6 to catch RST packets, you still have to manually update the additional_filters\n",
101 | " # of the calls that handle flipped flows by hand because they could not use the payload_filter argument\n",
102 | " if payload_filter:\n",
103 | " custom_mask &= (df[\"Total Length of Fwd Packets\"] == 0)\n",
104 | "\n",
105 | " for filter in additional_filters:\n",
106 | " custom_mask &= filter\n",
107 | "\n",
108 | " df[\"Label\"].mask(custom_mask, label, inplace=True)\n",
109 | " df[\"Attempted Category\"].mask(custom_mask, attempted_category, inplace=True)\n",
110 | "\n",
111 | " if also_flip_flow_direction:\n",
112 | " if additional_filters:\n",
113 | " raise AttributeError(\"Cannot set also_flip_flow_direction to True when additional_filters is not empty\")\n",
114 | "\n",
115 | " custom_mask = pd.DataFrame(True, index=df.index, columns=[df.columns[0]]).squeeze()\n",
116 | "\n",
117 | " custom_mask &= (df[\"Timestamp\"] >= attack_start_datetime)\n",
118 | " custom_mask &= (df[\"Timestamp\"] <= attack_end_datetime)\n",
119 | "\n",
120 | " if src_ip_list is not None:\n",
121 | " custom_mask &= (df[\"Destination IP\"].isin(src_ip_list))\n",
122 | " if dst_ip_list is not None:\n",
123 | " custom_mask &= (df[\"Source IP\"].isin(dst_ip_list))\n",
124 | "\n",
125 | " if src_port_list is not None:\n",
126 | " custom_mask &= (df[\"Destination Port\"].isin(src_port_list))\n",
127 | " if dst_port_list is not None:\n",
128 | " custom_mask &= (df[\"Source Port\"].isin(dst_port_list))\n",
129 | "\n",
130 | " if payload_filter:\n",
131 | " custom_mask &= (df[\"Total Length of Bwd Packets\"] == 0)\n",
132 | "\n",
133 | " for filter in additional_filters:\n",
134 | " custom_mask &= filter\n",
135 | "\n",
136 | " df[\"Label\"].mask(custom_mask, label, inplace=True)\n",
137 | " df[\"Attempted Category\"].mask(custom_mask, attempted_category, inplace=True)\n",
138 | "\n",
139 | "def label_rest_as_benign_and_write_csv(df, file_to_write):\n",
140 | " df[\"Label\"].mask(df[\"Label\"] == \"NeedManualLabel\", \"BENIGN\", inplace=True)\n",
141 | "\n",
142 | " # Relabel artefact flows with [Flow ID] = '8.0.6.4-8.6.0.1-0-0-0' to label = BENIGN\n",
143 | " df[\"Label\"].mask(df[\"Flow ID\"] == '8.0.6.4-8.6.0.1-0-0-0', \"BENIGN\", inplace=True)\n",
144 | "\n",
145 | " print(\"label count after labelling:\\r\\n\", df[\"Label\"].value_counts())\n",
146 | " print(\"Attempted Category count after labelling:\\r\\n\", df[\"Attempted Category\"].value_counts())\n",
147 | "\n",
148 | " if print_index:\n",
149 | " df.reset_index(inplace=True, drop=True)\n",
150 | " df.index += 1\n",
151 | " df.index.name = 'id'\n",
152 | " df.to_csv(file_to_write)\n",
153 | " else:\n",
154 | " df.to_csv(file_to_write, index=False)\n"
155 | ],
156 | "metadata": {
157 | "collapsed": false
158 | }
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 3,
163 | "outputs": [
164 | {
165 | "name": "stdout",
166 | "output_type": "stream",
167 | "text": [
168 | "labels before pre-processing: BENIGN 432074\n",
169 | "FTP-Patator 7938\n",
170 | "SSH-Patator 5897\n",
171 | "Name: Label, dtype: int64\n",
172 | "labels after pre-processing: NeedManualLabel 445909\n",
173 | "Name: Label, dtype: int64\n",
174 | "label count after labelling:\r\n",
175 | " BENIGN 430465\n",
176 | "FTP-Patator - Attempted 5489\n",
177 | "FTP-Patator 3991\n",
178 | "SSH-Patator - Attempted 3003\n",
179 | "SSH-Patator 2961\n",
180 | "Name: Label, dtype: int64\n",
181 | "Attempted Category count after labelling:\r\n",
182 | " -1 437417\n",
183 | " 3 6918\n",
184 | " 0 1571\n",
185 | " 2 3\n",
186 | "Name: Attempted Category, dtype: int64\n"
187 | ]
188 | }
189 | ],
190 | "source": [
191 | "#--------------------+\n",
192 | "# TUESDAY 04-07-2017 |\n",
193 | "#--------------------+\n",
194 | "\n",
195 | "tuesday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"tuesday/Tuesday-WorkingHours.pcap_ISCX.csv\")\n",
196 | "\n",
197 | "# FTP-PATATOR\n",
198 | "# -----------\n",
199 | "\n",
200 | "label_flows(tuesday_df, \"FTP-Patator\", 1499170672838272000, 1499174416931403000, [\"172.16.0.1\"],\n",
201 | " [\"192.168.10.50\"], dst_port_list=[21], also_flip_flow_direction=True)\n",
202 | "\n",
203 | "# Default payload filter\n",
204 | "label_flows(tuesday_df, \"FTP-Patator - Attempted\", 1499170672838272000, 1499174416931403000, [\"172.16.0.1\"],\n",
205 | " [\"192.168.10.50\"], dst_port_list=[21], payload_filter=True, also_flip_flow_direction=True, attempted_category=0)\n",
206 | "\n",
207 | "label_flows(tuesday_df, \"FTP-Patator - Attempted\", 1499170672838272000, 1499174416931403000, [\"172.16.0.1\"],\n",
208 | " [\"192.168.10.50\"], dst_port_list=[21], src_port_list=[52108], attempted_category=2)\n",
209 | "\n",
210 | "# Flows with RSTs that are technically TCP appendices, but not picked up by payload filter because of non-zero payload\n",
211 | "label_flows(tuesday_df, \"FTP-Patator - Attempted\", 1499170672838272000, 1499174416931403000, [\"172.16.0.1\"],\n",
212 | " [\"192.168.10.50\"], dst_port_list=[21], additional_filters=\n",
213 | " [\n",
214 | " (tuesday_df[\"Source Port\"] != 52108) & (tuesday_df[\"Total Length of Bwd Packets\"] == 0) &\n",
215 | " (tuesday_df[\"Total Length of Fwd Packets\"] > 0)\n",
216 | " ], attempted_category=3)\n",
217 | "\n",
218 | "label_flows(tuesday_df, \"FTP-Patator - Attempted\", 1499170672838272000, 1499174416931403000, [\"192.168.10.50\"],\n",
219 | " [\"172.16.0.1\"], src_port_list=[21], additional_filters=\n",
220 | " [\n",
221 | " (tuesday_df[\"Destination Port\"] != 52108) & (tuesday_df[\"Total Length of Fwd Packets\"] == 0) &\n",
222 | " (tuesday_df[\"Total Length of Bwd Packets\"] > 0)\n",
223 | " ], attempted_category=3)\n",
224 | "\n",
225 | "\n",
226 | "# SSH-Patator\n",
227 | "# -----------\n",
228 | "\n",
229 | "label_flows(tuesday_df, \"SSH-Patator\", 1499188141049616000, 1499195059018486000, [\"172.16.0.1\"],\n",
230 | " [\"192.168.10.50\"], dst_port_list=[22], also_flip_flow_direction=True)\n",
231 | "\n",
232 | "#Payload filter\n",
233 | "label_flows(tuesday_df, \"SSH-Patator - Attempted\", 1499188141049616000, 1499195059018486000, [\"172.16.0.1\"],\n",
234 | " [\"192.168.10.50\"], dst_port_list=[22], payload_filter=True, also_flip_flow_direction=True, attempted_category=0)\n",
235 | "\n",
236 | "label_flows(tuesday_df, \"SSH-Patator - Attempted\", 1499188141049616000, 1499195059018486000, [\"172.16.0.1\"],\n",
237 | " [\"192.168.10.50\"], dst_port_list=[22], additional_filters=\n",
238 | " [\n",
239 | " (tuesday_df[\"Total Length of Fwd Packets\"] <= 32) & (tuesday_df[\"Total Length of Bwd Packets\"] == 0)\n",
240 | " ], attempted_category=3)\n",
241 | "\n",
242 | "label_rest_as_benign_and_write_csv(tuesday_df, OUTPUT_PATH + \"Tuesday-WorkingHours.pcap_ISCX.csv\")\n",
243 | "\n",
244 | "tuesday_df = None"
245 | ],
246 | "metadata": {
247 | "collapsed": false
248 | }
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 3,
253 | "outputs": [
254 | {
255 | "ename": "FileNotFoundError",
256 | "evalue": "[Errno 2] No such file or directory: '/media/farodin/AEAA59A1AA59673D/CICIDS2017/CSV_newest_CICFlowMeter_20220728/Unlabelled/wednesday/Wednesday-workingHours.pcap_ISCX.csv'",
257 | "output_type": "error",
258 | "traceback": [
259 | "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
260 | "\u001B[0;31mFileNotFoundError\u001B[0m Traceback (most recent call last)",
261 | "Input \u001B[0;32mIn [3]\u001B[0m, in \u001B[0;36m\u001B[0;34m()\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[38;5;66;03m#----------------------+\u001B[39;00m\n\u001B[1;32m 2\u001B[0m \u001B[38;5;66;03m# WEDNESDAY 05-07-2017 |\u001B[39;00m\n\u001B[1;32m 3\u001B[0m \u001B[38;5;66;03m#----------------------+\u001B[39;00m\n\u001B[0;32m----> 5\u001B[0m wednesday_df \u001B[38;5;241m=\u001B[39m \u001B[43mread_csvs_from_path_and_reformat\u001B[49m\u001B[43m(\u001B[49m\u001B[43mDATASET_PATH\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m+\u001B[39;49m\u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mwednesday/Wednesday-workingHours.pcap_ISCX.csv\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[1;32m 7\u001B[0m \u001B[38;5;66;03m# DoS Slowloris\u001B[39;00m\n\u001B[1;32m 8\u001B[0m \u001B[38;5;66;03m# -------------\u001B[39;00m\n\u001B[1;32m 9\u001B[0m \n\u001B[1;32m 10\u001B[0m \u001B[38;5;66;03m# Accidental early launch of the tool with wrong parameters\u001B[39;00m\n\u001B[1;32m 11\u001B[0m label_flows(wednesday_df, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mDoS Slowloris - Attempted\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;241m1499258926211817000\u001B[39m, \u001B[38;5;241m1499258927000000000\u001B[39m, [\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m172.16.0.1\u001B[39m\u001B[38;5;124m\"\u001B[39m],\n\u001B[1;32m 12\u001B[0m [\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m192.168.10.50\u001B[39m\u001B[38;5;124m\"\u001B[39m], dst_port_list\u001B[38;5;241m=\u001B[39m[\u001B[38;5;241m80\u001B[39m], attempted_category\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m5\u001B[39m)\n",
262 | "Input \u001B[0;32mIn [2]\u001B[0m, in \u001B[0;36mread_csvs_from_path_and_reformat\u001B[0;34m(path)\u001B[0m\n\u001B[1;32m 29\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mread_csvs_from_path_and_reformat\u001B[39m(path):\n\u001B[0;32m---> 30\u001B[0m df\u001B[38;5;241m=\u001B[39m \u001B[43mpd\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mread_csv\u001B[49m\u001B[43m(\u001B[49m\u001B[43mpath\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mencoding\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mcp1252\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[1;32m 32\u001B[0m df \u001B[38;5;241m=\u001B[39m format_csv_for_labelling(df)\n\u001B[1;32m 34\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m df\n",
263 | "File \u001B[0;32m~/anaconda3/envs/pytorch/lib/python3.8/site-packages/pandas/util/_decorators.py:311\u001B[0m, in \u001B[0;36mdeprecate_nonkeyword_arguments..decorate..wrapper\u001B[0;34m(*args, **kwargs)\u001B[0m\n\u001B[1;32m 305\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(args) \u001B[38;5;241m>\u001B[39m num_allow_args:\n\u001B[1;32m 306\u001B[0m warnings\u001B[38;5;241m.\u001B[39mwarn(\n\u001B[1;32m 307\u001B[0m msg\u001B[38;5;241m.\u001B[39mformat(arguments\u001B[38;5;241m=\u001B[39marguments),\n\u001B[1;32m 308\u001B[0m \u001B[38;5;167;01mFutureWarning\u001B[39;00m,\n\u001B[1;32m 309\u001B[0m stacklevel\u001B[38;5;241m=\u001B[39mstacklevel,\n\u001B[1;32m 310\u001B[0m )\n\u001B[0;32m--> 311\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mfunc\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n",
264 | "File \u001B[0;32m~/anaconda3/envs/pytorch/lib/python3.8/site-packages/pandas/io/parsers/readers.py:586\u001B[0m, in \u001B[0;36mread_csv\u001B[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001B[0m\n\u001B[1;32m 571\u001B[0m kwds_defaults \u001B[38;5;241m=\u001B[39m _refine_defaults_read(\n\u001B[1;32m 572\u001B[0m dialect,\n\u001B[1;32m 573\u001B[0m delimiter,\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 582\u001B[0m defaults\u001B[38;5;241m=\u001B[39m{\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mdelimiter\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m,\u001B[39m\u001B[38;5;124m\"\u001B[39m},\n\u001B[1;32m 583\u001B[0m )\n\u001B[1;32m 584\u001B[0m kwds\u001B[38;5;241m.\u001B[39mupdate(kwds_defaults)\n\u001B[0;32m--> 586\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43m_read\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfilepath_or_buffer\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mkwds\u001B[49m\u001B[43m)\u001B[49m\n",
265 | "File \u001B[0;32m~/anaconda3/envs/pytorch/lib/python3.8/site-packages/pandas/io/parsers/readers.py:482\u001B[0m, in \u001B[0;36m_read\u001B[0;34m(filepath_or_buffer, kwds)\u001B[0m\n\u001B[1;32m 479\u001B[0m _validate_names(kwds\u001B[38;5;241m.\u001B[39mget(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mnames\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;28;01mNone\u001B[39;00m))\n\u001B[1;32m 481\u001B[0m \u001B[38;5;66;03m# Create the parser.\u001B[39;00m\n\u001B[0;32m--> 482\u001B[0m parser \u001B[38;5;241m=\u001B[39m \u001B[43mTextFileReader\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfilepath_or_buffer\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwds\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 484\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m chunksize \u001B[38;5;129;01mor\u001B[39;00m iterator:\n\u001B[1;32m 485\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m parser\n",
266 | "File \u001B[0;32m~/anaconda3/envs/pytorch/lib/python3.8/site-packages/pandas/io/parsers/readers.py:811\u001B[0m, in \u001B[0;36mTextFileReader.__init__\u001B[0;34m(self, f, engine, **kwds)\u001B[0m\n\u001B[1;32m 808\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mhas_index_names\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;129;01min\u001B[39;00m kwds:\n\u001B[1;32m 809\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39moptions[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mhas_index_names\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m=\u001B[39m kwds[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mhas_index_names\u001B[39m\u001B[38;5;124m\"\u001B[39m]\n\u001B[0;32m--> 811\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_engine \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_make_engine\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mengine\u001B[49m\u001B[43m)\u001B[49m\n",
267 | "File \u001B[0;32m~/anaconda3/envs/pytorch/lib/python3.8/site-packages/pandas/io/parsers/readers.py:1040\u001B[0m, in \u001B[0;36mTextFileReader._make_engine\u001B[0;34m(self, engine)\u001B[0m\n\u001B[1;32m 1036\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[1;32m 1037\u001B[0m \u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mUnknown engine: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mengine\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m (valid options are \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mmapping\u001B[38;5;241m.\u001B[39mkeys()\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m)\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m 1038\u001B[0m )\n\u001B[1;32m 1039\u001B[0m \u001B[38;5;66;03m# error: Too many arguments for \"ParserBase\"\u001B[39;00m\n\u001B[0;32m-> 1040\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mmapping\u001B[49m\u001B[43m[\u001B[49m\u001B[43mengine\u001B[49m\u001B[43m]\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mf\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43moptions\u001B[49m\u001B[43m)\u001B[49m\n",
268 | "File \u001B[0;32m~/anaconda3/envs/pytorch/lib/python3.8/site-packages/pandas/io/parsers/c_parser_wrapper.py:51\u001B[0m, in \u001B[0;36mCParserWrapper.__init__\u001B[0;34m(self, src, **kwds)\u001B[0m\n\u001B[1;32m 48\u001B[0m kwds[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124musecols\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39musecols\n\u001B[1;32m 50\u001B[0m \u001B[38;5;66;03m# open handles\u001B[39;00m\n\u001B[0;32m---> 51\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_open_handles\u001B[49m\u001B[43m(\u001B[49m\u001B[43msrc\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mkwds\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 52\u001B[0m \u001B[38;5;28;01massert\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mhandles \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[1;32m 54\u001B[0m \u001B[38;5;66;03m# Have to pass int, would break tests using TextReader directly otherwise :(\u001B[39;00m\n",
269 | "File \u001B[0;32m~/anaconda3/envs/pytorch/lib/python3.8/site-packages/pandas/io/parsers/base_parser.py:222\u001B[0m, in \u001B[0;36mParserBase._open_handles\u001B[0;34m(self, src, kwds)\u001B[0m\n\u001B[1;32m 218\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m_open_handles\u001B[39m(\u001B[38;5;28mself\u001B[39m, src: FilePathOrBuffer, kwds: \u001B[38;5;28mdict\u001B[39m[\u001B[38;5;28mstr\u001B[39m, Any]) \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m>\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[1;32m 219\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[1;32m 220\u001B[0m \u001B[38;5;124;03m Let the readers open IOHandles after they are done with their potential raises.\u001B[39;00m\n\u001B[1;32m 221\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[0;32m--> 222\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mhandles \u001B[38;5;241m=\u001B[39m \u001B[43mget_handle\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m 223\u001B[0m \u001B[43m \u001B[49m\u001B[43msrc\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 224\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mr\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[1;32m 225\u001B[0m \u001B[43m \u001B[49m\u001B[43mencoding\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mkwds\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mencoding\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mNone\u001B[39;49;00m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 226\u001B[0m \u001B[43m 
\u001B[49m\u001B[43mcompression\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mkwds\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mcompression\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mNone\u001B[39;49;00m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 227\u001B[0m \u001B[43m \u001B[49m\u001B[43mmemory_map\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mkwds\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mmemory_map\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mFalse\u001B[39;49;00m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 228\u001B[0m \u001B[43m \u001B[49m\u001B[43mstorage_options\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mkwds\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mstorage_options\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mNone\u001B[39;49;00m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 229\u001B[0m \u001B[43m \u001B[49m\u001B[43merrors\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mkwds\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mget\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mencoding_errors\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mstrict\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 230\u001B[0m \u001B[43m \u001B[49m\u001B[43m)\u001B[49m\n",
270 | "File \u001B[0;32m~/anaconda3/envs/pytorch/lib/python3.8/site-packages/pandas/io/common.py:701\u001B[0m, in \u001B[0;36mget_handle\u001B[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001B[0m\n\u001B[1;32m 696\u001B[0m \u001B[38;5;28;01melif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(handle, \u001B[38;5;28mstr\u001B[39m):\n\u001B[1;32m 697\u001B[0m \u001B[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001B[39;00m\n\u001B[1;32m 698\u001B[0m \u001B[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001B[39;00m\n\u001B[1;32m 699\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m ioargs\u001B[38;5;241m.\u001B[39mencoding \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mb\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m ioargs\u001B[38;5;241m.\u001B[39mmode:\n\u001B[1;32m 700\u001B[0m \u001B[38;5;66;03m# Encoding\u001B[39;00m\n\u001B[0;32m--> 701\u001B[0m handle \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mopen\u001B[39;49m\u001B[43m(\u001B[49m\n\u001B[1;32m 702\u001B[0m \u001B[43m \u001B[49m\u001B[43mhandle\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 703\u001B[0m \u001B[43m \u001B[49m\u001B[43mioargs\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mmode\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 704\u001B[0m \u001B[43m \u001B[49m\u001B[43mencoding\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mioargs\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mencoding\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 705\u001B[0m \u001B[43m \u001B[49m\u001B[43merrors\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43merrors\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 706\u001B[0m \u001B[43m \u001B[49m\u001B[43mnewline\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\n\u001B[1;32m 707\u001B[0m \u001B[43m 
\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 708\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m 709\u001B[0m \u001B[38;5;66;03m# Binary mode\u001B[39;00m\n\u001B[1;32m 710\u001B[0m handle \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mopen\u001B[39m(handle, ioargs\u001B[38;5;241m.\u001B[39mmode)\n",
271 | "\u001B[0;31mFileNotFoundError\u001B[0m: [Errno 2] No such file or directory: '/media/farodin/AEAA59A1AA59673D/CICIDS2017/CSV_newest_CICFlowMeter_20220728/Unlabelled/wednesday/Wednesday-workingHours.pcap_ISCX.csv'"
272 | ]
273 | }
274 | ],
275 | "source": [
276 | "#----------------------+\n",
277 | "# WEDNESDAY 05-07-2017 |\n",
278 | "#----------------------+\n",
279 | "\n",
280 | "wednesday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"wednesday/Wednesday-workingHours.pcap_ISCX.csv\")\n",
281 | "\n",
282 | "# DoS Slowloris\n",
283 | "# -------------\n",
284 | "\n",
285 | "# Accidental early launch of the tool with wrong parameters\n",
286 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258926211817000, 1499258927000000000, [\"172.16.0.1\"],\n",
287 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=5)\n",
288 | "\n",
289 | "label_flows(wednesday_df, \"DoS Slowloris\", 1499258934539220000, 1499260278500956000, [\"172.16.0.1\"],\n",
290 | " [\"192.168.10.50\"], dst_port_list=[80], additional_filters=[\n",
291 | " ~(wednesday_df[\"Source Port\"].isin([33358, 33360, 33362, 54114]))\n",
292 | " ])\n",
293 | "\n",
294 | "label_flows(wednesday_df, \"DoS Slowloris\", 1499258934539220000, 1499260278500956000,\n",
295 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], additional_filters=[\n",
296 | " ~(wednesday_df[\"Destination Port\"].isin([33358, 33360, 33362, 54114]))\n",
297 | " ])\n",
298 | "\n",
299 | "# port 33358, 33360 and 33362 contain attack teardown flows\n",
300 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258934539220000, 1499260278500956000, [\"172.16.0.1\"],\n",
301 | " [\"192.168.10.50\"], src_port_list=[33358, 33360, 33362], dst_port_list=[80], attempted_category=2)\n",
302 | "\n",
303 | "#Payload filter. Order is important: this part needs to come before Attempted category 6. The flipped direction is a separate call (instead of also_flip_flow_direction=True) because the additional filters are direction-specific.\n",
304 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258934539220000, 1499260278500956000, [\"172.16.0.1\"],\n",
305 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=[\n",
306 | " ~(wednesday_df[\"Source Port\"].isin([33358, 33360, 33362, 54114]))])\n",
307 | "\n",
308 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258934539220000, 1499260278500956000,\n",
309 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=0, additional_filters=[\n",
310 | " ~(wednesday_df[\"Destination Port\"].isin([33358, 33360, 33362, 54114])) & (wednesday_df[\"Total Length of Bwd Packets\"] == 0)\n",
311 | " ])\n",
312 | "\n",
313 | "#Target unresponsive because of DoS, no payloads in these flows\n",
314 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258934539220000, 1499260278500956000,\n",
315 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=6, additional_filters=[\n",
316 | " ~(wednesday_df[\"Destination Port\"].isin([33358, 33360, 33362, 54114])) & (wednesday_df[\"Total Length of Bwd Packets\"] == 0)\n",
317 | " & (wednesday_df[\"Flow Duration\"] >= 199800)\n",
318 | " ])\n",
319 | "\n",
320 | "# Artefact likely from authors checking the webserver\n",
321 | "label_flows(wednesday_df, \"DoS Slowloris - Attempted\", 1499258934539220000, 1499260278500956000, [\"172.16.0.1\"],\n",
322 | " [\"192.168.10.50\"], src_port_list=[54114], dst_port_list=[80], attempted_category=4)\n",
323 | "\n",
324 | "# DoS Slowhttptest\n",
325 | "# ----------------\n",
326 | "\n",
327 | "label_flows(wednesday_df, \"DoS Slowhttptest\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n",
328 | " [\"192.168.10.50\"], dst_port_list=[80], additional_filters=[\n",
329 | " ~(wednesday_df[\"Source Port\"].isin([33372]))\n",
330 | " ]\n",
331 | " )\n",
332 | "\n",
333 | "label_flows(wednesday_df, \"DoS Slowhttptest\", 1499260537936810000, 1499261869331517000,\n",
334 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], additional_filters=[\n",
335 | " ~(wednesday_df[\"Destination Port\"].isin([33372]))\n",
336 | " ]\n",
337 | " )\n",
338 | "\n",
339 | "# Attack startup artefact\n",
340 | "label_flows(wednesday_df, \"DoS Slowhttptest - Attempted\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n",
341 | " [\"192.168.10.50\"], src_port_list=[33372], dst_port_list=[80], attempted_category=2)\n",
342 | "\n",
343 | "#Payload filter. Order is important: this part needs to come before Attempted category 6. The flipped direction is a separate call (instead of also_flip_flow_direction=True) because the additional filters are direction-specific.\n",
344 | "label_flows(wednesday_df, \"DoS Slowhttptest - Attempted\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n",
345 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=[\n",
346 | " ~(wednesday_df[\"Source Port\"].isin([33372, 37670]))])\n",
347 | "\n",
348 | "label_flows(wednesday_df, \"DoS Slowhttptest - Attempted\", 1499260537936810000, 1499261869331517000,\n",
349 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=0, additional_filters=[\n",
350 | " ~(wednesday_df[\"Destination Port\"].isin([33372, 37670])) & (wednesday_df[\"Total Length of Bwd Packets\"] == 0)\n",
351 | " ]\n",
352 | " )\n",
353 | "\n",
354 | "# Retransmissions because target web server is brought down (No need to flip direction, I double-checked)\n",
355 | "label_flows(wednesday_df, \"DoS Slowhttptest - Attempted\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n",
356 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=6, additional_filters=[\n",
357 | " ~(wednesday_df[\"Source Port\"].isin([33372])) & (wednesday_df[\"Total Length of Fwd Packets\"] == 0) &\n",
358 | " (wednesday_df[\"Flow Duration\"] >= 199984) & (wednesday_df[\"Total Backward Packets\"] == 0)\n",
359 | " ]\n",
360 | " )\n",
361 | "\n",
362 | "# Artefact from authors likely checking the webserver\n",
363 | "label_flows(wednesday_df, \"DoS Slowhttptest - Attempted\", 1499260537936810000, 1499261869331517000, [\"172.16.0.1\"],\n",
364 | " [\"192.168.10.50\"], src_port_list=[37670], dst_port_list=[80], attempted_category=4)\n",
365 | "\n",
366 | "\n",
367 | "# DoS Hulk\n",
368 | "# --------\n",
369 | "\n",
370 | "# Note that ports 48678 and 43664 have a benign flow launched by attacker IP while attack is already ongoing,\n",
371 | "# containing benign HTTP request. This will be labelled as Attack artefact\n",
372 | "label_flows(wednesday_df, \"DoS Hulk\", 1499262203194704000, 1499262299999999999, [\"172.16.0.1\"],\n",
373 | " [\"192.168.10.50\"], dst_port_list=[80], additional_filters=[\n",
374 | " ~(wednesday_df[\"Source Port\"].isin([48678 , 43664]))\n",
375 | " ])\n",
376 | "\n",
377 | "label_flows(wednesday_df, \"DoS Hulk\", 1499262203194704000, 1499262299999999999,\n",
378 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], additional_filters=[\n",
379 | " ~(wednesday_df[\"Destination Port\"].isin([48678 , 43664]))\n",
380 | " ])\n",
381 | "\n",
382 | "label_flows(wednesday_df, \"DoS Hulk\", 1499262300000000000, 1499263641326171000, [\"172.16.0.1\"],\n",
383 | " [\"192.168.10.50\"], dst_port_list=[80], also_flip_flow_direction=True)\n",
384 | "\n",
385 | "#Attack artefact - likely authors checking webserver mid-attack.\n",
386 | "label_flows(wednesday_df, \"DoS Hulk - Attempted\", 1499262203194704000, 1499262299999999999, [\"172.16.0.1\"],\n",
387 | " [\"192.168.10.50\"], src_port_list=[48678 , 43664], dst_port_list=[80], attempted_category=4)\n",
388 | "\n",
389 | "#Payload filter. The flipped direction is a separate call (instead of also_flip_flow_direction=True) because the additional filters are direction-specific.\n",
390 | "label_flows(wednesday_df, \"DoS Hulk - Attempted\", 1499262203194704000, 1499263641326171000, [\"172.16.0.1\"],\n",
391 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=[\n",
392 | " ~(wednesday_df[\"Source Port\"].isin([48678 , 43664]))])\n",
393 | "\n",
394 | "label_flows(wednesday_df, \"DoS Hulk - Attempted\", 1499262203194704000, 1499263641326171000,\n",
395 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=0, additional_filters=[\n",
396 | " ~(wednesday_df[\"Destination Port\"].isin([48678 , 43664])) & (wednesday_df[\"Total Length of Bwd Packets\"] == 0)\n",
397 | " ])\n",
398 | "\n",
399 | "# Artefacts caused by either attack tool or non-empty TCP appendices. Reasoning is that 282 is minimum size of malicious payload\n",
400 | "label_flows(wednesday_df, \"DoS Hulk - Attempted\", 1499262203194704000, 1499263641326171000, [\"172.16.0.1\"],\n",
401 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=3, additional_filters=[\n",
402 | " ~(wednesday_df[\"Source Port\"].isin([48678 , 43664])) & (wednesday_df[\"Total Length of Fwd Packets\"] > 0)\n",
403 | " & (wednesday_df[\"Total Length of Fwd Packets\"] < 282)\n",
404 | " ])\n",
405 | "\n",
406 | "label_flows(wednesday_df, \"DoS Hulk - Attempted\", 1499262203194704000, 1499263641326171000,\n",
407 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=3, additional_filters=[\n",
408 | " ~(wednesday_df[\"Destination Port\"].isin([48678 , 43664])) & (wednesday_df[\"Total Length of Bwd Packets\"] > 0)\n",
409 | " & (wednesday_df[\"Total Length of Bwd Packets\"] <282)\n",
410 | " ])\n",
411 | "\n",
412 | "# DoS GoldenEye\n",
413 | "# -------------\n",
414 | "\n",
415 | "label_flows(wednesday_df, \"DoS GoldenEye\", 1499263803231753000, 1499264408915718000, [\"172.16.0.1\"],\n",
416 | " [\"192.168.10.50\"], dst_port_list=[80], also_flip_flow_direction=True)\n",
417 | "\n",
418 | "#Payload filter\n",
419 | "label_flows(wednesday_df, \"DoS GoldenEye - Attempted\", 1499263803231753000, 1499264408915718000, [\"172.16.0.1\"],\n",
420 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
421 | "\n",
422 | "# Heartbleed\n",
423 | "# ----------\n",
424 | "\n",
425 | "label_flows(wednesday_df, \"Heartbleed\", 1499278335650811000, 1499279563294455000, [\"172.16.0.1\"],\n",
426 | " [\"192.168.10.51\"], dst_port_list=[444], src_port_list=[45022], also_flip_flow_direction=True)\n",
427 | "\n",
428 | "#Payload filter\n",
429 | "label_flows(wednesday_df, \"Heartbleed - Attempted\", 1499278335650811000, 1499279563294455000, [\"172.16.0.1\"],\n",
430 | " [\"192.168.10.51\"], dst_port_list=[444], src_port_list=[45022], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
431 | "\n",
432 | "label_rest_as_benign_and_write_csv(wednesday_df, OUTPUT_PATH + \"Wednesday-workingHours.pcap_ISCX.csv\")\n",
433 | "\n",
434 | "wednesday_df = None\n"
435 | ],
436 | "metadata": {
437 | "collapsed": false
438 | }
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": 5,
443 | "outputs": [
444 | {
445 | "name": "stderr",
446 | "output_type": "stream",
447 | "text": [
448 | "/tmp/ipykernel_52950/4104559264.py:5: DtypeWarning: Columns (0,1,3,6,84) have mixed types.Specify dtype option on import or set low_memory=False.\n",
449 | " thursday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"thursday/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv\")\n"
450 | ]
451 | },
452 | {
453 | "name": "stdout",
454 | "output_type": "stream",
455 | "text": [
456 | "labels before pre-processing: BENIGN 168186\n",
457 | "Web Attack – Brute Force 1507\n",
458 | "Web Attack – XSS 652\n",
459 | "Web Attack – Sql Injection 21\n",
460 | "Name: Label, dtype: int64\n",
461 | "labels after pre-processing: NeedManualLabel 458968\n",
462 | "Name: Label, dtype: int64\n",
463 | "label count after labelling:\r\n",
464 | " BENIGN 455536\n",
465 | "Web Attack - Brute Force - Attempted 2660\n",
466 | "Web Attack - XSS - Attempted 616\n",
467 | "Web Attack - Brute Force 74\n",
468 | "Web Attack - SQL Injection - Attempted 39\n",
469 | "Web Attack - SQL Injection 25\n",
470 | "Web Attack - XSS 18\n",
471 | "Name: Label, dtype: int64\n",
472 | "Attempted Category count after labelling:\r\n",
473 | " -1 455653\n",
474 | " 0 3222\n",
475 | " 4 71\n",
476 | " 2 22\n",
477 | "Name: Attempted Category, dtype: int64\n",
478 | "labels before pre-processing: BENIGN 288566\n",
479 | "Infiltration 36\n",
480 | "Name: Label, dtype: int64\n",
481 | "labels after pre-processing: NeedManualLabel 288602\n",
482 | "Name: Label, dtype: int64\n",
483 | "label count after labelling:\r\n",
484 | " BENIGN 227426\n",
485 | "Infiltration - Portscan 61106\n",
486 | "Infiltration 39\n",
487 | "Infiltration - Attempted 31\n",
488 | "Name: Label, dtype: int64\n",
489 | "Attempted Category count after labelling:\r\n",
490 | " -1 288571\n",
491 | " 0 28\n",
492 | " 2 3\n",
493 | "Name: Attempted Category, dtype: int64\n"
494 | ]
495 | }
496 | ],
497 | "source": [
498 | "#---------------------+\n",
499 | "# THURSDAY 06-07-2017 |\n",
500 | "#---------------------+\n",
501 | "\n",
502 | "thursday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"thursday/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv\")\n",
503 | "\n",
504 | "# Web Attack - Brute Force\n",
505 | "# ------------------------\n",
506 | "\n",
507 | "label_flows(thursday_df, \"Web Attack - Brute Force - Attempted\", 1499343354880049000, 1499343531179279000,\n",
508 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], attempted_category=2, also_flip_flow_direction=True)\n",
509 | "\n",
510 | "label_flows(thursday_df, \"Web Attack - Brute Force\", 1499343567660566000, 1499346011622209000,\n",
511 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], additional_filters=\n",
512 | " [\n",
513 | " (thursday_df[\"Total Fwd Packets\"] > 20) | (thursday_df[\"Source Port\"] == 44464)\n",
514 | " ])\n",
515 | "#Flip\n",
516 | "label_flows(thursday_df, \"Web Attack - Brute Force\", 1499343567660566000, 1499346011622209000,\n",
517 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], additional_filters=\n",
518 | " [\n",
519 | " (thursday_df[\"Total Backward Packets\"] > 20) | (thursday_df[\"Destination Port\"] == 44464)\n",
520 | " ])\n",
521 | "\n",
522 | "#Payload filter (can't use also_flip_flow_direction because the additional_filters are direction-specific)\n",
523 | "label_flows(thursday_df, \"Web Attack - Brute Force - Attempted\", 1499343567660566000, 1499346011622209000,\n",
524 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], payload_filter=True, attempted_category=0,\n",
525 | " additional_filters=\n",
526 | " [~((thursday_df[\"Total Fwd Packets\"] > 20) | (thursday_df[\"Source Port\"] == 44464))])\n",
527 | "\n",
528 | "label_flows(thursday_df, \"Web Attack - Brute Force - Attempted\", 1499343567660566000, 1499346011622209000,\n",
529 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=0,\n",
530 | " additional_filters=\n",
531 | " [\n",
532 | " ~((thursday_df[\"Total Backward Packets\"] > 20) | (thursday_df[\"Destination Port\"] == 44464))\n",
533 | " & (thursday_df[\"Total Length of Bwd Packets\"] == 0)\n",
534 | " ])\n",
535 | "\n",
536 | "label_flows(thursday_df, \"Web Attack - Brute Force - Attempted\", 1499343567660566000, 1499346011622209000,\n",
537 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], attempted_category=4,\n",
538 | " additional_filters=\n",
539 | " [\n",
540 | " (thursday_df[\"Total Length of Fwd Packets\"] > 0) & ~(thursday_df[\"Source Port\"] == 44464) &\n",
541 | " (thursday_df[\"Total Fwd Packets\"] == 4) & (thursday_df[\"Total Backward Packets\"] == 4)\n",
542 | " ])\n",
543 | "\n",
544 | "label_flows(thursday_df, \"Web Attack - Brute Force - Attempted\", 1499343567660566000, 1499346011622209000,\n",
545 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=4,\n",
546 | " additional_filters=\n",
547 | " [\n",
548 | " (thursday_df[\"Total Length of Bwd Packets\"] > 0) & ~(thursday_df[\"Destination Port\"] == 44464) &\n",
549 | " (thursday_df[\"Total Backward Packets\"] == 4) & (thursday_df[\"Total Fwd Packets\"] == 4)\n",
550 | " ])\n",
551 | "\n",
552 | "# Web Attack - XSS\n",
553 | "# ----------------\n",
554 | "\n",
555 | "label_flows(thursday_df, \"Web Attack - XSS\", 1499346935283859000, 1499348121341704000, [\"172.16.0.1\"],\n",
556 | " [\"192.168.10.50\"], dst_port_list=[80], additional_filters=\n",
557 | " [\n",
558 | " ~(thursday_df[\"Source Port\"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &\n",
559 | " (thursday_df[\"Total Fwd Packets\"] >= 150)\n",
560 | " ])\n",
561 | "#Flip\n",
562 | "label_flows(thursday_df, \"Web Attack - XSS\", 1499346935283859000, 1499348121341704000,\n",
563 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], additional_filters=\n",
564 | " [\n",
565 | " ~(thursday_df[\"Destination Port\"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &\n",
566 | " (thursday_df[\"Total Backward Packets\"] >= 150)\n",
567 | " ])\n",
568 | "\n",
569 | "label_flows(thursday_df, \"Web Attack - XSS - Attempted\", 1499346935283859000, 1499348121341704000, [\"172.16.0.1\"],\n",
570 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=\n",
571 | " [\n",
572 | " ~(thursday_df[\"Source Port\"].isin([36180, 36182, 36184, 36186, 36188, 36190]))])\n",
573 | "#Flip: the web server (192.168.10.50, source port 80) is the flow source here, so the attacker's ports are matched on Destination Port and the payload check uses the Bwd length\n",
574 | "label_flows(thursday_df, \"Web Attack - XSS - Attempted\", 1499346935283859000, 1499348121341704000,\n",
575 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=0, additional_filters=\n",
576 | " [\n",
577 | " ~(thursday_df[\"Destination Port\"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &\n",
578 | " (thursday_df[\"Total Length of Bwd Packets\"] == 0)\n",
579 | " ])\n",
580 | "\n",
581 | "label_flows(thursday_df, \"Web Attack - XSS - Attempted\", 1499346935283859000, 1499348121341704000, [\"172.16.0.1\"],\n",
582 | " [\"192.168.10.50\"], dst_port_list=[80], attempted_category=2, additional_filters=\n",
583 | " [\n",
584 | " ~(thursday_df[\"Source Port\"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &\n",
585 | " (thursday_df[\"Total Length of Fwd Packets\"] > 0) & (thursday_df[\"Total Fwd Packets\"] < 150)\n",
586 | " ])\n",
587 | "\n",
588 | "#Flip\n",
589 | "label_flows(thursday_df, \"Web Attack - XSS - Attempted\", 1499346935283859000, 1499348121341704000,\n",
590 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=2, additional_filters=\n",
591 | " [\n",
592 | " ~(thursday_df[\"Destination Port\"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &\n",
593 | " (thursday_df[\"Total Length of Bwd Packets\"] > 0) & (thursday_df[\"Total Backward Packets\"] < 150)\n",
594 | " ])\n",
595 | "\n",
596 | "# Web Attack - SQL Injection\n",
597 | "# --------------------------\n",
598 | "\n",
599 | "label_flows(thursday_df, \"Web Attack - SQL Injection - Attempted\", 1499348127852814000, 1499348145720612000,\n",
600 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], attempted_category=2,\n",
601 | " additional_filters=[\n",
602 | " thursday_df[\"Source Port\"].isin([36180, 36182, 36184, 36186, 36188])\n",
603 | " ])\n",
604 | "\n",
605 | "#Flip\n",
606 | "label_flows(thursday_df, \"Web Attack - SQL Injection - Attempted\", 1499348127852814000, 1499348145720612000,\n",
607 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80], attempted_category=2,\n",
608 | " additional_filters=[\n",
609 | " thursday_df[\"Destination Port\"].isin([36180, 36182, 36184, 36186, 36188])\n",
610 | " ])\n",
611 | "\n",
612 | "label_flows(thursday_df, \"Web Attack - SQL Injection\", 1499348145732950000, 1499348575320284000,\n",
613 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80],\n",
614 | " additional_filters=[\n",
615 | " ~(thursday_df[\"Source Port\"].isin([36180, 36182, 36184, 36186, 36188]))\n",
616 | " ])\n",
617 | "\n",
618 | "#Flip\n",
619 | "label_flows(thursday_df, \"Web Attack - SQL Injection\", 1499348145732950000, 1499348575320284000,\n",
620 | " [\"192.168.10.50\"], [\"172.16.0.1\"], src_port_list=[80],\n",
621 | " additional_filters=[\n",
622 | " ~(thursday_df[\"Destination Port\"].isin([36180, 36182, 36184, 36186, 36188]))\n",
623 | " ])\n",
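624 | "#Payload filter. NOTE(review): this call uses the attack-startup window 1499348127852814000-1499348145720612000 (the Attempted category 2 window), not the SQL Injection window 1499348145732950000-1499348575320284000 - confirm this is intended\n",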
625 | "label_flows(thursday_df, \"Web Attack - SQL Injection - Attempted\", 1499348127852814000, 1499348145720612000,\n",
626 | " [\"172.16.0.1\"], [\"192.168.10.50\"], dst_port_list=[80], attempted_category=0,\n",
627 | " payload_filter=True, also_flip_flow_direction=True)\n",
628 | "\n",
629 | "\n",
630 | "label_rest_as_benign_and_write_csv(thursday_df,\n",
631 | " OUTPUT_PATH + \"Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv\")\n",
632 | "\n",
633 | "# Infiltration\n",
634 | "# 5.1 Dropbox Download\n",
635 | "# ------------\n",
636 | "thursday_df = read_csvs_from_path_and_reformat(DATASET_PATH +\n",
637 | " \"thursday/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv\")\n",
638 | "\n",
639 | "label_flows(thursday_df, \"Infiltration\", 1499361542547210000, 1499366769364731000, [\"192.168.10.8\"], [\"205.174.165.73\"],\n",
640 | " also_flip_flow_direction=True)\n",
641 | "\n",
642 | "#Payload filter\n",
643 | "label_flows(thursday_df, \"Infiltration - Attempted\", 1499361542547210000, 1499366769364731000, [\"192.168.10.8\"],\n",
644 | " [\"205.174.165.73\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
645 | "\n",
646 | "\n",
647 | "label_flows(thursday_df, \"Infiltration - Attempted\", 1499361228830533000, 1499361301251276000 , [\"192.168.10.9\"],\n",
648 | " [\"205.174.165.73\"], attempted_category=2, also_flip_flow_direction=True)\n",
649 | "\n",
650 | "# 5.2 Cooldisk Mac\n",
651 | "\n",
652 | "label_flows(thursday_df, \"Infiltration\", 1499363616453990000, 1499371339347892000, [\"192.168.10.25\"], [\"205.174.165.73\"],\n",
653 | " also_flip_flow_direction=True)\n",
654 | "\n",
655 | "#Payload filter\n",
656 | "label_flows(thursday_df, \"Infiltration - Attempted\", 1499363616453990000, 1499371339347892000, [\"192.168.10.25\"],\n",
657 | " [\"205.174.165.73\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
658 | "\n",
659 | "\n",
660 | "# 5.3 NMAP + Portscan\n",
661 | "\n",
662 | "# Round 1\n",
663 | "\n",
664 | "label_flows(thursday_df, \"Infiltration - Portscan\", 1499360400000000000, 1499360460000000000, [\"172.16.0.1\"],\n",
665 | " [\"192.168.10.51\"], additional_filters=[\n",
666 | " (thursday_df[\"Source Port\"] == 50122)\n",
667 | " ])\n",
668 | "\n",
669 | "# Round 2\n",
670 | "\n",
671 | "label_flows(thursday_df, \"Infiltration - Portscan\", 1499362410884008000, 1499362444285175000, [\"192.168.10.8\"],\n",
672 | " [\"192.168.10.5\"])\n",
673 | "\n",
674 | "# Round 3\n",
675 | "\n",
676 | "label_flows(thursday_df, \"Infiltration - Portscan\", 1499364314425162000, 1499366764331875000, [\"192.168.10.8\"],\n",
677 | " [\"192.168.10.5\", \"192.168.10.9\", \"192.168.10.12\", \"192.168.10.14\", \"192.168.10.15\", \"192.168.10.16\",\n",
678 | " \"192.168.10.17\", \"192.168.10.19\", \"192.168.10.25\", \"192.168.10.50\", \"192.168.10.51\"], additional_filters= [\n",
679 | " ~((thursday_df[\"Fwd Packet Length Max\"] == 408) & (thursday_df[\"Destination IP\"] == \"192.168.10.50\")) &\n",
680 | " ~((thursday_df[\"Total Length of Fwd Packets\"].isin([176, 20514])) & (thursday_df[\"Destination IP\"] == \"192.168.10.50\"))\n",
681 | " ]\n",
682 | ")\n",
683 | "\n",
684 | "label_rest_as_benign_and_write_csv(thursday_df,\n",
685 | " OUTPUT_PATH + \"Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv\")\n",
686 | "\n",
687 | "thursday_df = None"
688 | ],
689 | "metadata": {
690 | "collapsed": false
691 | }
692 | },
693 | {
694 | "cell_type": "code",
695 | "execution_count": 6,
696 | "outputs": [
697 | {
698 | "name": "stdout",
699 | "output_type": "stream",
700 | "text": [
701 | "labels before pre-processing: PortScan 158930\n",
702 | "BENIGN 127537\n",
703 | "Name: Label, dtype: int64\n",
704 | "labels after pre-processing: NeedManualLabel 286467\n",
705 | "Name: Label, dtype: int64\n",
706 | "label count after labelling:\r\n",
707 | " Portscan 158939\n",
708 | "BENIGN 126905\n",
709 | "Botnet - Attempted 623\n",
710 | "Name: Label, dtype: int64\n",
711 | "Attempted Category count after labelling:\r\n",
712 | " -1 285844\n",
713 | " 1 623\n",
714 | "Name: Attempted Category, dtype: int64\n",
715 | "labels before pre-processing: BENIGN 189067\n",
716 | "Bot 1966\n",
717 | "Name: Label, dtype: int64\n",
718 | "labels after pre-processing: NeedManualLabel 191033\n",
719 | "Name: Label, dtype: int64\n",
720 | "label count after labelling:\r\n",
721 | " BENIGN 189071\n",
722 | "Botnet 1472\n",
723 | "Botnet - Attempted 490\n",
724 | "Name: Label, dtype: int64\n",
725 | "Attempted Category count after labelling:\r\n",
726 | " -1 190543\n",
727 | " 1 490\n",
728 | "Name: Attempted Category, dtype: int64\n",
729 | "labels before pre-processing: DDoS 128027\n",
730 | "BENIGN 97718\n",
731 | "Name: Label, dtype: int64\n",
732 | "labels after pre-processing: NeedManualLabel 225745\n",
733 | "Name: Label, dtype: int64\n",
734 | "label count after labelling:\r\n",
735 | " DDoS 159366\n",
736 | "BENIGN 66028\n",
737 | "Botnet - Attempted 350\n",
738 | "DDoS - Attempted 1\n",
739 | "Name: Label, dtype: int64\n",
740 | "Attempted Category count after labelling:\r\n",
741 | " -1 225394\n",
742 | " 1 350\n",
743 | " 0 1\n",
744 | "Name: Attempted Category, dtype: int64\n"
745 | ]
746 | }
747 | ],
748 | "source": [
749 | "#-------------------+\n",
750 | "# FRIDAY 07-07-2017 |\n",
751 | "#-------------------+\n",
752 | "\n",
753 | "friday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"friday/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv\")\n",
754 | "\n",
755 | "# Portscan\n",
756 | "# --------\n",
757 | "\n",
758 | "#First round\n",
759 | "label_flows(friday_df, \"Portscan\", 1499446532117090000, 1499447948582083000, [\"172.16.0.1\"], [\"192.168.10.50\"],\n",
760 | " also_flip_flow_direction=True)\n",
761 | "\n",
762 | "\n",
763 | "#Second round\n",
764 | "label_flows(friday_df, \"Portscan\", 1499449860000000000, 1499449919000000000, [\"172.16.0.1\"], [\"192.168.10.50\"],\n",
765 | " additional_filters=[\n",
766 | " ~(friday_df[\"Source Port\"].isin([0, 35952, 35954, 35956, 35958]))\n",
767 | " ]\n",
768 | ")\n",
769 | "\n",
770 | "label_flows(friday_df, \"Portscan\", 1499449920000000000, 1499451841699238000, [\"172.16.0.1\"], [\"192.168.10.50\"])\n",
771 | "\n",
772 | "#Putting Bot labelling in here too because Bot occurs throughout the day\n",
773 | "label_flows(friday_df, \"Botnet\", 1499432653990571000, 1499436122903736000, [\"192.168.10.15\", \"192.168.10.9\",\n",
774 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], also_flip_flow_direction=True)\n",
775 | "\n",
776 | "#Payload filter\n",
777 | "label_flows(friday_df, \"Botnet - Attempted\", 1499432653990571000, 1499436122903736000, [\"192.168.10.15\", \"192.168.10.9\",\n",
778 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], attempted_category=0,\n",
779 | " payload_filter=True, also_flip_flow_direction=True)\n",
780 | "\n",
781 | "\n",
782 | "label_flows(friday_df, \"Botnet - Attempted\", 1499436180000000000, 1499457684606663000, [\"192.168.10.15\", \"192.168.10.9\",\n",
783 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], attempted_category=1, also_flip_flow_direction=True)\n",
784 | "\n",
785 | "label_rest_as_benign_and_write_csv(friday_df,\n",
786 | " OUTPUT_PATH + \"Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv\")\n",
787 | "\n",
788 | "# Botnet\n",
789 | "# ------\n",
790 | "\n",
791 | "friday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"friday/Friday-WorkingHours-Morning.pcap_ISCX.csv\")\n",
792 | "\n",
793 | "label_flows(friday_df, \"Botnet\", 1499432653990571000, 1499436122903736000, [\"192.168.10.15\", \"192.168.10.9\",\n",
794 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], also_flip_flow_direction=True)\n",
795 | "\n",
796 | "#Payload filter\n",
797 | "label_flows(friday_df, \"Botnet - Attempted\", 1499432653990571000, 1499436122903736000, [\"192.168.10.15\", \"192.168.10.9\",\n",
798 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], attempted_category=0,\n",
799 | " payload_filter=True, also_flip_flow_direction=True)\n",
800 | "\n",
801 | "label_flows(friday_df, \"Botnet - Attempted\", 1499436180000000000, 1499457684606663000, [\"192.168.10.15\", \"192.168.10.9\",\n",
802 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], attempted_category=1, also_flip_flow_direction=True)\n",
803 | "\n",
804 | "label_rest_as_benign_and_write_csv(friday_df,\n",
805 | " OUTPUT_PATH + \"Friday-WorkingHours-Morning.pcap_ISCX.csv\")\n",
806 | "\n",
807 | "# DDoS\n",
808 | "# ----\n",
809 | "\n",
810 | "friday_df = read_csvs_from_path_and_reformat(DATASET_PATH + \"friday/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv\")\n",
811 | "\n",
812 | "label_flows(friday_df, \"DDoS\", 1499453791796937000, 1499454972216560000, [\"172.16.0.1\"], [\"192.168.10.50\"],\n",
813 | " also_flip_flow_direction=True)\n",
814 | "\n",
815 | "# Payload filter\n",
816 | "label_flows(friday_df, \"DDoS - Attempted\", 1499453791796937000, 1499454972216560000, [\"172.16.0.1\"], [\"192.168.10.50\"],\n",
817 | " attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
818 | "\n",
819 | "label_flows(friday_df, \"DDoS - Attempted\", 1499453791796937000, 1499454972216560000, [\"192.168.10.50\"], [\"172.16.0.1\"],\n",
820 | " attempted_category=0, additional_filters=[\n",
821 | " (friday_df[\"Total Length of Bwd Packets\"] == 0)\n",
822 | " ])\n",
823 | "\n",
824 | "# Putting Bot labelling in here too because Bot occurs throughout the day\n",
825 | "label_flows(friday_df, \"Botnet\", 1499432653990571000, 1499436122903736000, [\"192.168.10.15\", \"192.168.10.9\",\n",
826 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], also_flip_flow_direction=True)\n",
827 | "\n",
828 | "#Payload filter\n",
829 | "label_flows(friday_df, \"Botnet - Attempted\", 1499432653990571000, 1499436122903736000, [\"192.168.10.15\", \"192.168.10.9\",\n",
830 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], attempted_category=0,\n",
831 | " payload_filter=True, also_flip_flow_direction=True)\n",
832 | "\n",
833 | "label_flows(friday_df, \"Botnet - Attempted\", 1499436180000000000, 1499457684606663000, [\"192.168.10.15\", \"192.168.10.9\",\n",
834 | " \"192.168.10.14\", \"192.168.10.5\", \"192.168.10.8\"], [\"205.174.165.73\"], attempted_category=1, also_flip_flow_direction=True)\n",
835 | "\n",
836 | "\n",
837 | "label_rest_as_benign_and_write_csv(friday_df, OUTPUT_PATH + \"Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv\")"
838 | ],
839 | "metadata": {
840 | "collapsed": false
841 | }
842 | },
843 | {
844 | "cell_type": "code",
845 | "execution_count": 19,
846 | "outputs": [],
847 | "source": [
848 | "\n",
849 | "\n",
850 | "\n",
851 | "\n"
852 | ],
853 | "metadata": {
854 | "collapsed": false
855 | }
856 | }
857 | ],
858 | "metadata": {
859 | "kernelspec": {
860 | "display_name": "Python 3",
861 | "language": "python",
862 | "name": "python3"
863 | },
864 | "language_info": {
865 | "codemirror_mode": {
866 | "name": "ipython",
867 | "version": 2
868 | },
869 | "file_extension": ".py",
870 | "mimetype": "text/x-python",
871 | "name": "python",
872 | "nbconvert_exporter": "python",
873 | "pygments_lexer": "ipython2",
874 | "version": "2.7.6"
875 | }
876 | },
877 | "nbformat": 4,
878 | "nbformat_minor": 0
879 | }
880 |
--------------------------------------------------------------------------------
/Labelling/CICIDS2018_labelling_fixed_CICFlowMeter.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "outputs": [],
7 | "source": [
8 | "import pandas as pd\n",
9 | "import numpy as np\n",
10 | "import glob\n",
11 | "import os\n",
12 | "from sys import platform\n",
13 | "\n",
14 | "# THIS LABELLING SCRIPT IS USED TO LABEL THE CORRECTED VERSION OF CSE-CIC-IDS-2018.\n",
15 | "# FOR DETAILS CONSULT OUR WEBSITE:\n",
16 | "# https://intrusion-detection.distrinet-research.be/CNS2022/index.html\n",
17 | "\n",
18 | "\n",
19 | "pd.set_option('display.max_rows', 100)\n",
20 | "\n",
21 | "# Enter the path that contains the CSV files that were generated by the CICFlowMeter tool. The directory structure should\n",
22 | "# be the following:\n",
23 | "# The dataset path should contain separate subdirectories for each day (e.g. \"Wednesday-14-02-2018\"). In each\n",
24 | "# of these directories, there should be a directory called \"csv\" which contains the CSV files as generated by the\n",
25 | "# CICFlowMeter tool.\n",
26 | "DATASET_PATH = \"\"\n",
27 | "\n",
28 | "# If set to true, a column is added at the front of the CSV with line numbers\n",
29 | "print_index = True"
30 | ],
31 | "metadata": {
32 | "collapsed": false
33 | }
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 2,
38 | "outputs": [],
39 | "source": [
40 | "# Basic preprocessing before getting started on labelling.\n",
41 | "# Replaces the string \"Infinity\" with NaN, converts \"Timestamp\" to Pandas Datetime, coerces all other non-identifier\n",
42 | "# columns to numeric values (unparseable entries become NaN) and returns the frame without any row containing a NaN.\n",
43 | "def format_csv_for_labelling(df):\n",
44 | "    text_columns = ('Flow ID', 'Timestamp', 'Src IP', 'Dst IP', 'Label')\n",
45 | "    cleaned = df.replace('Infinity', np.nan)\n",
46 | "    cleaned['Timestamp'] = pd.to_datetime(cleaned['Timestamp'])\n",
47 | "    for name in [col for col in cleaned.columns if col not in text_columns]:\n",
48 | "        cleaned[name] = pd.to_numeric(cleaned[name], errors='coerce')\n",
49 | "    return cleaned.dropna()\n",
50 | "\n",
51 | "# Reads all csvs of one day (every *.csv file directly inside 'path') and concatenates them into one dataframe.\n",
52 | "# The result is pre-processed with format_csv_for_labelling, gets an \"Attempted Category\" column initialised to -1,\n",
53 | "# and has its integer-valued columns cast to fixed-width integer dtypes before it is returned.\n",
54 | "def read_csvs_from_path_and_reformat(path):\n",
55 | "    # glob gives no ordering guarantee; sort so the concatenated row order (and the 'id' column derived from it when\n",
56 | "    # the CSV is written) is the same on every machine.\n",
57 | "    all_files = sorted(glob.glob(path + \"/*.csv\"))\n",
58 | "    if not all_files:\n",
59 | "        raise ValueError(\"No CSV files found in '\" + path + \"'; check DATASET_PATH and the directory layout\")\n",
60 | "    df = pd.concat([pd.read_csv(file) for file in all_files], ignore_index=True)\n",
61 | "\n",
62 | "    print(\"labels before pre-processing:\", df[\"Label\"].value_counts())\n",
63 | "    df = format_csv_for_labelling(df)\n",
64 | "    print(\"labels after pre-processing:\", df[\"Label\"].value_counts())\n",
65 | "\n",
66 | "    # Default category for every flow; label_flows overwrites it for the flows it labels.\n",
67 | "    df[\"Attempted Category\"] = -1\n",
68 | "\n",
69 | "    int64_columns = [\"Total TCP Flow Time\"]\n",
70 | "\n",
71 | "    int32_columns = [\"Src Port\", \"Dst Port\", \"Flow Duration\", \"Total Fwd Packet\", \"Total Bwd packets\", \"Total Length of Fwd Packet\", \"Total Length of Bwd Packet\", \"Fwd Packet Length Max\",\n",
72 | "                     \"Fwd Packet Length Min\", \"Bwd Packet Length Max\", \"Bwd Packet Length Min\", \"Flow IAT Max\", \"Flow IAT Min\", \"Fwd IAT Total\", \"Fwd IAT Max\", \"Fwd IAT Min\", \"Bwd IAT Total\",\n",
73 | "                     \"Bwd IAT Max\", \"Bwd IAT Min\", \"Fwd PSH Flags\", \"Bwd PSH Flags\", \"Fwd URG Flags\", \"Bwd URG Flags\", \"Packet Length Min\", \"Packet Length Max\", \"FIN Flag Count\", \"SYN Flag Count\", \"RST Flag Count\", \"PSH Flag Count\",\n",
74 | "                     \"ACK Flag Count\", \"URG Flag Count\", \"CWR Flag Count\", \"ECE Flag Count\", \"Subflow Fwd Packets\", \"Subflow Fwd Bytes\",\n",
75 | "                     \"Subflow Bwd Packets\", \"Subflow Bwd Bytes\", \"FWD Init Win Bytes\", \"Bwd Init Win Bytes\", \"Fwd Act Data Pkts\", \"Fwd Seg Size Min\", \"Active Max\",\n",
76 | "                     \"Active Min\", \"Idle Max\", \"Idle Min\"]\n",
77 | "\n",
78 | "    int16_columns = [\"Fwd Header Length\", \"Bwd Header Length\", \"ICMP Code\", \"ICMP Type\"]\n",
79 | "\n",
80 | "    # Cast each group of columns to its fixed-width integer dtype. The NaN rows were already dropped by\n",
81 | "    # format_csv_for_labelling, so the integer casts cannot fail on missing values.\n",
82 | "    for dtype, columns in ((\"int64\", int64_columns), (\"int32\", int32_columns), (\"int16\", int16_columns)):\n",
83 | "        for column in columns:\n",
84 | "            df[column] = df[column].astype(dtype)\n",
85 | "\n",
86 | "    return df\n",
87 | "\n",
88 | "\n",
89 | "# Main labelling function. Only used for labelling Malicious and Malicious - Attempted flows.\n",
90 | "# Timestamps are in NANOSECONDS (!) Unix time, both bounds inclusive. Note that the CSV files are in the UTC timezone.\n",
91 | "# df = dataframe with flows. Note that labelling happens inplace on the 'df' parameter, and so this function doesn't return anything\n",
92 | "# label = the label that will be given to flows matching the criteria specified in the function\n",
93 | "# src_ip_list / dst_ip_list / dst_port_list = allowed \"Src IP\" / \"Dst IP\" / \"Dst Port\" values (None = no constraint)\n",
94 | "# additional_filters = list of boolean masks over 'df' for any additional constraints that cannot be covered by the\n",
95 | "#                      already provided function arguments; see examples in the actual labelling logic for correct syntax\n",
96 | "# attempted_category = value written to \"Attempted Category\" for matching flows; please consult our website\n",
97 | "#                      (https://intrusion-detection.distrinet-research.be/CNS2022/Tools_Documentation.html) for the definitions.\n",
98 | "# payload_filter = When set to true, this will automatically add a constraint [\"Total Length of Fwd Packet\"] == 0. Note that\n",
99 | "#                  the Attempted label and category still need to be specified manually\n",
100 | "def label_flows(df, label, attack_start_time_nanoseconds, attack_end_time_nanoseconds, src_ip_list=None,\n",
101 | "                dst_ip_list=None, dst_port_list=None, attempted_category=-1, additional_filters=None, payload_filter=False):\n",
102 | "\n",
103 | "    # Start from an all-True boolean mask aligned with df's index and narrow it down constraint by constraint.\n",
104 | "    custom_mask = pd.Series(True, index=df.index)\n",
105 | "\n",
106 | "    attack_start_datetime = pd.to_datetime(attack_start_time_nanoseconds, unit='ns')\n",
107 | "    attack_end_datetime = pd.to_datetime(attack_end_time_nanoseconds, unit='ns')\n",
108 | "    custom_mask &= (df[\"Timestamp\"] >= attack_start_datetime) & (df[\"Timestamp\"] <= attack_end_datetime)\n",
109 | "\n",
110 | "    if src_ip_list is not None:\n",
111 | "        custom_mask &= (df[\"Src IP\"].isin(src_ip_list))\n",
112 | "    if dst_ip_list is not None:\n",
113 | "        custom_mask &= (df[\"Dst IP\"].isin(dst_ip_list))\n",
114 | "    if dst_port_list is not None:\n",
115 | "        custom_mask &= (df[\"Dst Port\"].isin(dst_port_list))\n",
116 | "\n",
117 | "    if payload_filter:\n",
118 | "        custom_mask &= (df[\"Total Length of Fwd Packet\"] == 0)\n",
119 | "\n",
120 | "    # None (the default) means no extra constraints; a shared mutable [] default is deliberately avoided.\n",
121 | "    for extra_filter in ([] if additional_filters is None else additional_filters):\n",
122 | "        custom_mask &= extra_filter\n",
123 | "\n",
124 | "    # Assign through df.loc instead of df[col].mask(..., inplace=True): the latter is a chained in-place update on a\n",
125 | "    # temporary Series, which pandas deprecates and which no longer modifies 'df' once Copy-on-Write is enabled.\n",
126 | "    df.loc[custom_mask, \"Label\"] = label\n",
127 | "    df.loc[custom_mask, \"Attempted Category\"] = attempted_category\n",
128 | "\n",
129 | "# This function is called when all labelling of malicious flows is completed. Anything that has not yet received a label\n",
130 | "# so far is labelled as Benign, the label statistics are printed, and 'df' is written to 'file_to_write' as CSV.\n",
131 | "def label_rest_as_benign_and_write_csv(df, file_to_write):\n",
132 | "    # df.loc is used instead of df[\"Label\"].mask(..., inplace=True), a chained in-place update that pandas deprecates.\n",
133 | "    df.loc[df[\"Label\"] == \"NeedManualLabel\", \"Label\"] = \"BENIGN\"\n",
134 | "    # Relabel artefact flows with [Flow ID] = '8.0.6.4-8.6.0.1-0-0-0' to label = BENIGN\n",
135 | "    df.loc[df[\"Flow ID\"] == '8.0.6.4-8.6.0.1-0-0-0', \"Label\"] = \"BENIGN\"\n",
136 | "\n",
137 | "    print(\"label count after labelling:\\r\\n\", df[\"Label\"].value_counts())\n",
138 | "    print(\"Attempted Category count after labelling:\\r\\n\", df[\"Attempted Category\"].value_counts())\n",
139 | "\n",
140 | "    if print_index:\n",
141 | "        df.reset_index(inplace=True, drop=True)\n",
142 | "        df.index += 1\n",
143 | "        df.index.name = 'id'\n",
144 | "        df.to_csv(file_to_write)\n",
145 | "    else:\n",
146 | "        df.to_csv(file_to_write, index=False)"
147 | ],
148 | "metadata": {
149 | "collapsed": false
150 | }
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 3,
155 | "outputs": [
156 | {
157 | "name": "stdout",
158 | "output_type": "stream",
159 | "text": [
160 | "labels before pre-processing: NeedManualLabel 6268692\n",
161 | "Name: Label, dtype: int64\n",
162 | "labels after pre-processing: NeedManualLabel 5898350\n",
163 | "Name: Label, dtype: int64\n",
164 | "label count after labelling:\r\n",
165 | " BENIGN 5610799\n",
166 | "FTP-BruteForce - Attempted 193354\n",
167 | "SSH-BruteForce 94197\n",
168 | "Name: Label, dtype: int64\n",
169 | "Attempted Category count after labelling:\r\n",
170 | " -1 5704996\n",
171 | " 1 193324\n",
172 | " 4 30\n",
173 | "Name: Attempted Category, dtype: int64\n"
174 | ]
175 | }
176 | ],
177 | "source": [
178 | "#----------------------+\n",
179 | "# WEDNESDAY 14-02-2018 |\n",
180 | "#----------------------+\n",
181 | "\n",
182 | "dir_name = \"Wednesday-14-02-2018\"\n",
183 | "wednesday_14022018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n",
184 | "\n",
185 | "#-- FTP-BruteForce\n",
186 | "label_flows(wednesday_14022018_df, \"FTP-BruteForce - Attempted\", 1518618806*(10**9),\n",
187 | " 1518624631*(10**9), [\"18.221.219.4\"], [\"172.31.69.25\"], attempted_category=1)\n",
188 | "\n",
189 | "# FTP-BruteForce - Attempted (tool accidentally got launched in FTP bruteforce mode instead of SSH bruteforce mode)\n",
190 | "# Note that, in order to avoid float imprecisions at the micro- and nanosecond level, UNIX timestamps such as\n",
191 | "# 1518631281.199541000, which are in seconds, need to be converted to nanoseconds, so that the number is stored\n",
192 | "# in int64 instead of float.\n",
193 | "label_flows(wednesday_14022018_df, \"FTP-BruteForce - Attempted\", 1518631281199541000,\n",
194 | " 1518631281502585000, [\"13.58.98.64\"], [\"172.31.69.25\"], [21], attempted_category=4)\n",
195 | "\n",
196 | "#-- SSH-BruteForce\n",
197 | "label_flows(wednesday_14022018_df, \"SSH-BruteForce\", 1518631310*(10**9),\n",
198 | " 1518636750*(10**9), [\"13.58.98.64\"], [\"172.31.69.25\"], [22])\n",
199 | "# Payload filter\n",
200 | "label_flows(wednesday_14022018_df, \"SSH-BruteForce - Attempted\", 1518631310*(10**9),\n",
201 | " 1518636750*(10**9), [\"13.58.98.64\"], [\"172.31.69.25\"], [22], attempted_category=0, payload_filter=True)\n",
202 | "\n",
203 | "label_rest_as_benign_and_write_csv(wednesday_14022018_df, DATASET_PATH + dir_name + \".csv\")"
204 | ],
205 | "metadata": {
206 | "collapsed": false
207 | }
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 9,
212 | "outputs": [
213 | {
214 | "name": "stdout",
215 | "output_type": "stream",
216 | "text": [
217 | "labels before pre-processing: NeedManualLabel 5762777\n",
218 | "Name: Label, dtype: int64\n",
219 | "labels after pre-processing: NeedManualLabel 5410102\n",
220 | "Name: Label, dtype: int64\n",
221 | "label count after labelling:\r\n",
222 | " BENIGN 5372471\n",
223 | "DoS GoldenEye 22560\n",
224 | "DoS Slowloris 8490\n",
225 | "DoS GoldenEye - Attempted 4301\n",
226 | "DoS Slowloris - Attempted 2280\n",
227 | "Name: Label, dtype: int64\n",
228 | "Attempted Category count after labelling:\r\n",
229 | " -1 5403521\n",
230 | " 4 4248\n",
231 | " 0 2280\n",
232 | " 6 53\n",
233 | "Name: Attempted Category, dtype: int64\n"
234 | ]
235 | }
236 | ],
237 | "source": [
238 | "#---------------------+\n",
239 | "# THURSDAY 15-02-2018 |\n",
240 | "#---------------------+\n",
241 | "\n",
242 | "dir_name=\"Thursday-15-02-2018\"\n",
243 | "thursday_15022018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n",
244 | "\n",
245 | "#-- DoS GoldenEye\n",
246 | "label_flows(thursday_15022018_df, \"DoS GoldenEye\", 1518701262*(10**9), 1518703905*(10**9), [\"18.219.211.138\"],\n",
247 | " [\"172.31.69.25\"], additional_filters=\n",
248 | " [(thursday_15022018_df[\"Fwd RST Flags\"] == 0) |\n",
249 | " (thursday_15022018_df[\"Flow Duration\"] >= 5050000)])\n",
250 | "\n",
251 | "#-- DoS GoldenEye - Attempted\n",
252 | "label_flows(thursday_15022018_df, \"DoS GoldenEye - Attempted\", 1518701262*(10**9), 1518703905*(10**9), [\"18.219.211.138\"],\n",
253 | " [\"172.31.69.25\"], attempted_category=4, additional_filters=\n",
254 | " [thursday_15022018_df[\"Fwd RST Flags\"] > 0,\n",
255 | " thursday_15022018_df[\"Flow Duration\"] < 5050000])\n",
256 | "\n",
257 | "#-- DoS GoldenEye - Attempted\n",
258 | "label_flows(thursday_15022018_df, \"DoS GoldenEye - Attempted\", 1518701262*(10**9), 1518703905*(10**9), [\"18.219.211.138\"],\n",
259 | " [\"172.31.69.25\"], attempted_category=6, additional_filters=\n",
260 | " [thursday_15022018_df[\"Bwd RST Flags\"] == 1,\n",
261 | " thursday_15022018_df[\"Total Length of Bwd Packet\"] == 0,\n",
262 | " thursday_15022018_df[\"Flow Duration\"] > 100000000])\n",
263 | "\n",
264 | "# Payload filter\n",
265 | "label_flows(thursday_15022018_df, \"DoS GoldenEye - Attempted\", 1518701262*(10**9), 1518703905*(10**9), [\"18.219.211.138\"],\n",
266 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True)\n",
267 | "\n",
268 | "#-- DoS Slowloris\n",
269 | "label_flows(thursday_15022018_df, \"DoS Slowloris\", 1518706812*(10**9), 1518709321*(10**9), [\"18.217.165.70\"],\n",
270 | " [\"172.31.69.25\"])\n",
271 | "\n",
272 | "# Payload filter\n",
273 | "label_flows(thursday_15022018_df, \"DoS Slowloris - Attempted\", 1518706812*(10**9), 1518709321*(10**9), [\"18.217.165.70\"],\n",
274 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True)\n",
275 | "\n",
276 | "label_rest_as_benign_and_write_csv(thursday_15022018_df, DATASET_PATH + dir_name + \".csv\")"
277 | ],
278 | "metadata": {
279 | "collapsed": false
280 | }
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 5,
285 | "outputs": [
286 | {
287 | "name": "stdout",
288 | "output_type": "stream",
289 | "text": [
290 | "labels before pre-processing: NeedManualLabel 7719001\n",
291 | "Name: Label, dtype: int64\n",
292 | "labels after pre-processing: NeedManualLabel 7390266\n",
293 | "Name: Label, dtype: int64\n",
294 | "label count after labelling:\r\n",
295 | " BENIGN 5481500\n",
296 | "DoS Hulk 1803160\n",
297 | "FTP-BruteForce - Attempted 105520\n",
298 | "DoS Hulk - Attempted 86\n",
299 | "Name: Label, dtype: int64\n",
300 | "Attempted Category count after labelling:\r\n",
301 | " -1 7284660\n",
302 | " 1 105520\n",
303 | " 0 86\n",
304 | "Name: Attempted Category, dtype: int64\n"
305 | ]
306 | }
307 | ],
308 | "source": [
309 | "#-------------------+\n",
310 | "# FRIDAY 16-02-2018 |\n",
311 | "#-------------------+\n",
312 | "\n",
313 | "dir_name=\"Friday-16-02-2018\"\n",
314 | "friday_16022018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n",
315 | "\n",
316 | "#-- FTP-BruteForce - Attempted\n",
317 | "label_flows(friday_16022018_df, \"FTP-BruteForce - Attempted\", 1518790334*(10**9), 1518793513*(10**9), [\"13.59.126.31\"],\n",
318 | " [\"172.31.69.25\"], attempted_category=1)\n",
319 | "\n",
320 | "#-- DoS Hulk\n",
321 | "label_flows(friday_16022018_df, \"DoS Hulk\", 1518803127*(10**9), 1518803903*(10**9), [\"18.219.193.20\"], [\"172.31.69.25\"])\n",
322 | "\n",
323 | "# Payload filter\n",
324 | "label_flows(friday_16022018_df, \"DoS Hulk - Attempted\", 1518803127*(10**9), 1518803903*(10**9), [\"18.219.193.20\"],\n",
325 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True)\n",
326 | "\n",
327 | "#-- DoS Slowhttptest: No actual DoS Slowhttptest flows are present on this day in this dataset!\n",
328 | "# Instead we only find failed FTP-Patator traffic, which is exactly what is covered earlier in this cell\n",
329 | "\n",
330 | "label_rest_as_benign_and_write_csv(friday_16022018_df, DATASET_PATH + dir_name + \".csv\")"
331 | ],
332 | "metadata": {
333 | "collapsed": false
334 | }
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": 6,
339 | "outputs": [
340 | {
341 | "name": "stdout",
342 | "output_type": "stream",
343 | "text": [
344 | "labels before pre-processing: NeedManualLabel 6411771\n",
345 | "Name: Label, dtype: int64\n",
346 | "labels after pre-processing: NeedManualLabel 6054702\n",
347 | "Name: Label, dtype: int64\n",
348 | "label count after labelling:\r\n",
349 | " BENIGN 5764497\n",
350 | "DDoS-LOIC-HTTP 289328\n",
351 | "DDoS-LOIC-UDP 797\n",
352 | "DDoS-LOIC-UDP - Attempted 80\n",
353 | "Name: Label, dtype: int64\n",
354 | "Attempted Category count after labelling:\r\n",
355 | " -1 6054622\n",
356 | " 6 80\n",
357 | "Name: Attempted Category, dtype: int64\n"
358 | ]
359 | }
360 | ],
361 | "source": [
362 | "#--------------------+\n",
363 | "# TUESDAY 20-02-2018 |\n",
364 | "#--------------------+\n",
365 | "\n",
366 | "dir_name=\"Tuesday-20-02-2018\"\n",
367 | "tuesday_20022018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n",
368 | "\n",
369 | "#-- DDoS LOIC HTTP\n",
370 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-HTTP\", 1519136034*(10**9), 1519139809*(10**9),\n",
371 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
372 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
373 | " [\"172.31.69.25\"], additional_filters=[\n",
374 | " tuesday_20022018_df[\"Protocol\"] == 6\n",
375 | " ])\n",
376 | "\n",
377 | "# Payload filter\n",
378 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-HTTP - Attempted\", 1519136034*(10**9), 1519139809*(10**9),\n",
379 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
380 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
381 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True, additional_filters=[tuesday_20022018_df[\"Protocol\"] == 6])\n",
382 | "\n",
383 | "#-- DDoS LOIC UDP\n",
384 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-UDP\", 1519146857*(10**9), 1519147756*(10**9),\n",
385 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
386 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
387 | " [\"172.31.69.25\"], additional_filters=[\n",
388 | " tuesday_20022018_df[\"Protocol\"] == 17])\n",
389 | "\n",
390 | "# Payload filter\n",
391 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-UDP - Attempted\", 1519146857*(10**9), 1519147756*(10**9),\n",
392 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
393 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
394 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True, additional_filters=[tuesday_20022018_df[\"Protocol\"] == 17])\n",
395 | "\n",
396 | "# Attempted - Target unresponsive (the ICMP destination unreachable answers to the attack - using protocol = 1 for ICMP)\n",
397 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-UDP - Attempted\", 1519146857*(10**9), 1519147756*(10**9),\n",
398 | " [\"172.31.69.25\"], [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
399 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
400 | " attempted_category=6, additional_filters=[(tuesday_20022018_df[\"Protocol\"] == 1)])\n",
401 | "\n",
402 | "label_rest_as_benign_and_write_csv(tuesday_20022018_df, DATASET_PATH + dir_name + \".csv\")"
403 | ],
404 | "metadata": {
405 | "collapsed": false
406 | }
407 | },
408 | {
409 | "cell_type": "code",
410 | "execution_count": 7,
411 | "outputs": [
412 | {
413 | "name": "stdout",
414 | "output_type": "stream",
415 | "text": [
416 | "labels before pre-processing: NeedManualLabel 7295839\n",
417 | "Name: Label, dtype: int64\n",
418 | "labels after pre-processing: NeedManualLabel 6962593\n",
419 | "Name: Label, dtype: int64\n",
420 | "label count after labelling:\r\n",
421 | " BENIGN 5878399\n",
422 | "DDoS-HOIC 1082293\n",
423 | "DDoS-LOIC-UDP 1730\n",
424 | "DDoS-LOIC-UDP - Attempted 171\n",
425 | "Name: Label, dtype: int64\n",
426 | "Attempted Category count after labelling:\r\n",
427 | " -1 6962422\n",
428 | " 6 171\n",
429 | "Name: Attempted Category, dtype: int64\n"
430 | ]
431 | }
432 | ],
433 | "source": [
434 | "#----------------------+\n",
435 | "# WEDNESDAY 21-02-2018 |\n",
436 | "#----------------------+\n",
437 | "\n",
438 | "dir_name = \"Wednesday-21-02-2018\"\n",
439 | "wednesday_21022018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n",
440 | "\n",
441 | "#-- DDoS LOIC UDP\n",
442 | "label_flows(wednesday_21022018_df, \"DDoS-LOIC-UDP\", 1519222131*(10**9), 1519224219*(10**9),\n",
443 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
444 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
445 | " [\"172.31.69.28\"], additional_filters=[\n",
446 | " wednesday_21022018_df[\"Protocol\"] == 17\n",
447 | " ])\n",
448 | "\n",
449 | "# Payload filter\n",
450 | "label_flows(wednesday_21022018_df, \"DDoS-LOIC-UDP - Attempted\", 1519222131*(10**9), 1519224219*(10**9),\n",
451 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
452 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
453 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True, additional_filters=[wednesday_21022018_df[\"Protocol\"] == 17])\n",
454 | "\n",
455 | "# Attempted - Target unresponsive (the ICMP destination unreachable answers to the attack - using protocol = 1 for ICMP)\n",
456 | "label_flows(wednesday_21022018_df, \"DDoS-LOIC-UDP - Attempted\", 1519222131*(10**9), 1519224219*(10**9),\n",
457 | " [\"172.31.69.28\"], [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
458 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
459 | " attempted_category=6, additional_filters=[(wednesday_21022018_df[\"Protocol\"] == 1)])\n",
460 | "\n",
461 | "#-- DDoS HOIC\n",
462 | "label_flows(wednesday_21022018_df, \"DDoS-HOIC\", 1519236668*(10**9), 1519239955*(10**9),\n",
463 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
464 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
465 | " [\"172.31.69.28\"], additional_filters=[\n",
466 | " wednesday_21022018_df[\"Protocol\"] == 6\n",
467 | " ])\n",
468 | "\n",
469 | "# Payload filter\n",
470 | "label_flows(wednesday_21022018_df, \"DDoS-HOIC - Attempted\", 1519236668*(10**9), 1519239955*(10**9),\n",
471 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
472 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
473 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True, additional_filters=[wednesday_21022018_df[\"Protocol\"] == 6])\n",
474 | "\n",
475 | "label_rest_as_benign_and_write_csv(wednesday_21022018_df, DATASET_PATH + dir_name + \".csv\")"
476 | ],
477 | "metadata": {
478 | "collapsed": false
479 | }
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 8,
484 | "outputs": [
485 | {
486 | "name": "stdout",
487 | "output_type": "stream",
488 | "text": [
489 | "labels before pre-processing: NeedManualLabel 6483351\n",
490 | "Name: Label, dtype: int64\n",
491 | "labels after pre-processing: NeedManualLabel 6071153\n",
492 | "Name: Label, dtype: int64\n",
493 | "label count after labelling:\r\n",
494 | " BENIGN 6070945\n",
495 | "Web Attack - Brute Force - Attempted 76\n",
496 | "Web Attack - Brute Force 69\n",
497 | "Web Attack - XSS 40\n",
498 | "Web Attack - SQL 16\n",
499 | "Web Attack - SQL - Attempted 4\n",
500 | "Web Attack - XSS - Attempted 3\n",
501 | "Name: Label, dtype: int64\n",
502 | "Attempted Category count after labelling:\r\n",
503 | " -1 6071070\n",
504 | " 5 66\n",
505 | " 2 12\n",
506 | " 0 4\n",
507 | " 3 1\n",
508 | "Name: Attempted Category, dtype: int64\n"
509 | ]
510 | }
511 | ],
512 | "source": [
513 | "#---------------------+\n",
514 | "# THURSDAY 22-02-2018 |\n",
515 | "#---------------------+\n",
516 | "\n",
517 | "dir_name = \"Thursday-22-02-2018\"\n",
518 | "thursday_22022018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n",
519 | "\n",
520 | "#-- Web Attack SQL\n",
521 | "label_flows(thursday_22022018_df, \"Web Attack - SQL\", 1519330590418906000, 1519331276022793000, [\"18.218.115.60\"],\n",
522 | " [\"172.31.69.28\"], additional_filters=\n",
523 | " [thursday_22022018_df[\"Total Length of Fwd Packet\"] > 0,\n",
524 | " thursday_22022018_df[\"Total Length of Bwd Packet\"] > 0])\n",
525 | "\n",
526 | "# Attack startup artefact\n",
527 | "label_flows(thursday_22022018_df, \"Web Attack - SQL - Attempted\", 1519330470169342000, 1519330498599986000, [\"18.218.115.60\"],\n",
528 | " [\"172.31.69.28\"], attempted_category=2)\n",
529 | "\n",
530 | "# Payload filter\n",
531 | "label_flows(thursday_22022018_df, \"Web Attack - SQL - Attempted\", 1519330590418906000, 1519331276022793000, [\"18.218.115.60\"],\n",
532 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True)\n",
533 | "\n",
534 | "#-- Web Attack XSS\n",
535 | "# Port 63782 is attack setup (navigating to website)\n",
536 | "label_flows(thursday_22022018_df, \"Web Attack - XSS\", 1519321899783923000, 1519324181827037000, [\"18.218.115.60\"],\n",
537 | " [\"172.31.69.28\"], additional_filters=\n",
538 | " [~(thursday_22022018_df[\"Src Port\"].isin([63782, 64144]))])\n",
539 | "\n",
540 | "# Attempted attack setup\n",
541 | "label_flows(thursday_22022018_df, \"Web Attack - XSS - Attempted\", 1519321899783923000, 1519324181827037000, [\"18.218.115.60\"],\n",
542 | " [\"172.31.69.28\"], attempted_category=2, additional_filters=\n",
543 | " [thursday_22022018_df[\"Src Port\"] == 63782])\n",
544 | "\n",
545 | "label_flows(thursday_22022018_df, \"Web Attack - XSS - Attempted\", 1519321899783923000, 1519324181827037000, [\"18.218.115.60\"],\n",
546 | " [\"172.31.69.28\"], attempted_category=3, additional_filters=\n",
547 | " [thursday_22022018_df[\"Src Port\"] == 64144])\n",
548 | "\n",
549 | "# Payload filter\n",
550 | "label_flows(thursday_22022018_df, \"Web Attack - XSS - Attempted\", 1519321899783923000, 1519324181827037000, [\"18.218.115.60\"],\n",
551 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True, additional_filters=\n",
552 | " [~(thursday_22022018_df[\"Src Port\"].isin([63782, 64144]))])\n",
553 | "\n",
554 | "#-- Web Attack Brute Force & Attempted\n",
555 | "\n",
556 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force\", 1519309071336902000, 1519313039858533000, [\"18.218.115.60\"],\n",
557 | " [\"172.31.69.28\"], additional_filters=\n",
558 | " [thursday_22022018_df[\"Total Fwd Packet\"] > 20])\n",
559 | "\n",
560 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force - Attempted\", 1519309071336902000, 1519313039858533000,\n",
561 | " [\"18.218.115.60\"], [\"172.31.69.28\"], attempted_category=5, additional_filters=\n",
562 | " [(thursday_22022018_df[\"Total Fwd Packet\"] <= 20) & (thursday_22022018_df[\"Total Length of Fwd Packet\"] > 0)])\n",
563 | "\n",
564 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force - Attempted\", 1519308824965705000, 1519308947920399000, [\"18.218.115.60\"],\n",
565 | " [\"172.31.69.28\"], attempted_category=2)\n",
566 | "\n",
567 | "# Payload filter\n",
568 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force - Attempted\", 1519309071336902000, 1519313039858533000,\n",
569 | " [\"18.218.115.60\"], [\"172.31.69.28\"], attempted_category=0, payload_filter=True)\n",
570 | "\n",
571 | "label_rest_as_benign_and_write_csv(thursday_22022018_df, DATASET_PATH + dir_name + \".csv\")\n"
572 | ],
573 | "metadata": {
574 | "collapsed": false
575 | }
576 | },
577 | {
578 | "cell_type": "code",
579 | "execution_count": 9,
580 | "outputs": [
581 | {
582 | "name": "stdout",
583 | "output_type": "stream",
584 | "text": [
585 | "labels before pre-processing: NeedManualLabel 6313169\n",
586 | "Name: Label, dtype: int64\n",
587 | "labels after pre-processing: NeedManualLabel 5976481\n",
588 | "Name: Label, dtype: int64\n",
589 | "label count after labelling:\r\n",
590 | " BENIGN 5976251\n",
591 | "Web Attack - XSS 73\n",
592 | "Web Attack - Brute Force 62\n",
593 | "Web Attack - Brute Force - Attempted 61\n",
594 | "Web Attack - SQL 23\n",
595 | "Web Attack - SQL - Attempted 10\n",
596 | "Web Attack - XSS - Attempted 1\n",
597 | "Name: Label, dtype: int64\n",
598 | "Attempted Category count after labelling:\r\n",
599 | " -1 5976409\n",
600 | " 5 60\n",
601 | " 0 6\n",
602 | " 2 6\n",
603 | "Name: Attempted Category, dtype: int64\n"
604 | ]
605 | }
606 | ],
607 | "source": [
608 | "#-------------------+\n",
609 | "# FRIDAY 23-02-2018 |\n",
610 | "#-------------------+\n",
611 | "\n",
612 | "dir_name = \"Friday-23-02-2018\"\n",
613 | "friday_23022018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n",
614 | "\n",
615 | "#-- Web Attack SQL\n",
616 | "label_flows(friday_23022018_df, \"Web Attack - SQL\", 1519412792126122000, 1519413444947957000 , [\"18.218.115.60\"],\n",
617 | " [\"172.31.69.28\"], additional_filters=\n",
618 | " [friday_23022018_df[\"Total Length of Fwd Packet\"] > 0,\n",
619 | " friday_23022018_df[\"Total Length of Bwd Packet\"] > 0])\n",
620 | "\n",
621 | "# Attack startup artefact\n",
622 | "label_flows(friday_23022018_df, \"Web Attack - SQL - Attempted\", 1519412722675686000, 1519412787879296000, [\"18.218.115.60\"],\n",
623 | " [\"172.31.69.28\"], attempted_category=2)\n",
624 | "\n",
625 | "# Payload filter\n",
626 | "label_flows(friday_23022018_df, \"Web Attack - SQL - Attempted\", 1519412792126122000, 1519413444947957000 , [\"18.218.115.60\"],\n",
627 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True)\n",
628 | "\n",
629 | "#-- Web Attack XSS\n",
630 | "label_flows(friday_23022018_df, \"Web Attack - XSS\", 1519405264559707000, 1519409428237472000, [\"18.218.115.60\"],\n",
631 | " [\"172.31.69.28\"], additional_filters=\n",
632 | " [~(friday_23022018_df[\"Src Port\"].isin([59173]))])\n",
633 | "\n",
634 | "label_flows(friday_23022018_df, \"Web Attack - XSS - Attempted\", 1519405264559707000, 1519409428237472000, [\"18.218.115.60\"],\n",
635 | " [\"172.31.69.28\"], attempted_category=2, additional_filters=\n",
636 | " [(friday_23022018_df[\"Src Port\"].isin([59173]))])\n",
637 | "\n",
638 | "# Payload filter\n",
639 | "label_flows(friday_23022018_df, \"Web Attack - XSS - Attempted\", 1519405264559707000, 1519409428237472000, [\"18.218.115.60\"],\n",
640 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True)\n",
641 | "\n",
642 | "#-- Web Attack Brute Force & Attempted\n",
643 | "label_flows(friday_23022018_df, \"Web Attack - Brute Force\", 1519394670193975000, 1519398186406294000, [\"18.218.115.60\"],\n",
644 | " [\"172.31.69.28\"], additional_filters=\n",
645 | " [friday_23022018_df[\"Total Fwd Packet\"] > 20])\n",
646 | "\n",
647 | "label_flows(friday_23022018_df, \"Web Attack - Brute Force - Attempted\", 1519394670193975000, 1519398186406294000,\n",
648 | " [\"18.218.115.60\"], [\"172.31.69.28\"], attempted_category=5, additional_filters=\n",
649 | " [(friday_23022018_df[\"Total Fwd Packet\"] <= 20) & (friday_23022018_df[\"Total Length of Fwd Packet\"] > 0)])\n",
650 | "\n",
651 | "# Payload filter:\n",
652 | "label_flows(friday_23022018_df, \"Web Attack - Brute Force - Attempted\", 1519394670193975000, 1519398186406294000,\n",
653 | " [\"18.218.115.60\"], [\"172.31.69.28\"], attempted_category=0, payload_filter=True)\n",
654 | "\n",
655 | "label_rest_as_benign_and_write_csv(friday_23022018_df, DATASET_PATH + dir_name + \".csv\")"
656 | ],
657 | "metadata": {
658 | "collapsed": false
659 | }
660 | },
661 | {
662 | "cell_type": "code",
663 | "execution_count": 10,
664 | "outputs": [
665 | {
666 | "name": "stdout",
667 | "output_type": "stream",
668 | "text": [
669 | "labels before pre-processing: NeedManualLabel 7173690\n",
670 | "Name: Label, dtype: int64\n",
671 | "labels after pre-processing: NeedManualLabel 6568726\n",
672 | "Name: Label, dtype: int64\n",
673 | "label count after labelling:\r\n",
674 | " BENIGN 6518882\n",
675 | "Infiltration - NMAP Portscan 49740\n",
676 | "Infiltration - Dropbox Download 46\n",
677 | "Infiltration - Communication Victim Attacker 43\n",
678 | "Infiltration - Dropbox Download - Attempted 15\n",
679 | "Name: Label, dtype: int64\n",
680 | "Attempted Category count after labelling:\r\n",
681 | " -1 6568711\n",
682 | " 4 15\n",
683 | "Name: Attempted Category, dtype: int64\n"
684 | ]
685 | }
686 | ],
687 | "source": [
688 | "#----------------------+\n",
689 | "# WEDNESDAY 28-02-2018 |\n",
690 | "#----------------------+\n",
691 | "\n",
692 | "dir_name = \"Wednesday-28-02-2018\"\n",
693 | "wednesday_28022018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n",
694 | "\n",
695 | "#-- Infiltration - Dropbox Download\n",
696 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download\", 1519828404*(10**9), 1519829172*(10**9),\n",
697 | " [\"172.31.69.24\"],\n",
698 | " [\"162.125.3.1\", \"162.125.3.5\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"])\n",
699 | "\n",
700 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download\", 1519839771*(10**9), 1519839824*(10**9),\n",
701 | " [\"172.31.69.24\"],\n",
702 | " [\"162.125.3.1\", \"162.125.3.5\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"])\n",
703 | "\n",
704 | "# Payload filter\n",
705 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download - Attempted\", 1519828404*(10**9), 1519829172*(10**9),\n",
706 | " [\"172.31.69.24\"],\n",
707 | " [\"162.125.3.1\", \"162.125.3.5\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"],\n",
708 | " attempted_category=0, payload_filter=True)\n",
709 | "\n",
710 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download - Attempted\", 1519839771*(10**9), 1519839824*(10**9),\n",
711 | " [\"172.31.69.24\"],\n",
712 | " [\"162.125.3.1\", \"162.125.3.5\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"],\n",
713 | " attempted_category=0, payload_filter=True)\n",
714 | "\n",
715 | "# Attempted - Attack artefact\n",
716 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download - Attempted\", 1519828404*(10**9), 1519829172*(10**9),\n",
717 | " [\"172.31.69.24\"],\n",
718 | " [\"104.16.100.29\", \"104.16.99.29\", \"52.84.128.3\", \"52.85.101.236\", \"52.85.131.81\", \"52.85.95.206\"], attempted_category=4)\n",
719 | "\n",
720 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download - Attempted\", 1519839771*(10**9), 1519839824*(10**9),\n",
721 | " [\"172.31.69.24\"],\n",
722 | " [\"104.16.100.29\", \"104.16.99.29\", \"52.84.128.3\", \"52.85.101.236\", \"52.85.131.81\", \"52.85.95.206\"], attempted_category=4)\n",
723 | "\n",
724 | "#-- Infiltration - Communication Victim Attacker\n",
725 | "label_flows(wednesday_28022018_df, \"Infiltration - Communication Victim Attacker\", 1519829140*(10**9),\n",
726 | " 1519834135*(10**9), [\"172.31.69.24\"], [\"13.58.225.34\"])\n",
727 | "\n",
728 | "label_flows(wednesday_28022018_df, \"Infiltration - Communication Victim Attacker\", 1519839839*(10**9),\n",
729 | " 1519843200*(10**9), [\"172.31.69.24\"], [\"13.58.225.34\"])\n",
730 | "\n",
731 | "# Payload filter\n",
732 | "\n",
733 | "label_flows(wednesday_28022018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519829140*(10**9),\n",
734 | " 1519834135*(10**9), [\"172.31.69.24\"], [\"13.58.225.34\"], attempted_category=0, payload_filter=True)\n",
735 | "\n",
736 | "label_flows(wednesday_28022018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519839839*(10**9),\n",
737 | " 1519843200*(10**9), [\"172.31.69.24\"], [\"13.58.225.34\"], attempted_category=0, payload_filter=True)\n",
738 | "\n",
739 | "#-- Infiltration - NMAP Portscan\n",
740 | "label_flows(wednesday_28022018_df, \"Infiltration - NMAP Portscan\", 1519829182*(10**9), 1519843140746247000,\n",
741 | " [\"172.31.69.24\"],\n",
742 | " [\"172.31.69.1\", \"172.31.69.10\", \"172.31.69.11\", \"172.31.69.12\", \"172.31.69.13\", \"172.31.69.14\",\n",
743 | " \"172.31.69.16\", \"172.31.69.17\", \"172.31.69.19\", \"172.31.69.20\", \"172.31.69.23\", \"172.31.69.4\",\n",
744 | " \"172.31.69.5\", \"172.31.69.6\", \"172.31.69.8\", \"172.31.69.9\", \"172.31.69.7\", \"172.31.69.22\",\n",
745 | " \"172.31.69.15\", \"172.31.69.21\", \"172.31.69.18\",], additional_filters=\n",
746 | " [~(wednesday_28022018_df[\"Src Port\"] == 68)])\n",
747 | "\n",
748 | "label_rest_as_benign_and_write_csv(wednesday_28022018_df, DATASET_PATH + dir_name + \".csv\")"
749 | ],
750 | "metadata": {
751 | "collapsed": false
752 | }
753 | },
754 | {
755 | "cell_type": "code",
756 | "execution_count": 11,
757 | "outputs": [
758 | {
759 | "name": "stdout",
760 | "output_type": "stream",
761 | "text": [
762 | "labels before pre-processing: NeedManualLabel 7252549\n",
763 | "Name: Label, dtype: int64\n",
764 | "labels after pre-processing: NeedManualLabel 6551401\n",
765 | "Name: Label, dtype: int64\n",
766 | "label count after labelling:\r\n",
767 | " BENIGN 6511554\n",
768 | "Infiltration - NMAP Portscan 39634\n",
769 | "Infiltration - Communication Victim Attacker 161\n",
770 | "Infiltration - Dropbox Download 39\n",
771 | "Infiltration - Dropbox Download - Attempted 13\n",
772 | "Name: Label, dtype: int64\n",
773 | "Attempted Category count after labelling:\r\n",
774 | " -1 6551388\n",
775 | " 4 13\n",
776 | "Name: Attempted Category, dtype: int64\n"
777 | ]
778 | }
779 | ],
780 | "source": [
781 | "#---------------------+\n",
782 | "# THURSDAY 01-03-2018 |\n",
783 | "#---------------------+\n",
784 | "\n",
785 | "dir_name = \"Thursday-01-03-2018\"\n",
786 | "thursday_01032018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n",
787 | "\n",
788 | "#-- Infiltration - Dropbox Download\n",
789 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download\", 1519912390*(10**9), 1519912760*(10**9),\n",
790 | " [\"172.31.69.13\"], [\"162.125.3.1\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"])\n",
791 | "\n",
792 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download\", 1519913032*(10**9), 1519918454*(10**9),\n",
793 | " [\"172.31.69.13\"], [\"162.125.3.1\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"])\n",
794 | "\n",
795 | "# Payload filter\n",
796 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download - Attempted\", 1519912390*(10**9), 1519912760*(10**9),\n",
797 | " [\"172.31.69.13\"],\n",
798 | " [\"162.125.3.1\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"], attempted_category=0, payload_filter=True)\n",
799 | "\n",
800 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download - Attempted\", 1519913032*(10**9), 1519918454*(10**9),\n",
801 | " [\"172.31.69.13\"],\n",
802 | " [\"162.125.3.1\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"], attempted_category=0, payload_filter=True)\n",
803 | "\n",
804 | "# Attempted - Attack artefact\n",
805 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download - Attempted\", 1519912390*(10**9), 1519912760*(10**9),\n",
806 | " [\"172.31.69.13\"], [\"104.16.100.29\", \"13.32.168.125\", \"52.85.112.72\"], attempted_category=4)\n",
807 | "\n",
808 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download - Attempted\", 1519913032*(10**9), 1519918454*(10**9),\n",
809 | " [\"172.31.69.13\"], [\"104.16.100.29\", \"13.32.168.125\", \"52.85.112.72\"], attempted_category=4)\n",
810 | "\n",
811 | "#-- Infiltration - Communication Victim Attacker\n",
812 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker\", 1519912674*(10**9),\n",
813 | " 1519912745*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"])\n",
814 | "\n",
815 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker\", 1519913075*(10**9),\n",
816 | " 1519928245*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"])\n",
817 | "\n",
818 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker\", 1519928295*(10**9),\n",
819 | " 1519933041*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"])\n",
820 | "\n",
821 | "# Payload filter\n",
822 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519912674*(10**9),\n",
823 | " 1519912745*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"], attempted_category=0, payload_filter=True)\n",
824 | "\n",
825 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519913075*(10**9),\n",
826 | " 1519928245*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"], attempted_category=0, payload_filter=True)\n",
827 | "\n",
828 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519928295*(10**9),\n",
829 | " 1519933041*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"], attempted_category=0, payload_filter=True)\n",
830 | "\n",
831 | "#-- Infiltration - NMAP Portscan (DHCP background traffic on source port 68 is filtered out here, exactly as in the\n",
832 | "# NMAP Portscan labelling of the previous day, Wednesday 28-02-2018)\n",
833 | "label_flows(thursday_01032018_df, \"Infiltration - NMAP Portscan\", 1519913388354333000, 1519933092182726000,\n",
834 | " [\"172.31.69.13\"],\n",
835 | " [\"172.31.69.1\", \"172.31.69.11\", \"172.31.69.12\", \"172.31.69.16\", \"172.31.69.8\", \"172.31.69.9\",\n",
836 | " \"172.31.69.10\", \"172.31.69.14\", \"172.31.69.4\", \"172.31.69.5\", \"172.31.69.6\", \"172.31.69.17\",\n",
837 | " \"172.31.69.20\", \"172.31.69.23\", \"172.31.69.24\", \"172.31.69.19\", \"172.31.69.7\", \"172.31.69.15\",\n",
838 | " \"172.31.69.18\", \"172.31.69.22\", \"172.31.69.21\"], additional_filters=\n",
839 | " [thursday_01032018_df[\"Src Port\"] != 68])\n",
840 | "\n",
841 | "label_rest_as_benign_and_write_csv(thursday_01032018_df, DATASET_PATH + dir_name + \".csv\")"
842 | ],
843 | "metadata": {
844 | "collapsed": false
845 | }
846 | },
847 | {
848 | "cell_type": "code",
849 | "execution_count": 3,
850 | "outputs": [
851 | {
852 | "name": "stdout",
853 | "output_type": "stream",
854 | "text": [
855 | "labels before pre-processing: NeedManualLabel 6637636\n",
856 | "Name: Label, dtype: int64\n",
857 | "labels after pre-processing: NeedManualLabel 6311371\n",
858 | "Name: Label, dtype: int64\n",
859 | "label count after labelling:\r\n",
860 | " BENIGN 6168188\n",
861 | "Botnet Ares 142921\n",
862 | "Botnet Ares - Attempted 262\n",
863 | "Name: Label, dtype: int64\n",
864 | "Attempted Category count after labelling:\r\n",
865 | " -1 6311109\n",
866 | " 0 258\n",
867 | " 2 4\n",
868 | "Name: Attempted Category, dtype: int64\n"
869 | ]
870 | }
871 | ],
872 | "source": [
873 | "#-------------------+\n",
874 | "# FRIDAY 02-03-2018 |\n",
875 | "#-------------------+\n",
876 | "\n",
877 | "dir_name = \"Friday-02-03-2018\"\n",
878 | "friday_02032018_df = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name + \"/csv\")\n",
879 | "\n",
880 | "#-- Botnet Ares\n",
881 | "label_flows(friday_02032018_df, \"Botnet Ares\", 1520000008*(10**9), 1520020492*(10**9), additional_filters=\n",
882 | " [(friday_02032018_df[\"Src IP\"] == \"18.219.211.138\") | (friday_02032018_df[\"Dst IP\"] == \"18.219.211.138\")])\n",
883 | "\n",
884 | "#-- Botnet Ares - Attempted: Tear-down artefact. Botnet slave has ongoing TCP connection to master which is prematurely terminated by master sending RST packet\n",
885 | "label_flows(friday_02032018_df, \"Botnet Ares - Attempted\", 1520020424*(10**9), 1520020492*(10**9), attempted_category=2, additional_filters=\n",
886 | " [(friday_02032018_df[\"Dst IP\"] == \"18.219.211.138\") &\n",
887 | " (friday_02032018_df[\"Total Length of Fwd Packet\"] > 0) &\n",
888 | " (friday_02032018_df[\"Bwd RST Flags\"] > 0)])\n",
889 | "\n",
890 | "\n",
891 | "# Payload filter\n",
892 | "label_flows(friday_02032018_df, \"Botnet Ares - Attempted\", 1520000008*(10**9), 1520020492*(10**9), attempted_category=0, additional_filters=\n",
893 | " [((friday_02032018_df[\"Src IP\"] == \"18.219.211.138\") | (friday_02032018_df[\"Dst IP\"] == \"18.219.211.138\")) &\n",
894 | " (friday_02032018_df[\"Total Length of Fwd Packet\"] == 0) & (friday_02032018_df[\"Total Length of Bwd Packet\"] == 0)])\n",
895 | "\n",
896 | "label_rest_as_benign_and_write_csv(friday_02032018_df, DATASET_PATH + dir_name + \".csv\")\n",
897 | "\n"
898 | ],
899 | "metadata": {
900 | "collapsed": false
901 | }
902 | },
903 | {
904 | "cell_type": "code",
905 | "execution_count": 12,
906 | "outputs": [],
907 | "source": [],
908 | "metadata": {
909 | "collapsed": false
910 | }
911 | }
912 | ],
913 | "metadata": {
914 | "kernelspec": {
915 | "display_name": "Python 3",
916 | "language": "python",
917 | "name": "python3"
918 | },
919 | "language_info": {
920 | "codemirror_mode": {
921 | "name": "ipython",
922 | "version": 2
923 | },
924 | "file_extension": ".py",
925 | "mimetype": "text/x-python",
926 | "name": "python",
927 | "nbconvert_exporter": "python",
928 | "pygments_lexer": "ipython2",
929 | "version": "2.7.6"
930 | }
931 | },
932 | "nbformat": 4,
933 | "nbformat_minor": 0
934 | }
935 |
--------------------------------------------------------------------------------
/Labelling/CICIDS2018_original_version_labelling.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "import glob\n",
14 | "import os\n",
15 | "from sys import platform\n",
16 | "\n",
17 | "# THIS LABELLING SCRIPT IS USED TO LABEL THE OLD VERSION OF CSE-CIC-IDS-2018. THIS VERSION SHOULD ONLY BE USED IF YOU\n",
18 | "# WISH TO RECREATE OUR RESULTS AS REPORTED IN OUR PAPER: https://intrusion-detection.distrinet-research.be/CNS2022/index.html\n",
19 | "\n",
20 | "# THIS SCRIPT ACCEPTS AS INPUT THE ORIGINAL CSVs AS RELEASED BY THE DATASET AUTHORS: https://www.unb.ca/cic/datasets/ids-2018.html\n",
21 | "\n",
22 | "\n",
23 | "pd.set_option('display.max_rows', 100)\n",
24 | "\n",
25 | "\n",
26 | "DATASET_PATH = \"\"\n",
27 | "\n",
28 | "# Set to False to omit the 1-based 'id' index column (used to refer to line numbers) when writing the final csv\n",
29 | "print_index = True"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 2,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "def format_csv_for_labelling(df):\n",
39 | " # strip leading whitespaces in column names\n",
40 | " df.columns = df.columns.str.lstrip(\" \")\n",
41 | "\n",
42 | " print(\"labels before pre-processing:\", df[\"Label\"].value_counts())\n",
43 | "\n",
44 | " # Keep track of header rows that occur in the middle of the flow traces. Drop them\n",
45 | " # temporarily for ease of labeling and dataframe manipulation and then merge them\n",
46 | " # back in at the very end. The intention is to preserve the original published files\n",
47 | " # exactly except with the corrected labelling. This makes lining up mismatches between\n",
48 | " # the original and corrected version easier, using line number as the reference.\n",
49 | " # This is for 2018 version only, as the 2017 version does not contain header rows in\n",
50 | " # the middle of flow traces.\n",
51 | " header_rows = df[(df[\"Timestamp\"] == \"Timestamp\") & (df.index > 0)]\n",
52 | " df = df.drop(header_rows.index)\n",
53 | "\n",
54 | " # Since the CICIDS 2018 authors used a 12-hour format but removed AM/PM, we need to reconstruct it.\n",
55 | " # We do this based on the knowledge that they collected traffic from roughly 9:00 AM to 5:00 PM, so hours before 7 are treated as PM.\n",
56 | " df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%m/%Y %H:%M:%S')\n",
57 | " #for i, item in enumerate(df['Timestamp']):\n",
58 | " # try:\n",
59 | " # new_item = pd.to_datetime(item, format='%d/%m/%Y %H:%M:%S')\n",
60 | " # except ValueError:\n",
61 | " # print('ERROR at index {}: {}'.format(i, item))\n",
62 | "\n",
63 | " df['Timestamp'] = df['Timestamp'].apply(lambda x: x + pd.DateOffset(hours=12) if x.hour < 7 else x)\n",
64 | "\n",
65 | " # Convert to UTC from New Brunswick winter timezone (UTC-4)\n",
66 | " df['Timestamp'] = df['Timestamp'] + pd.DateOffset(hours=4)\n",
67 | "\n",
68 | " for column in df.columns:\n",
69 | " if column not in ['Flow ID' , 'Timestamp', 'Src IP', 'Dst IP', 'Label']:\n",
70 | " df[column] = pd.to_numeric(df[column])\n",
71 | "\n",
72 | " # Add attempted category column and initialise to -1\n",
73 | " df[\"Attempted Category\"] = -1\n",
74 | "\n",
75 | " # CICIDS 2018 author-released version comes prelabelled. This makes sure previous labels don't interfere\n",
76 | " df[\"Label\"] = \"NeedManualLabel\"\n",
77 | "\n",
78 | " print(\"labels after pre-processing:\", df[\"Label\"].value_counts())\n",
79 | "\n",
80 | " return df, header_rows\n",
81 | "\n",
82 | "def read_csvs_from_path_and_reformat(path):\n",
83 | " df = pd.read_csv(path + \"/merged.csv\")\n",
84 | "\n",
85 | " df, header_rows = format_csv_for_labelling(df)\n",
86 | "\n",
87 | " return df, header_rows\n",
88 | "\n",
89 | "# Important note: do not use also_flip_flow_direction together with additional_filters that reference a \"Fwd\" or \"Bwd\"\n",
90 | "# column, because the same additional_filters are applied unchanged to the flipped flow direction.\n",
91 | "def label_flows(df, label, attack_start_time_nanoseconds, attack_end_time_nanoseconds, src_ip_list=None,\n",
92 | " dst_ip_list= None, src_port_list=None, dst_port_list=None, attempted_category = -1, additional_filters=[],\n",
93 | " also_flip_flow_direction=False, payload_filter=False):\n",
94 | " # Note that labelling happens inplace on the 'df' parameter, and so this function doesn't return anything\n",
95 | "\n",
96 | " # Create initial mask with all values set to True. Squeeze is necessary to remove second axis (with value 1)\n",
97 | " # The reason is that a df of shape (X,) gets converted to (1,X) if you '&' it with a df of shape (X,1)\n",
98 | " custom_mask = pd.DataFrame(True, index=df.index, columns=[df.columns[0]]).squeeze()\n",
99 | "\n",
100 | " # Need to round the start time down to the nearest second to prevent edge-case issues with flows being mislabelled as benign\n",
101 | " attack_start_datetime = pd.to_datetime(attack_start_time_nanoseconds, unit='ns').floor(freq='S')\n",
102 | " attack_end_datetime = pd.to_datetime(attack_end_time_nanoseconds, unit='ns')\n",
103 | "\n",
104 | " custom_mask &= (df[\"Timestamp\"] >= attack_start_datetime)\n",
105 | " custom_mask &= (df[\"Timestamp\"] <= attack_end_datetime)\n",
106 | "\n",
107 | " if src_ip_list is not None:\n",
108 | " custom_mask &= (df[\"Src IP\"].isin(src_ip_list))\n",
109 | " if dst_ip_list is not None:\n",
110 | " custom_mask &= (df[\"Dst IP\"].isin(dst_ip_list))\n",
111 | "\n",
112 | " if src_port_list is not None:\n",
113 | " custom_mask &= (df[\"Src Port\"].isin(src_port_list))\n",
114 | " if dst_port_list is not None:\n",
115 | " custom_mask &= (df[\"Dst Port\"].isin(dst_port_list))\n",
116 | "\n",
117 | " # IMPORTANT NOTE: If you decide to add TotLen Fwd Pkt == 6 for catching RST packets, you still have to manually alter some additional_filters for flipped flows where\n",
118 | " # you couldn't use payload_filter boolean function input value\n",
119 | " if payload_filter:\n",
120 | " custom_mask &= (df[\"TotLen Fwd Pkts\"] == 0)\n",
121 | "\n",
122 | " for filter in additional_filters:\n",
123 | " custom_mask &= filter\n",
124 | "\n",
125 | " df[\"Label\"].mask(custom_mask, label, inplace=True)\n",
126 | " df[\"Attempted Category\"].mask(custom_mask, attempted_category, inplace=True)\n",
127 | "\n",
128 | " if also_flip_flow_direction:\n",
129 | "\n",
130 | " custom_mask = pd.DataFrame(True, index=df.index, columns=[df.columns[0]]).squeeze()\n",
131 | "\n",
132 | " custom_mask &= (df[\"Timestamp\"] >= attack_start_datetime)\n",
133 | " custom_mask &= (df[\"Timestamp\"] <= attack_end_datetime)\n",
134 | "\n",
135 | " if src_ip_list is not None:\n",
136 | " custom_mask &= (df[\"Dst IP\"].isin(src_ip_list))\n",
137 | " if dst_ip_list is not None:\n",
138 | " custom_mask &= (df[\"Src IP\"].isin(dst_ip_list))\n",
139 | "\n",
140 | " if src_port_list is not None:\n",
141 | " custom_mask &= (df[\"Dst Port\"].isin(src_port_list))\n",
142 | " if dst_port_list is not None:\n",
143 | " custom_mask &= (df[\"Src Port\"].isin(dst_port_list))\n",
144 | "\n",
145 | " if payload_filter:\n",
146 | " custom_mask &= (df[\"TotLen Bwd Pkts\"] == 0)\n",
147 | "\n",
148 | " for filter in additional_filters:\n",
149 | " custom_mask &= filter\n",
150 | "\n",
151 | " df[\"Label\"].mask(custom_mask, label, inplace=True)\n",
152 | " df[\"Attempted Category\"].mask(custom_mask, attempted_category, inplace=True)\n",
153 | "\n",
154 | "\n",
155 | "\n",
156 | "def label_rest_as_benign_and_write_csv(df, header_rows, file_to_write):\n",
157 | " df[\"Label\"].mask(df[\"Label\"] == \"NeedManualLabel\", \"BENIGN\", inplace=True)\n",
158 | "\n",
159 | " # Relabel artefact flows with [Flow ID] = '8.0.6.4-8.6.0.1-0-0-0' as BENIGN\n",
160 | " df[\"Label\"].mask(df[\"Flow ID\"] == '8.0.6.4-8.6.0.1-0-0-0', \"BENIGN\", inplace=True)\n",
161 | "\n",
162 | " print(\"label count after labelling:\\r\\n\", df[\"Label\"].value_counts())\n",
163 | " print(\"Attempted Category count after labelling:\\r\\n\", df[\"Attempted Category\"].value_counts())\n",
164 | "\n",
165 | " full_df = pd.concat([df, header_rows], sort=False).sort_index()\n",
166 | "\n",
167 | " if print_index:\n",
168 | " full_df.reset_index(inplace=True, drop=True)\n",
169 | " full_df.index += 1\n",
170 | " full_df.index.name = 'id'\n",
171 | " full_df.to_csv(file_to_write)\n",
172 | " else:\n",
173 | " full_df.to_csv(file_to_write, index=False)"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 3,
179 | "metadata": {},
180 | "outputs": [
181 | {
182 | "name": "stdout",
183 | "output_type": "stream",
184 | "text": [
185 | "labels before pre-processing: Benign 6702133\n",
186 | "FTP-BruteForce 193360\n",
187 | "SSH-Bruteforce 187589\n",
188 | "Name: Label, dtype: int64\n",
189 | "labels after pre-processing: NeedManualLabel 7083082\n",
190 | "Name: Label, dtype: int64\n",
191 | "label count after labelling:\r\n",
192 | " BENIGN 6701304\n",
193 | "FTP-BruteForce - Attempted 193360\n",
194 | "SSH-BruteForce - Attempted 94211\n",
195 | "SSH-BruteForce 94207\n",
196 | "Name: Label, dtype: int64\n",
197 | "Attempted Category count after labelling:\r\n",
198 | " -1 6795511\n",
199 | " 1 193360\n",
200 | " 0 94211\n",
201 | "Name: Attempted Category, dtype: int64\n"
202 | ]
203 | }
204 | ],
205 | "source": [
206 | "#----------------------+\n",
207 | "# WEDNESDAY 14-02-2018 |\n",
208 | "#----------------------+\n",
209 | "\n",
210 | "dir_name = \"Wednesday-14-02-2018\"\n",
211 | "wednesday_14022018_df, wednesday_14022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n",
212 | "\n",
213 | "#-- FTP-BruteForce\n",
214 | "label_flows(wednesday_14022018_df, \"FTP-BruteForce - Attempted\", 1518618806*(10**9),\n",
215 | "            1518624631*(10**9), [\"18.221.219.4\"], [\"172.31.69.25\"], attempted_category=1, also_flip_flow_direction=True)\n",
216 | "\n",
217 | "# FTP-BruteForce - Attempted (tool accidentally got launched in FTP bruteforce mode instead of SSH bruteforce mode)\n",
218 | "# Note that, in order to avoid float imprecision at the micro- and nanosecond level, UNIX timestamps such as\n",
219 | "# 1518631281.199541000 (seconds) must be passed as integer nanoseconds (int64), like every other label_flows call.\n",
220 | "# The window below covers the whole second 1518631281, so the flow matches at second or sub-second timestamp resolution.\n",
221 | "label_flows(wednesday_14022018_df, \"FTP-BruteForce - Attempted\", 1518631281*(10**9),\n",
222 | "            1518631282*(10**9) - 1, [\"13.58.98.64\"], [\"172.31.69.25\"], dst_port_list=[21], attempted_category=4)\n",
223 | "\n",
224 | "#-- SSH-BruteForce\n",
225 | "label_flows(wednesday_14022018_df, \"SSH-BruteForce\", 1518631310*(10**9),\n",
226 | "            1518636750*(10**9), [\"13.58.98.64\"], [\"172.31.69.25\"], dst_port_list=[22], also_flip_flow_direction=True)\n",
227 | "# Payload filter\n",
228 | "label_flows(wednesday_14022018_df, \"SSH-BruteForce - Attempted\", 1518631310*(10**9),\n",
229 | "            1518636750*(10**9), [\"13.58.98.64\"], [\"172.31.69.25\"], dst_port_list=[22],\n",
230 | "            attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
231 | "\n",
232 | "label_rest_as_benign_and_write_csv(wednesday_14022018_df, wednesday_14022018_df_header_rows, DATASET_PATH + dir_name + \".csv\")\n",
233 | "\n",
234 | "wednesday_14022018_df = None"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 4,
240 | "metadata": {},
241 | "outputs": [
242 | {
243 | "name": "stdout",
244 | "output_type": "stream",
245 | "text": [
246 | "labels before pre-processing: Benign 6565262\n",
247 | "DoS attacks-GoldenEye 41508\n",
248 | "DoS attacks-Slowloris 10990\n",
249 | "Name: Label, dtype: int64\n",
250 | "labels after pre-processing: NeedManualLabel 6617760\n",
251 | "Name: Label, dtype: int64\n",
252 | "label count after labelling:\r\n",
253 | " BENIGN 6564757\n",
254 | "DoS GoldenEye 27719\n",
255 | "DoS GoldenEye - Attempted 13789\n",
256 | "DoS Slowloris 8585\n",
257 | "DoS Slowloris - Attempted 2910\n",
258 | "Name: Label, dtype: int64\n",
259 | "Attempted Category count after labelling:\r\n",
260 | " -1 6601061\n",
261 | " 0 16638\n",
262 | " 6 53\n",
263 | " 4 8\n",
264 | "Name: Attempted Category, dtype: int64\n"
265 | ]
266 | }
267 | ],
268 | "source": [
269 | "#---------------------+\n",
270 | "# THURSDAY 15-02-2018 |\n",
271 | "#---------------------+\n",
272 | "\n",
273 | "dir_name=\"Thursday-15-02-2018\"\n",
274 | "thursday_15022018_df, thursday_15022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n",
275 | "\n",
276 | "#-- DoS GoldenEye\n",
277 | "label_flows(thursday_15022018_df, \"DoS GoldenEye\", 1518701262*(10**9), 1518703905*(10**9), [\"18.219.211.138\"],\n",
278 | " [\"172.31.69.25\"], also_flip_flow_direction=True)\n",
279 | "\n",
280 | "# Payload filter\n",
281 | "label_flows(thursday_15022018_df, \"DoS GoldenEye - Attempted\", 1518701262*(10**9), 1518703905*(10**9), [\"18.219.211.138\"],\n",
282 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
283 | "\n",
284 | "# Target system unresponsive\n",
285 | "label_flows(thursday_15022018_df, \"DoS GoldenEye - Attempted\", 1518701262*(10**9), 1518703905*(10**9), [\"18.219.211.138\"],\n",
286 | " [\"172.31.69.25\"], attempted_category=6, additional_filters=[\n",
287 | " (thursday_15022018_df[\"TotLen Bwd Pkts\"] == 0) & (thursday_15022018_df[\"TotLen Fwd Pkts\"] > 0) &\n",
288 | " (thursday_15022018_df[\"Tot Fwd Pkts\"] > 2) & (thursday_15022018_df[\"Flow Duration\"] > 100000000)\n",
289 | " ])\n",
290 | "\n",
291 | "#-- DoS Slowloris\n",
292 | "label_flows(thursday_15022018_df, \"DoS Slowloris\", 1518706812*(10**9), 1518709321*(10**9), [\"18.217.165.70\"],\n",
293 | " [\"172.31.69.25\"], also_flip_flow_direction=True)\n",
294 | "\n",
295 | "# Payload filter\n",
296 | "label_flows(thursday_15022018_df, \"DoS Slowloris - Attempted\", 1518706812*(10**9), 1518709321*(10**9), [\"18.217.165.70\"],\n",
297 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
298 | "\n",
299 | "# Attack artefact (exclusively for the original version)\n",
300 | "label_flows(thursday_15022018_df, \"DoS Slowloris - Attempted\", 1518706812*(10**9), 1518709321*(10**9), [\"172.31.69.25\"], [\"18.217.165.70\"],\n",
301 | " attempted_category=4, additional_filters=[\n",
302 | " (thursday_15022018_df[\"Tot Fwd Pkts\"] == 1) & (thursday_15022018_df[\"Tot Bwd Pkts\"] == 2) & (thursday_15022018_df[\"TotLen Fwd Pkts\"] == 0) &\n",
303 | " (thursday_15022018_df[\"TotLen Bwd Pkts\"] == 238)\n",
304 | " ])\n",
305 | "\n",
306 | "label_rest_as_benign_and_write_csv(thursday_15022018_df, thursday_15022018_df_header_rows, DATASET_PATH + dir_name + \".csv\")\n",
307 | "\n",
308 | "thursday_15022018_df = None"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": 5,
314 | "metadata": {},
315 | "outputs": [
316 | {
317 | "name": "stderr",
318 | "output_type": "stream",
319 | "text": [
320 | "/tmp/ipykernel_65171/2882869393.py:6: DtypeWarning: Columns (2,4,5,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82) have mixed types.Specify dtype option on import or set low_memory=False.\n",
321 | " friday_16022018_df, friday_16022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n"
322 | ]
323 | },
324 | {
325 | "name": "stdout",
326 | "output_type": "stream",
327 | "text": [
328 | "labels before pre-processing: Benign 7413958\n",
329 | "DoS attacks-Hulk 923824\n",
330 | "DoS attacks-SlowHTTPTest 182868\n",
331 | "Label 1\n",
332 | "Name: Label, dtype: int64\n",
333 | "labels after pre-processing: NeedManualLabel 8520650\n",
334 | "Name: Label, dtype: int64\n",
335 | "label count after labelling:\r\n",
336 | " BENIGN 6521192\n",
337 | "DoS Hulk 935504\n",
338 | "DoS Hulk - Attempted 881086\n",
339 | "FTP-BruteForce - Attempted 182868\n",
340 | "Name: Label, dtype: int64\n",
341 | "Attempted Category count after labelling:\r\n",
342 | " -1 7456696\n",
343 | " 0 881086\n",
344 | " 1 182868\n",
345 | "Name: Attempted Category, dtype: int64\n"
346 | ]
347 | }
348 | ],
349 | "source": [
350 | "#-------------------+\n",
351 | "# FRIDAY 16-02-2018 |\n",
352 | "#-------------------+\n",
353 | "\n",
354 | "dir_name=\"Friday-16-02-2018\"\n",
355 | "friday_16022018_df, friday_16022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n",
356 | "\n",
357 | "#-- FTP-BruteForce - Attempted\n",
358 | "label_flows(friday_16022018_df, \"FTP-BruteForce - Attempted\", 1518790334*(10**9), 1518793513*(10**9), [\"13.59.126.31\"],\n",
359 | " [\"172.31.69.25\"], attempted_category=1, also_flip_flow_direction=True)\n",
360 | "\n",
361 | "#-- DoS Hulk\n",
362 | "label_flows(friday_16022018_df, \"DoS Hulk\", 1518803127*(10**9), 1518803903*(10**9), [\"18.219.193.20\"], [\"172.31.69.25\"],\n",
363 | " also_flip_flow_direction=True)\n",
364 | "\n",
365 | "label_flows(friday_16022018_df, \"DoS Hulk - Attempted\", 1518803127*(10**9), 1518803903*(10**9), [\"18.219.193.20\"],\n",
366 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
367 | "\n",
368 | "#-- DoS Slowhttptest: No actual DoS Slowhttptest flows are present on this day in this dataset!\n",
369 | "# Instead we only find failed FTP-BruteForce traffic, which is exactly what is covered earlier in this cell\n",
370 | "\n",
371 | "label_rest_as_benign_and_write_csv(friday_16022018_df, friday_16022018_df_header_rows, DATASET_PATH + dir_name + \".csv\")\n",
372 | "\n",
373 | "friday_16022018_df = None"
374 | ]
375 | },
376 | {
377 | "cell_type": "code",
378 | "execution_count": 1,
379 | "metadata": {},
380 | "outputs": [
381 | {
382 | "ename": "NameError",
383 | "evalue": "name 'read_csvs_from_path_and_reformat' is not defined",
384 | "output_type": "error",
385 | "traceback": [
386 | "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
387 | "\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)",
388 | "Input \u001B[0;32mIn [1]\u001B[0m, in \u001B[0;36m\u001B[0;34m()\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[38;5;66;03m#--------------------+\u001B[39;00m\n\u001B[1;32m 2\u001B[0m \u001B[38;5;66;03m# TUESDAY 20-02-2018 |\u001B[39;00m\n\u001B[1;32m 3\u001B[0m \u001B[38;5;66;03m#--------------------+\u001B[39;00m\n\u001B[1;32m 5\u001B[0m dir_name\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mTuesday-20-02-2018\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m----> 6\u001B[0m tuesday_20022018_df, tuesday_20022018_df_header_rows \u001B[38;5;241m=\u001B[39m \u001B[43mread_csvs_from_path_and_reformat\u001B[49m(DATASET_PATH \u001B[38;5;241m+\u001B[39m dir_name)\n\u001B[1;32m 8\u001B[0m \u001B[38;5;66;03m#-- DDoS LOIC HTTP\u001B[39;00m\n\u001B[1;32m 9\u001B[0m label_flows(tuesday_20022018_df, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mDDoS-LOIC-HTTP\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;241m1519136034\u001B[39m\u001B[38;5;241m*\u001B[39m(\u001B[38;5;241m10\u001B[39m\u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39m\u001B[38;5;241m9\u001B[39m), \u001B[38;5;241m1519139809\u001B[39m\u001B[38;5;241m*\u001B[39m(\u001B[38;5;241m10\u001B[39m\u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39m\u001B[38;5;241m9\u001B[39m),\n\u001B[1;32m 10\u001B[0m [\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m18.218.115.60\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m18.219.9.1\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m18.219.32.43\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m18.218.55.126\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m52.14.136.135\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[1;32m 11\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m18.219.5.43\u001B[39m\u001B[38;5;124m\"\u001B[39m, 
\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m18.216.200.189\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m18.218.229.235\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m18.218.11.51\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m18.216.24.42\u001B[39m\u001B[38;5;124m\"\u001B[39m],\n\u001B[1;32m 12\u001B[0m [\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m172.31.69.25\u001B[39m\u001B[38;5;124m\"\u001B[39m], additional_filters\u001B[38;5;241m=\u001B[39m[\n\u001B[1;32m 13\u001B[0m tuesday_20022018_df[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mProtocol\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;241m==\u001B[39m \u001B[38;5;241m6\u001B[39m\n\u001B[1;32m 14\u001B[0m ])\n",
389 | "\u001B[0;31mNameError\u001B[0m: name 'read_csvs_from_path_and_reformat' is not defined"
390 | ]
391 | }
392 | ],
393 | "source": [
394 | "#--------------------+\n",
395 | "# TUESDAY 20-02-2018 |\n",
396 | "#--------------------+\n",
397 | "\n",
398 | "dir_name=\"Tuesday-20-02-2018\"\n",
399 | "tuesday_20022018_df, tuesday_20022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n",
400 | "\n",
401 | "#-- DDoS LOIC HTTP\n",
402 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-HTTP\", 1519136034*(10**9), 1519139809*(10**9),\n",
403 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
404 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
405 | " [\"172.31.69.25\"], additional_filters=[\n",
406 | " tuesday_20022018_df[\"Protocol\"] == 6\n",
407 | " ])\n",
408 | "\n",
409 | "# Payload filter\n",
410 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-HTTP - Attempted\", 1519136034*(10**9), 1519139809*(10**9),\n",
411 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
412 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
413 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True,\n",
414 | " additional_filters=[tuesday_20022018_df[\"Protocol\"] == 6])\n",
415 | "\n",
416 | "#-- DDoS LOIC UDP\n",
417 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-UDP\", 1519146857*(10**9), 1519147756*(10**9),\n",
418 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
419 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
420 | " [\"172.31.69.25\"], additional_filters=[\n",
421 | " tuesday_20022018_df[\"Protocol\"] == 17])\n",
422 | "\n",
423 | "# Payload filter\n",
424 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-UDP - Attempted\", 1519146857*(10**9), 1519147756*(10**9),\n",
425 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
426 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
427 | " [\"172.31.69.25\"], attempted_category=0, payload_filter=True,\n",
428 | " additional_filters=[tuesday_20022018_df[\"Protocol\"] == 17])\n",
429 | "\n",
430 | "# Attempted - Target unresponsive (the ICMP destination unreachable answers to the attack - using protocol == 17 (UDP) because original CICFlowMeter does not recognise ICMP)\n",
431 | "label_flows(tuesday_20022018_df, \"DDoS-LOIC-UDP - Attempted\", 1519146857*(10**9), 1519147756*(10**9),\n",
432 | " [\"172.31.69.25\"], [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
433 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
434 | " attempted_category=6, additional_filters=[tuesday_20022018_df[\"Protocol\"] == 17])\n",
435 | "\n",
436 | "\n",
437 | "label_rest_as_benign_and_write_csv(tuesday_20022018_df, tuesday_20022018_df_header_rows, DATASET_PATH + dir_name + \".csv\")\n",
438 | "\n",
439 | "tuesday_20022018_df = None"
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": 7,
445 | "metadata": {},
446 | "outputs": [
447 | {
448 | "name": "stdout",
449 | "output_type": "stream",
450 | "text": [
451 | "labels before pre-processing: Benign 8355458\n",
452 | "DDOS attack-HOIC 1246034\n",
453 | "DDOS attack-LOIC-UDP 1730\n",
454 | "Name: Label, dtype: int64\n",
455 | "labels after pre-processing: NeedManualLabel 9603222\n",
456 | "Name: Label, dtype: int64\n",
457 | "label count after labelling:\r\n",
458 | " BENIGN 7435307\n",
459 | "DDoS-HOIC - Attempted 1082294\n",
460 | "DDoS-HOIC 1082293\n",
461 | "DDoS-LOIC-UDP 1730\n",
462 | "DDoS-LOIC-UDP - Attempted 1598\n",
463 | "Name: Label, dtype: int64\n",
464 | "Attempted Category count after labelling:\r\n",
465 | " -1 8519330\n",
466 | " 0 1082294\n",
467 | " 6 1598\n",
468 | "Name: Attempted Category, dtype: int64\n"
469 | ]
470 | }
471 | ],
472 | "source": [
473 | "#----------------------+\n",
474 | "# WEDNESDAY 21-02-2018 |\n",
475 | "#----------------------+\n",
476 | "\n",
477 | "dir_name = \"Wednesday-21-02-2018\"\n",
478 | "wednesday_21022018_df, wednesday_21022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n",
479 | "\n",
480 | "#-- DDoS LOIC UDP\n",
481 | "label_flows(wednesday_21022018_df, \"DDoS-LOIC-UDP\", 1519222131*(10**9), 1519224219*(10**9),\n",
482 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
483 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
484 | " [\"172.31.69.28\"], additional_filters=[\n",
485 | " wednesday_21022018_df[\"Protocol\"] == 17\n",
486 | " ])\n",
487 | "\n",
488 | "# Payload filter\n",
489 | "label_flows(wednesday_21022018_df, \"DDoS-LOIC-UDP - Attempted\", 1519222131*(10**9), 1519224219*(10**9),\n",
490 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
491 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
492 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True,\n",
493 | " additional_filters=[wednesday_21022018_df[\"Protocol\"] == 17])\n",
494 | "\n",
495 | "# Attempted - Target unresponsive (the ICMP destination unreachable answers to the attack - using protocol == 17 (UDP) because original CICFlowMeter does not recognise ICMP)\n",
496 | "label_flows(wednesday_21022018_df, \"DDoS-LOIC-UDP - Attempted\", 1519222131*(10**9), 1519224219*(10**9),\n",
497 | " [\"172.31.69.28\"], [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
498 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
499 | " attempted_category=6,\n",
500 | " additional_filters=[wednesday_21022018_df[\"Protocol\"] == 17])\n",
501 | "\n",
502 | "#-- DDoS HOIC\n",
503 | "label_flows(wednesday_21022018_df, \"DDoS-HOIC\", 1519236668*(10**9), 1519239954*(10**9),\n",
504 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
505 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
506 | " [\"172.31.69.28\"], also_flip_flow_direction=True, additional_filters=[\n",
507 | " wednesday_21022018_df[\"Protocol\"] == 6\n",
508 | " ])\n",
509 | "\n",
510 | "# Payload filter\n",
511 | "label_flows(wednesday_21022018_df, \"DDoS-HOIC - Attempted\", 1519236668*(10**9), 1519239954*(10**9),\n",
512 | " [\"18.218.115.60\", \"18.219.9.1\", \"18.219.32.43\", \"18.218.55.126\", \"52.14.136.135\",\n",
513 | " \"18.219.5.43\", \"18.216.200.189\", \"18.218.229.235\", \"18.218.11.51\", \"18.216.24.42\"],\n",
514 | " [\"172.31.69.28\"], payload_filter=True, also_flip_flow_direction=True,\n",
515 | " attempted_category=0, additional_filters=[wednesday_21022018_df[\"Protocol\"] == 6])\n",
516 | "\n",
517 | "\n",
518 | "label_rest_as_benign_and_write_csv(wednesday_21022018_df, wednesday_21022018_df_header_rows, DATASET_PATH + dir_name + \".csv\")\n",
519 | "\n",
520 | "wednesday_21022018_df = None"
521 | ]
522 | },
523 | {
524 | "cell_type": "code",
525 | "execution_count": 8,
526 | "metadata": {},
527 | "outputs": [
528 | {
529 | "name": "stdout",
530 | "output_type": "stream",
531 | "text": [
532 | "labels before pre-processing: Benign 8179253\n",
533 | "Brute Force -Web 249\n",
534 | "Brute Force -XSS 79\n",
535 | "SQL Injection 34\n",
536 | "Name: Label, dtype: int64\n",
537 | "labels after pre-processing: NeedManualLabel 8179615\n",
538 | "Name: Label, dtype: int64\n",
539 | "label count after labelling:\r\n",
540 | " BENIGN 8179201\n",
541 | "Web Attack - Brute Force - Attempted 221\n",
542 | "Web Attack - Brute Force 69\n",
543 | "Web Attack - XSS - Attempted 44\n",
544 | "Web Attack - XSS 40\n",
545 | "Web Attack - SQL - Attempted 24\n",
546 | "Web Attack - SQL 16\n",
547 | "Name: Label, dtype: int64\n",
548 | "Attempted Category count after labelling:\r\n",
549 | " -1 8179326\n",
550 | " 0 197\n",
551 | " 5 66\n",
552 | " 2 24\n",
553 | " 3 2\n",
554 | "Name: Attempted Category, dtype: int64\n"
555 | ]
556 | }
557 | ],
558 | "source": [
559 | "#---------------------+\n",
560 | "# THURSDAY 22-02-2018 |\n",
561 | "#---------------------+\n",
562 | "\n",
563 | "dir_name = \"Thursday-22-02-2018\"\n",
564 | "thursday_22022018_df, thursday_22022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n",
565 | "\n",
566 | "#-- Web Attack SQL\n",
567 | "label_flows(thursday_22022018_df, \"Web Attack - SQL\", 1519330590418906000, 1519331276022793000, [\"18.218.115.60\"],\n",
568 | " [\"172.31.69.28\"], also_flip_flow_direction=True, additional_filters=\n",
569 | " [thursday_22022018_df[\"TotLen Fwd Pkts\"] > 0,\n",
570 | " thursday_22022018_df[\"TotLen Bwd Pkts\"] > 0])\n",
571 | "\n",
572 | "# Attack startup artefact\n",
573 | "label_flows(thursday_22022018_df, \"Web Attack - SQL - Attempted\", 1519330470*(10**9), 1519330498*(10**9), [\"18.218.115.60\"],\n",
574 | " [\"172.31.69.28\"], attempted_category=2, also_flip_flow_direction=True)\n",
575 | "\n",
576 | "# Payload filter\n",
577 | "label_flows(thursday_22022018_df, \"Web Attack - SQL - Attempted\", 1519330590418906000, 1519331276022793000, [\"18.218.115.60\"],\n",
578 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
579 | "\n",
580 | "#-- Web Attack XSS\n",
581 | "# Port 63782 is attack setup (navigating to website)\n",
582 | "label_flows(thursday_22022018_df, \"Web Attack - XSS\", 1519321899783923000, 1519324181827037000, [\"18.218.115.60\"],\n",
583 | " [\"172.31.69.28\"], additional_filters=\n",
584 | " [~(thursday_22022018_df[\"Src Port\"].isin([63782, 64144]))])\n",
585 | "\n",
586 | "#Flip\n",
587 | "label_flows(thursday_22022018_df, \"Web Attack - XSS\", 1519321899783923000, 1519324181827037000,\n",
588 | " [\"172.31.69.28\"], [\"18.218.115.60\"], additional_filters=\n",
589 | " [~(thursday_22022018_df[\"Dst Port\"].isin([63782, 64144]))])\n",
590 | "\n",
591 | "# Attempted attack setup\n",
592 | "label_flows(thursday_22022018_df, \"Web Attack - XSS - Attempted\", 1519321899783923000, 1519324181827037000, [\"18.218.115.60\"],\n",
593 | " [\"172.31.69.28\"], attempted_category=2, additional_filters=\n",
594 | " [thursday_22022018_df[\"Src Port\"] == 63782])\n",
595 | "\n",
596 | "label_flows(thursday_22022018_df, \"Web Attack - XSS - Attempted\", 1519321899783923000, 1519324181827037000, [\"18.218.115.60\"],\n",
597 | " [\"172.31.69.28\"], attempted_category=3, additional_filters=\n",
598 | " [thursday_22022018_df[\"Src Port\"] == 64144])\n",
599 | "\n",
600 | "# Payload filter\n",
601 | "label_flows(thursday_22022018_df, \"Web Attack - XSS - Attempted\", 1519321899783923000, 1519324181827037000, [\"18.218.115.60\"],\n",
602 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True, additional_filters=\n",
603 | " [~(thursday_22022018_df[\"Src Port\"].isin([63782, 64144]))])\n",
604 | "\n",
605 | "#Flip\n",
606 | "label_flows(thursday_22022018_df, \"Web Attack - XSS - Attempted\", 1519321899783923000, 1519324181827037000,\n",
607 | " [\"172.31.69.28\"], [\"18.218.115.60\"], attempted_category=0, additional_filters=\n",
608 | " [~(thursday_22022018_df[\"Dst Port\"].isin([63782, 64144])) &\n",
609 | " (thursday_22022018_df[\"TotLen Bwd Pkts\"] == 0)])\n",
610 | "\n",
611 | "#-- Web Attack Brute Force & Attempted\n",
612 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force\", 1519309071336902000, 1519313039858533000, [\"18.218.115.60\"],\n",
613 | " [\"172.31.69.28\"], additional_filters=\n",
614 | " [thursday_22022018_df[\"Tot Fwd Pkts\"] > 20])\n",
615 | "\n",
616 | "#Flip\n",
617 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force\", 1519309071336902000, 1519313039858533000,\n",
618 | " [\"172.31.69.28\"], [\"18.218.115.60\"], additional_filters=\n",
619 | " [thursday_22022018_df[\"Tot Bwd Pkts\"] > 20])\n",
620 | "\n",
621 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force - Attempted\", 1519309071336902000, 1519313039858533000,\n",
622 | " [\"18.218.115.60\"], [\"172.31.69.28\"], attempted_category=5, additional_filters=\n",
623 | " [(thursday_22022018_df[\"Tot Fwd Pkts\"] <= 20) & (thursday_22022018_df[\"TotLen Fwd Pkts\"] > 0)])\n",
624 | "\n",
625 | "#Flip\n",
626 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force - Attempted\", 1519309071336902000, 1519313039858533000,\n",
627 | " [\"172.31.69.28\"], [\"18.218.115.60\"], attempted_category=5, additional_filters=\n",
628 | " [(thursday_22022018_df[\"Tot Bwd Pkts\"] <= 20) & (thursday_22022018_df[\"TotLen Bwd Pkts\"] > 0)])\n",
629 | "\n",
630 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force - Attempted\", 1519308824965705000, 1519308947920399000, [\"18.218.115.60\"],\n",
631 | " [\"172.31.69.28\"], attempted_category=2, also_flip_flow_direction=True)\n",
632 | "\n",
633 | "# Payload filter\n",
634 | "label_flows(thursday_22022018_df, \"Web Attack - Brute Force - Attempted\", 1519309071336902000, 1519313039858533000,\n",
635 | " [\"18.218.115.60\"], [\"172.31.69.28\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
636 | "\n",
637 | "label_rest_as_benign_and_write_csv(thursday_22022018_df, thursday_22022018_df_header_rows, DATASET_PATH + dir_name + \".csv\")\n",
638 | "\n",
639 | "thursday_22022018_df = None\n"
640 | ]
641 | },
642 | {
643 | "cell_type": "code",
644 | "execution_count": 9,
645 | "metadata": {},
646 | "outputs": [
647 | {
648 | "name": "stdout",
649 | "output_type": "stream",
650 | "text": [
651 | "labels before pre-processing: Benign 7927630\n",
652 | "Brute Force -Web 362\n",
653 | "Brute Force -XSS 151\n",
654 | "SQL Injection 53\n",
655 | "Name: Label, dtype: int64\n",
656 | "labels after pre-processing: NeedManualLabel 7928196\n",
657 | "Name: Label, dtype: int64\n",
658 | "label count after labelling:\r\n",
659 | " BENIGN 7927736\n",
660 | "Web Attack - Brute Force - Attempted 184\n",
661 | "Web Attack - XSS - Attempted 75\n",
662 | "Web Attack - XSS 73\n",
663 | "Web Attack - Brute Force 62\n",
664 | "Web Attack - SQL - Attempted 43\n",
665 | "Web Attack - SQL 23\n",
666 | "Name: Label, dtype: int64\n",
667 | "Attempted Category count after labelling:\r\n",
668 | " -1 7927894\n",
669 | " 0 231\n",
670 | " 5 60\n",
671 | " 2 11\n",
672 | "Name: Attempted Category, dtype: int64\n"
673 | ]
674 | }
675 | ],
676 | "source": [
677 | "#-------------------+\n",
678 | "# FRIDAY 23-02-2018 |\n",
679 | "#-------------------+\n",
680 | "\n",
681 | "dir_name = \"Friday-23-02-2018\"\n",
682 | "friday_23022018_df, friday_23022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n",
683 | "\n",
684 | "#-- Web Attack SQL\n",
685 | "label_flows(friday_23022018_df, \"Web Attack - SQL\", 1519412792126122000, 1519413444947957000 , [\"18.218.115.60\"],\n",
686 | " [\"172.31.69.28\"], also_flip_flow_direction=True, additional_filters=\n",
687 | " [friday_23022018_df[\"TotLen Fwd Pkts\"] > 0,\n",
688 | " friday_23022018_df[\"TotLen Bwd Pkts\"] > 0])\n",
689 | "\n",
690 | "# Attack startup artefact\n",
691 | "label_flows(friday_23022018_df, \"Web Attack - SQL - Attempted\", 1519412722*(10**9), 1519412787*(10**9) , [\"18.218.115.60\"],\n",
692 | " [\"172.31.69.28\"], attempted_category=2, also_flip_flow_direction=True)\n",
693 | "\n",
694 | "# Payload filter\n",
695 | "label_flows(friday_23022018_df, \"Web Attack - SQL - Attempted\", 1519412792126122000, 1519413444947957000 , [\"18.218.115.60\"],\n",
696 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
697 | "\n",
698 | "#-- Web Attack XSS\n",
699 | "label_flows(friday_23022018_df, \"Web Attack - XSS\", 1519405264559707000, 1519409428237472000, [\"18.218.115.60\"],\n",
700 | " [\"172.31.69.28\"], additional_filters=\n",
701 | " [~(friday_23022018_df[\"Src Port\"].isin([59173]))])\n",
702 | "\n",
703 | "#Flip\n",
704 | "label_flows(friday_23022018_df, \"Web Attack - XSS\", 1519405264559707000, 1519409428237472000,\n",
705 | " [\"172.31.69.28\"], [\"18.218.115.60\"], additional_filters=\n",
706 | " [~(friday_23022018_df[\"Dst Port\"].isin([59173]))])\n",
707 | "\n",
708 | "label_flows(friday_23022018_df, \"Web Attack - XSS - Attempted\", 1519405264559707000, 1519409428237472000, [\"18.218.115.60\"],\n",
709 | " [\"172.31.69.28\"], attempted_category=2, src_port_list=[59173], also_flip_flow_direction=True)\n",
710 | "\n",
711 | "# Payload filter\n",
712 | "label_flows(friday_23022018_df, \"Web Attack - XSS - Attempted\", 1519405264559707000, 1519409428237472000, [\"18.218.115.60\"],\n",
713 | " [\"172.31.69.28\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
714 | "\n",
715 | "#-- Web Attack Brute Force & Attempted\n",
716 | "label_flows(friday_23022018_df, \"Web Attack - Brute Force\", 1519394670193975000, 1519398186406294000, [\"18.218.115.60\"],\n",
717 | " [\"172.31.69.28\"], additional_filters=\n",
718 | " [friday_23022018_df[\"Tot Fwd Pkts\"] > 20])\n",
719 | "\n",
720 | "#Flip\n",
721 | "label_flows(friday_23022018_df, \"Web Attack - Brute Force\", 1519394670193975000, 1519398186406294000,\n",
722 | " [\"172.31.69.28\"], [\"18.218.115.60\"], additional_filters=\n",
723 | " [friday_23022018_df[\"Tot Bwd Pkts\"] > 20])\n",
724 | "\n",
725 | "label_flows(friday_23022018_df, \"Web Attack - Brute Force - Attempted\", 1519394670193975000, 1519398186406294000,\n",
726 | " [\"18.218.115.60\"], [\"172.31.69.28\"], attempted_category=5, additional_filters=\n",
727 | " [(friday_23022018_df[\"Tot Fwd Pkts\"] <= 20) & (friday_23022018_df[\"TotLen Fwd Pkts\"] > 0)])\n",
728 | "\n",
729 | "#Flip\n",
730 | "label_flows(friday_23022018_df, \"Web Attack - Brute Force - Attempted\", 1519394670193975000, 1519398186406294000,\n",
731 | " [\"172.31.69.28\"], [\"18.218.115.60\"], attempted_category=5, additional_filters=\n",
732 | " [(friday_23022018_df[\"Tot Bwd Pkts\"] <= 20) & (friday_23022018_df[\"TotLen Bwd Pkts\"] > 0)])\n",
733 | "\n",
734 | "# Payload filter:\n",
735 | "label_flows(friday_23022018_df, \"Web Attack - Brute Force - Attempted\", 1519394670193975000, 1519398186406294000,\n",
736 | " [\"18.218.115.60\"], [\"172.31.69.28\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
737 | "\n",
738 | "label_rest_as_benign_and_write_csv(friday_23022018_df, friday_23022018_df_header_rows, DATASET_PATH + dir_name + \".csv\")\n",
739 | "\n",
740 | "friday_23022018_df = None"
741 | ]
742 | },
743 | {
744 | "cell_type": "code",
745 | "execution_count": 10,
746 | "metadata": {},
747 | "outputs": [
748 | {
749 | "name": "stderr",
750 | "output_type": "stream",
751 | "text": [
752 | "/tmp/ipykernel_65171/1487319557.py:6: DtypeWarning: Columns (2,4,5,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82) have mixed types.Specify dtype option on import or set low_memory=False.\n",
753 | " wednesday_28022018_df, wednesday_28022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n"
754 | ]
755 | },
756 | {
757 | "name": "stdout",
758 | "output_type": "stream",
759 | "text": [
760 | "labels before pre-processing: Benign 544200\n",
761 | "Infilteration 68871\n",
762 | "Label 33\n",
763 | "Name: Label, dtype: int64\n",
764 | "labels after pre-processing: NeedManualLabel 613072\n",
765 | "Name: Label, dtype: int64\n",
766 | "label count after labelling:\r\n",
767 | " BENIGN 553425\n",
768 | "Infiltration - NMAP Portscan 59494\n",
769 | "Infiltration - Dropbox Download - Attempted 63\n",
770 | "Infiltration - Dropbox Download 46\n",
771 | "Infiltration - Communication Victim Attacker 44\n",
772 | "Name: Label, dtype: int64\n",
773 | "Attempted Category count after labelling:\r\n",
774 | " -1 613009\n",
775 | " 0 39\n",
776 | " 4 24\n",
777 | "Name: Attempted Category, dtype: int64\n"
778 | ]
779 | }
780 | ],
781 | "source": [
782 | "#----------------------+\n",
783 | "# WEDNESDAY 28-02-2018 |\n",
784 | "#----------------------+\n",
785 | "\n",
786 | "dir_name = \"Wednesday-28-02-2018\"\n",
787 | "wednesday_28022018_df, wednesday_28022018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n",
788 | "\n",
789 | "#-- Infiltration - Dropbox Download (two time windows, victim 172.31.69.24 <-> Dropbox hosts, both flow directions)\n",
790 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download\", 1519828404*(10**9), 1519829172*(10**9),\n",
791 | " [\"172.31.69.24\"],\n",
792 | " [\"162.125.3.1\", \"162.125.3.5\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"],\n",
793 | " also_flip_flow_direction=True)\n",
794 | "\n",
795 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download\", 1519839771*(10**9), 1519839824*(10**9),\n",
796 | " [\"172.31.69.24\"],\n",
797 | " [\"162.125.3.1\", \"162.125.3.5\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"],\n",
798 | " also_flip_flow_direction=True)\n",
799 | "\n",
800 | "# Payload filter: same windows and hosts as above, labelled as attempted with attempted_category=0\n",
801 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download - Attempted\", 1519828404*(10**9), 1519829172*(10**9),\n",
802 | " [\"172.31.69.24\"],\n",
803 | " [\"162.125.3.1\", \"162.125.3.5\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"],\n",
804 | " attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
805 | "\n",
806 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download - Attempted\", 1519839771*(10**9), 1519839824*(10**9),\n",
807 | " [\"172.31.69.24\"],\n",
808 | " [\"162.125.3.1\", \"162.125.3.5\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"],\n",
809 | " attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
810 | "\n",
811 | "# Attempted - Attack artefact: same windows, different remote hosts, attempted_category=4\n",
812 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download - Attempted\", 1519828404*(10**9), 1519829172*(10**9),\n",
813 | " [\"172.31.69.24\"],\n",
814 | " [\"104.16.100.29\", \"104.16.99.29\", \"52.84.128.3\", \"52.85.101.236\", \"52.85.131.81\", \"52.85.95.206\"], attempted_category=4, also_flip_flow_direction=True)\n",
815 | "\n",
816 | "label_flows(wednesday_28022018_df, \"Infiltration - Dropbox Download - Attempted\", 1519839771*(10**9), 1519839824*(10**9),\n",
817 | " [\"172.31.69.24\"],\n",
818 | " [\"104.16.100.29\", \"104.16.99.29\", \"52.84.128.3\", \"52.85.101.236\", \"52.85.131.81\", \"52.85.95.206\"], attempted_category=4, also_flip_flow_direction=True)\n",
819 | "\n",
820 | "#-- Infiltration - Communication Victim Attacker (victim 172.31.69.24 <-> 13.58.225.34, two time windows)\n",
821 | "label_flows(wednesday_28022018_df, \"Infiltration - Communication Victim Attacker\", 1519829140*(10**9),\n",
822 | " 1519834135*(10**9), [\"172.31.69.24\"], [\"13.58.225.34\"], also_flip_flow_direction=True)\n",
823 | "\n",
824 | "label_flows(wednesday_28022018_df, \"Infiltration - Communication Victim Attacker\", 1519839839*(10**9),\n",
825 | " 1519843199*(10**9), [\"172.31.69.24\"], [\"13.58.225.34\"], also_flip_flow_direction=True)\n",
826 | "\n",
827 | "# Payload filter: attempted_category=0 is passed explicitly, as in every other payload-filter call of this notebook\n",
828 | "\n",
829 | "label_flows(wednesday_28022018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519829140*(10**9),\n",
830 | " 1519834135*(10**9), [\"172.31.69.24\"], [\"13.58.225.34\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
831 | "\n",
832 | "label_flows(wednesday_28022018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519839839*(10**9),\n",
833 | " 1519843199*(10**9), [\"172.31.69.24\"], [\"13.58.225.34\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
834 | "\n",
835 | "#-- Infiltration - NMAP Portscan (from 172.31.69.24 towards the listed 172.31.69.x hosts, excluding source port 68)\n",
836 | "label_flows(wednesday_28022018_df, \"Infiltration - NMAP Portscan\", 1519829182*(10**9), 1519843140746247000,\n",
837 | " [\"172.31.69.24\"],\n",
838 | " [\"172.31.69.1\", \"172.31.69.10\", \"172.31.69.11\", \"172.31.69.12\", \"172.31.69.13\", \"172.31.69.14\",\n",
839 | " \"172.31.69.16\", \"172.31.69.17\", \"172.31.69.19\", \"172.31.69.20\", \"172.31.69.23\", \"172.31.69.4\",\n",
840 | " \"172.31.69.5\", \"172.31.69.6\", \"172.31.69.8\", \"172.31.69.9\", \"172.31.69.7\", \"172.31.69.22\",\n",
841 | " \"172.31.69.15\", \"172.31.69.21\", \"172.31.69.18\",], additional_filters=\n",
842 | " [~(wednesday_28022018_df[\"Src Port\"] == 68)])\n",
843 | "\n",
844 | "label_rest_as_benign_and_write_csv(wednesday_28022018_df, wednesday_28022018_df_header_rows, DATASET_PATH + dir_name + \".csv\")\n",
845 | "\n",
846 | "wednesday_28022018_df = None  # drop the reference to this day's DataFrame before the next cell runs"
847 | ]
848 | },
849 | {
850 | "cell_type": "code",
851 | "execution_count": 11,
852 | "metadata": {},
853 | "outputs": [
854 | {
855 | "name": "stderr",
856 | "output_type": "stream",
857 | "text": [
858 | "/tmp/ipykernel_65171/25895083.py:6: DtypeWarning: Columns (2,4,5,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82) have mixed types.Specify dtype option on import or set low_memory=False.\n",
859 | " thursday_01032018_df, thursday_01032018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n"
860 | ]
861 | },
862 | {
863 | "name": "stdout",
864 | "output_type": "stream",
865 | "text": [
866 | "labels before pre-processing: Benign 238037\n",
867 | "Infilteration 93063\n",
868 | "Label 25\n",
869 | "Name: Label, dtype: int64\n",
870 | "labels after pre-processing: NeedManualLabel 331101\n",
871 | "Name: Label, dtype: int64\n",
872 | "label count after labelling:\r\n",
873 | " BENIGN 290058\n",
874 | "Infiltration - NMAP Portscan 40804\n",
875 | "Infiltration - Communication Victim Attacker 162\n",
876 | "Infiltration - Dropbox Download 39\n",
877 | "Infiltration - Dropbox Download - Attempted 37\n",
878 | "Infiltration - Communication Victim Attacker - Attempted 1\n",
879 | "Name: Label, dtype: int64\n",
880 | "Attempted Category count after labelling:\r\n",
881 | " -1 331063\n",
882 | " 4 21\n",
883 | " 0 17\n",
884 | "Name: Attempted Category, dtype: int64\n"
885 | ]
886 | }
887 | ],
888 | "source": [
889 | "#---------------------+\n",
890 | "# THURSDAY 01-03-2018 |\n",
891 | "#---------------------+\n",
892 | "\n",
893 | "dir_name = \"Thursday-01-03-2018\"\n",
894 | "thursday_01032018_df, thursday_01032018_df_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n",
895 | "\n",
896 | "#-- Infiltration - Dropbox Download\n",
897 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download\", 1519912390*(10**9), 1519916360*(10**9),\n",
898 | " [\"172.31.69.13\"], [\"162.125.3.1\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"], also_flip_flow_direction=True)\n",
899 | "\n",
900 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download\", 1519913032*(10**9), 1519918454*(10**9),\n",
901 | " [\"172.31.69.13\"], [\"162.125.3.1\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"], also_flip_flow_direction=True)\n",
902 | "\n",
903 | "# Payload filter\n",
904 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download - Attempted\", 1519912390*(10**9), 1519916360*(10**9),\n",
905 | " [\"172.31.69.13\"],\n",
906 | " [\"162.125.3.1\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
907 | "\n",
908 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download - Attempted\", 1519913032*(10**9), 1519918454*(10**9),\n",
909 | " [\"172.31.69.13\"],\n",
910 | " [\"162.125.3.1\", \"162.125.3.6\", \"162.125.248.1\", \"162.125.18.133\", \"104.16.100.29\"], attempted_category=0, payload_filter=True, also_flip_flow_direction=True)\n",
911 | "\n",
912 | "# Attempted - Attack artefact\n",
913 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download - Attempted\", 1519912390*(10**9), 1519916360*(10**9),\n",
914 | " [\"172.31.69.13\"], [\"104.16.100.29\", \"13.32.168.125\", \"52.85.112.72\"], attempted_category=4, also_flip_flow_direction=True)\n",
915 | "\n",
916 | "label_flows(thursday_01032018_df, \"Infiltration - Dropbox Download - Attempted\", 1519913032*(10**9), 1519918454*(10**9),\n",
917 | " [\"172.31.69.13\"], [\"104.16.100.29\", \"13.32.168.125\", \"52.85.112.72\"], attempted_category=4, also_flip_flow_direction=True)\n",
918 | "\n",
919 | "#-- Infiltration - Communication Victim Attacker\n",
920 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker\", 1519912674*(10**9),\n",
921 | " 1519912745*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"], also_flip_flow_direction=True)\n",
922 | "\n",
923 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker\", 1519913075*(10**9),\n",
924 | " 1519928245*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"], also_flip_flow_direction=True)\n",
925 | "\n",
926 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker\", 1519928295*(10**9),\n",
927 | " 1519933041*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"], also_flip_flow_direction=True)\n",
928 | "\n",
929 | "# Payload filter\n",
930 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519912674*(10**9),\n",
931 | " 1519912745*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"], attempted_category=0,\n",
932 | " payload_filter=True, also_flip_flow_direction=True)\n",
933 | "\n",
934 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519913075*(10**9),\n",
935 | " 1519928245*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"], attempted_category=0,\n",
936 | " payload_filter=True, also_flip_flow_direction=True)\n",
937 | "\n",
938 | "label_flows(thursday_01032018_df, \"Infiltration - Communication Victim Attacker - Attempted\", 1519928295*(10**9),\n",
939 | " 1519933041*(10**9), [\"172.31.69.13\"], [\"13.58.225.34\"], attempted_category=0,\n",
940 | " payload_filter=True, also_flip_flow_direction=True)\n",
941 | "\n",
942 | "#-- Infiltration - NMAP Portscan\n",
943 | "label_flows(thursday_01032018_df, \"Infiltration - NMAP Portscan\", 1519913388*(10**9), 1519933092182726000,\n",
944 | " [\"172.31.69.13\"],\n",
945 | " [\"172.31.69.1\", \"172.31.69.11\", \"172.31.69.12\", \"172.31.69.16\", \"172.31.69.8\", \"172.31.69.9\",\n",
946 | " \"172.31.69.10\", \"172.31.69.14\", \"172.31.69.4\", \"172.31.69.5\", \"172.31.69.6\", \"172.31.69.17\",\n",
947 | " \"172.31.69.20\", \"172.31.69.23\", \"172.31.69.24\", \"172.31.69.19\", \"172.31.69.7\", \"172.31.69.15\",\n",
948 | " \"172.31.69.18\", \"172.31.69.22\", \"172.31.69.21\"], additional_filters=\n",
949 | " [thursday_01032018_df[\"Src Port\"] != 68])\n",
950 | "\n",
951 | "label_rest_as_benign_and_write_csv(thursday_01032018_df, thursday_01032018_df_header_rows, DATASET_PATH + dir_name + \".csv\")\n",
952 | "\n",
953 | "thursday_01032018_df = None"
954 | ]
955 | },
956 | {
957 | "cell_type": "code",
958 | "execution_count": 12,
959 | "metadata": {},
960 | "outputs": [
961 | {
962 | "name": "stdout",
963 | "output_type": "stream",
964 | "text": [
965 | "labels before pre-processing: Benign 7931011\n",
966 | "Bot 286191\n",
967 | "Name: Label, dtype: int64\n",
968 | "labels after pre-processing: NeedManualLabel 8217202\n",
969 | "Name: Label, dtype: int64\n",
970 | "label count after labelling:\r\n",
971 | " BENIGN 7931011\n",
972 | "Botnet Ares - Attempted 143263\n",
973 | "Botnet Ares 142928\n",
974 | "Name: Label, dtype: int64\n",
975 | "Attempted Category count after labelling:\r\n",
976 | " -1 8073939\n",
977 | " 0 143263\n",
978 | "Name: Attempted Category, dtype: int64\n"
979 | ]
980 | }
981 | ],
982 | "source": [
983 | "#-------------------+\n",
984 | "# FRIDAY 02-03-2018 |\n",
985 | "#-------------------+\n",
986 | "\n",
987 | "dir_name = \"Friday-02-03-2018\"\n",
988 | "friday_02032018_df, friday_02032018_header_rows = read_csvs_from_path_and_reformat(DATASET_PATH + dir_name)\n",
989 | "\n",
990 | "#-- Botnet Ares\n",
991 | "label_flows(friday_02032018_df, \"Botnet Ares\", 1520000008*(10**9), 1520020492*(10**9), also_flip_flow_direction=True,\n",
992 | " additional_filters=\n",
993 | " [(friday_02032018_df[\"Src IP\"] == \"18.219.211.138\") | (friday_02032018_df[\"Dst IP\"] == \"18.219.211.138\")])\n",
994 | "\n",
995 | "# Payload filter\n",
996 | "label_flows(friday_02032018_df, \"Botnet Ares - Attempted\", 1520000008*(10**9), 1520020492*(10**9), attempted_category=0, additional_filters=\n",
997 | " [((friday_02032018_df[\"Src IP\"] == \"18.219.211.138\") | (friday_02032018_df[\"Dst IP\"] == \"18.219.211.138\")) &\n",
998 | " (friday_02032018_df[\"TotLen Fwd Pkts\"] == 0) & (friday_02032018_df[\"TotLen Bwd Pkts\"] == 0)])\n",
999 | "\n",
1000 | "label_rest_as_benign_and_write_csv(friday_02032018_df, friday_02032018_header_rows, DATASET_PATH + dir_name + \".csv\")\n",
1001 | "\n",
1002 | "friday_02032018_df = None"
1003 | ]
1004 | },
1005 | {
1006 | "cell_type": "code",
1007 | "execution_count": null,
1008 | "metadata": {},
1009 | "outputs": [],
1010 | "source": []
1011 | }
1012 | ],
1013 | "metadata": {
1014 | "kernelspec": {
1015 | "display_name": "Python 3 (ipykernel)",
1016 | "language": "python",
1017 | "name": "python3"
1018 | },
1019 | "language_info": {
1020 | "codemirror_mode": {
1021 | "name": "ipython",
1022 | "version": 3
1023 | },
1024 | "file_extension": ".py",
1025 | "mimetype": "text/x-python",
1026 | "name": "python",
1027 | "nbconvert_exporter": "python",
1028 | "pygments_lexer": "ipython3",
1029 | "version": "3.8.12"
1030 | }
1031 | },
1032 | "nbformat": 4,
1033 | "nbformat_minor": 1
1034 | }
1035 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Error Prevalence in NIDS Datasets: A Case Study on CIC-IDS-2017 and CSE-CIC-IDS-2018
2 |
3 | This repository contains the code used for our paper (Link to be added when proceedings are published).
4 | The code performs the labelling and benchmarking for the [CIC-IDS-2017](https://www.unb.ca/cic/datasets/ids-2017.html)
5 | and [CSE-CIC-IDS-2018](https://www.unb.ca/cic/datasets/ids-2018.html) datasets
6 | after they have been processed by [our modified version of the CICFlowMeter tool](https://github.com/GintsEngelen/CICFlowMeter).
7 |
8 | Note that all of this is *research code*.
9 |
10 | If you use the code in this repository, please cite our paper:
11 |
12 | @inproceedings{liu2022error,
13 | title={Error Prevalence in NIDS datasets: A Case Study on CIC-IDS-2017 and CSE-CIC-IDS-2018},
14 | author={Liu, Lisa and Engelen, Gints and Lynar, Timothy and Essam, Daryl and Joosen, Wouter},
15 | booktitle={2022 IEEE Conference on Communications and Network Security (CNS)},
16 | pages={254--262},
17 | year={2022},
18 | organization={IEEE}
19 | }
20 |
21 |
22 | Extended documentation for our paper can be found [here](https://intrusion-detection.distrinet-research.be/CNS2022/).
23 |
24 | ## How to use this repository
25 |
26 | First, head over to the website of the dataset (either CIC-IDS-2017 or CSE-CIC-IDS-2018) and download
27 | the raw version of the dataset (PCAP file format).
28 |
29 | Then, navigate to "Original Network Traffic and Log data/Friday-02-03-2018/pcap" and delete the following file: 'capEC2AMAZ-O4EL3NG-172.31.69 - Copy.24' (This file contains traffic from the previous day and thus leads to duplicate flow entries).
30 |
31 | Next, run [pcapfix](https://github.com/Rup0rt/pcapfix) followed by [reordercap](https://www.wireshark.org/docs/man-pages/reordercap.html)
32 | on the PCAP files.
33 |
34 | Then, run [our modified version of the CICFlowMeter tool](https://github.com/GintsEngelen/CICFlowMeter) on the data
35 | obtained in the previous step:
36 |
37 | 1. Start the CICFlowMeter tool
38 | 2. Under the "NetWork" menu option, select "Offline"
39 | 3. Select the directory or directories containing the PCAP files. Note that for CSE-CIC-IDS-2018 you will have to run the
40 | CICFlowMeter tool multiple times, once for each directory (where each directory corresponds to one day)
41 | 4. Keep the default values for the "Flow TimeOut" and "Activity Timeout" parameters (120000000 and 5000000 respectively)
42 |
43 | This will generate the CSV files with the flows extracted from the raw PCAP files.
44 |
45 | For labelling of the CIC-IDS-2017 files, we used the CICIDS2017_labelling_fixed_CICFlowMeter.ipynb notebook. For labelling of the CSE-CIC-IDS-2018 files, we used the CICIDS2018_labelling_fixed_CICFlowMeter.ipynb notebook.
46 |
47 | The two notebooks with "original_version" in their name were used for our experiments where we determined the impact of
48 | just the labelling errors on classifiers. These notebooks should only be used if you wish to reproduce our experimental results
49 | as published in our paper.
50 |
--------------------------------------------------------------------------------
| |