├── .DS_Store
├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   └── vcs.xml
├── Benchmarking_RF.py
├── Class-based_metrics
│   └── .gitignore
├── FeatureImportance
│   └── .gitignore
├── Figures
│   └── .gitignore
├── LICENSE
├── LabelledDataset
│   └── .gitignore
├── MakeDataNumpyFriendly.py
├── NumpyFriendlyData
│   └── .gitignore
├── README.md
├── Scores
│   └── .gitignore
├── UnlabelledDataset
│   └── .gitignore
└── labelling_CSV_flows.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GintsEngelen/WTMC2021-Code/14ee845f0d1c2f5d703d678233e25fb4d051e9d1/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Created by https://www.toptal.com/developers/gitignore/api/python,pycharm+iml
3 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,pycharm+iml
4 |
5 | ### PyCharm+iml ###
6 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
7 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
8 |
9 | # User-specific stuff
10 | .idea/**/workspace.xml
11 | .idea/**/tasks.xml
12 | .idea/**/usage.statistics.xml
13 | .idea/**/dictionaries
14 | .idea/**/shelf
15 |
16 | # Generated files
17 | .idea/**/contentModel.xml
18 |
19 | # Sensitive or high-churn files
20 | .idea/**/dataSources/
21 | .idea/**/dataSources.ids
22 | .idea/**/dataSources.local.xml
23 | .idea/**/sqlDataSources.xml
24 | .idea/**/dynamic.xml
25 | .idea/**/uiDesigner.xml
26 | .idea/**/dbnavigator.xml
27 |
28 | # Gradle
29 | .idea/**/gradle.xml
30 | .idea/**/libraries
31 |
32 | # Gradle and Maven with auto-import
33 | # When using Gradle or Maven with auto-import, you should exclude module files,
34 | # since they will be recreated, and may cause churn. Uncomment if using
35 | # auto-import.
36 | # .idea/artifacts
37 | # .idea/compiler.xml
38 | # .idea/jarRepositories.xml
39 | # .idea/modules.xml
40 | # .idea/*.iml
41 | # .idea/modules
42 | # *.iml
43 | # *.ipr
44 |
45 | # CMake
46 | cmake-build-*/
47 |
48 | # Mongo Explorer plugin
49 | .idea/**/mongoSettings.xml
50 |
51 | # File-based project format
52 | *.iws
53 |
54 | # IntelliJ
55 | out/
56 |
57 | # mpeltonen/sbt-idea plugin
58 | .idea_modules/
59 |
60 | # JIRA plugin
61 | atlassian-ide-plugin.xml
62 |
63 | # Cursive Clojure plugin
64 | .idea/replstate.xml
65 |
66 | # Crashlytics plugin (for Android Studio and IntelliJ)
67 | com_crashlytics_export_strings.xml
68 | crashlytics.properties
69 | crashlytics-build.properties
70 | fabric.properties
71 |
72 | # Editor-based Rest Client
73 | .idea/httpRequests
74 |
75 | # Android studio 3.1+ serialized cache file
76 | .idea/caches/build_file_checksums.ser
77 |
78 | ### PyCharm+iml Patch ###
79 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023
80 |
81 | *.iml
82 | modules.xml
83 | .idea/misc.xml
84 | *.ipr
85 |
86 | ### Python ###
87 | # Byte-compiled / optimized / DLL files
88 | __pycache__/
89 | *.py[cod]
90 | *$py.class
91 |
92 | # C extensions
93 | *.so
94 |
95 | # Distribution / packaging
96 | .Python
97 | build/
98 | develop-eggs/
99 | dist/
100 | downloads/
101 | eggs/
102 | .eggs/
103 | lib/
104 | lib64/
105 | parts/
106 | sdist/
107 | var/
108 | wheels/
109 | pip-wheel-metadata/
110 | share/python-wheels/
111 | *.egg-info/
112 | .installed.cfg
113 | *.egg
114 | MANIFEST
115 |
116 | # PyInstaller
117 | # Usually these files are written by a python script from a template
118 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
119 | *.manifest
120 | *.spec
121 |
122 | # Installer logs
123 | pip-log.txt
124 | pip-delete-this-directory.txt
125 |
126 | # Unit test / coverage reports
127 | htmlcov/
128 | .tox/
129 | .nox/
130 | .coverage
131 | .coverage.*
132 | .cache
133 | nosetests.xml
134 | coverage.xml
135 | *.cover
136 | *.py,cover
137 | .hypothesis/
138 | .pytest_cache/
139 | pytestdebug.log
140 |
141 | # Translations
142 | *.mo
143 | *.pot
144 |
145 | # Django stuff:
146 | *.log
147 | local_settings.py
148 | db.sqlite3
149 | db.sqlite3-journal
150 |
151 | # Flask stuff:
152 | instance/
153 | .webassets-cache
154 |
155 | # Scrapy stuff:
156 | .scrapy
157 |
158 | # Sphinx documentation
159 | docs/_build/
160 | doc/_build/
161 |
162 | # PyBuilder
163 | target/
164 |
165 | # Jupyter Notebook
166 | .ipynb_checkpoints
167 |
168 | # IPython
169 | profile_default/
170 | ipython_config.py
171 |
172 | # pyenv
173 | .python-version
174 |
175 | # pipenv
176 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
177 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
178 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
179 | # install all needed dependencies.
180 | #Pipfile.lock
181 |
182 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
183 | __pypackages__/
184 |
185 | # Celery stuff
186 | celerybeat-schedule
187 | celerybeat.pid
188 |
189 | # SageMath parsed files
190 | *.sage.py
191 |
192 | # Environments
193 | .env
194 | .venv
195 | env/
196 | venv/
197 | ENV/
198 | env.bak/
199 | venv.bak/
200 | pythonenv*
201 |
202 | # Spyder project settings
203 | .spyderproject
204 | .spyproject
205 |
206 | # Rope project settings
207 | .ropeproject
208 |
209 | # mkdocs documentation
210 | /site
211 |
212 | # mypy
213 | .mypy_cache/
214 | .dmypy.json
215 | dmypy.json
216 |
217 | # Pyre type checker
218 | .pyre/
219 |
220 | # pytype static type analyzer
221 | .pytype/
222 |
223 | # profiling data
224 | .prof
225 |
226 | # End of https://www.toptal.com/developers/gitignore/api/python,pycharm+iml
227 |
228 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/Benchmarking_RF.py:
--------------------------------------------------------------------------------
1 | # %%
2 |
3 | # https://stackoverflow.com/questions/48484807/training-a-decision-tree-using-id3-algorithm-by-sklearn
4 | # https://scikit-learn.org/stable/modules/tree.html#tree
5 | # https://medium.com/@mohtedibf/indepth-parameter-tuning-for-decision-tree-6753118a03c3
6 | # https://medium.com/datadriveninvestor/tree-algorithms-id3-c4-5-c5-0-and-cart-413387342164
7 | # https://scikit-learn.org/stable/modules/cross_validation.html
8 | import json
9 |
10 | import matplotlib
11 | from datetime import datetime
12 | import pandas as pd, numpy as np
13 | import math
14 | from sklearn.model_selection import cross_val_predict, KFold, cross_val_score, train_test_split, learning_curve, \
15 | cross_validate
16 | from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, \
17 | make_scorer, recall_score, precision_recall_fscore_support
18 | from sklearn.preprocessing import MinMaxScaler, minmax_scale, scale
19 | import matplotlib.pyplot as plt
20 | from copy import deepcopy
21 | from sklearn.ensemble import RandomForestClassifier
22 | from sklearn.neural_network import MLPClassifier
23 |
24 |
25 | DITCH_DEST_PORT = True # Remove destination port!
26 | MLP = False
27 | RF = True
28 | CR_VAL_TRAIN = False
29 | DATA_VERSION = "no_artefacts_with_payload_filter"
30 | # 3 random states for each dataset iteration: 42, 43, 44
31 | DATA_SPLIT_RANDOM_STATE = 44
32 | RF_RANDOM_STATE = 44
33 |
34 | def gen_id():
35 |     return datetime.utcnow().strftime("%d-%m_%H-%M-%S")
36 |
37 |
38 | def plot_confusion_matrix(y_true, y_pred, classes,
39 | normalize=True,
40 | cmap=plt.cm.Reds,
41 | save=False,
42 | name=None):
43 | # title =
44 |
45 | # Compute confusion matrix
46 | cm2 = confusion_matrix(y_true, y_pred)
47 |
48 | if normalize:
49 | cm = cm2.astype('float') / cm2.sum(axis=1)[:, np.newaxis]
50 | print("Normalized confusion matrix")
51 | else:
52 |         print('Confusion matrix, without normalization'); cm = cm2  # keep cm defined for the plotting code below
53 |
54 | # print(cm)
55 |
56 | fig, ax = plt.subplots(figsize=(9, 9))
57 | im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
58 | # ax.figure.colorbar(im, ax=ax)
59 | # We want to show all ticks...
60 | ax.set(xticks=np.arange(cm.shape[1]),
61 | yticks=np.arange(cm.shape[0]),
62 | # ... and label them with the respective list entries
63 | # xticklabels=classes, yticklabels=classes,
64 | # title=title,
65 | # ylabel='True label',
66 | # xlabel='Predicted label'
67 | )
68 | hfont = {"fontname": "serif"}
69 | fontsize = "x-large"
70 | # ax.set_xlabel('Predicted', fontsize=fontsize,**hfont),
71 | # ax.set_ylabel('True', fontsize=fontsize,**hfont),
72 | ax.set_xticklabels(classes, fontsize=fontsize, **hfont)
73 | ax.set_yticklabels(classes, fontsize=fontsize, **hfont, fontweight='bold')
74 |
75 | # Rotate the tick labels and set their alignment.
76 | plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
77 | rotation_mode="anchor")
78 |
79 | # Loop over data dimensions and create text annotations.
80 | fmt = '.2f' if normalize else 'd'
81 | thresh = cm.max() / 2.
82 | for i in range(cm.shape[0]):
83 | for j in range(cm.shape[1]):
84 | ax.text(j, i, format(cm2[i, j], 'd'), # format(cm[i, j], fmt)
85 | ha="center", va="center",
86 | color="white" if cm[i, j] > thresh else "black")
87 | fig.tight_layout()
88 |
89 | if save:
90 | fig.savefig("Figures/" + name + ".pdf", dpi=400,
91 | bbox_inches='tight', pad_inches=0)
92 |
93 | plt.show()
94 | return ax
95 |
96 |
97 | # https://stackoverflow.com/questions/35249760/using-scikit-to-determine-contributions-of-each-feature-to-a-specific-class-pred
98 | def class_feature_importance(X, Y, feature_importances):
99 | N, M = X.shape
100 | X = scale(X)
101 |
102 | out = {}
103 | for c in set(Y):
104 | out[c] = dict(
105 | zip(range(N), np.mean(X[Y == c, :], axis=0) * feature_importances)
106 | )
107 |
108 | return out
109 |
110 |
111 | def load_data():
112 | print("Loading training data ...")
113 | full_dataset = np.load("NumpyFriendlyData/full_dataset_" + DATA_VERSION + ".npy")
114 |
115 | full_dataset = full_dataset[~np.isnan(full_dataset).any(axis=1)]
116 | full_dataset = full_dataset[~np.isinf(full_dataset).any(axis=1)]
117 |
118 | data_x = full_dataset[:, :-1]
119 | data_y = full_dataset[:, -1]
120 |
121 | print(np.unique(data_y, return_counts=True))
122 |
123 | if DITCH_DEST_PORT:
124 | data_x = data_x[:, 1:] # Dest Port index = 0
125 |
126 | splits = train_test_split(data_x, data_y, test_size=0.25, stratify=data_y,
127 | random_state=DATA_SPLIT_RANDOM_STATE)
128 | return splits
129 |
130 |
131 | if __name__ == "__main__":
132 |
133 | time_id = gen_id()
134 |
135 | (X_train, X_test, Y_train, Y_test) = load_data()
136 | print(X_train.shape, X_test.shape)
137 |
138 | # from sklearn.tree import DecisionTreeClassifier
139 | # print("Decision Tree")
140 | # clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
141 |
142 | if MLP:
143 | #X_train = minmax_scale(X_train)
144 | #X_test = minmax_scale(X_test)
145 |
146 | print("Applying minmax-scaling on train and test set")
147 | scaler = MinMaxScaler()
148 | scaler.fit(X_train)
149 | X_train = scaler.transform(X_train)
150 | X_test = scaler.transform(X_test)
151 |
152 | print("Multilayered Perceptron")
153 | mlp_classifier = MLPClassifier(hidden_layer_sizes=(156,78,39))
154 |
155 | print("wrong script for MLP")
156 | exit(0)
157 |
158 | scoring = {
159 | 'accuracy': make_scorer(accuracy_score),
160 | 'precision': make_scorer(precision_score, average='weighted'),
161 | 'f1_score': make_scorer(f1_score, average='weighted'),
162 | 'recall': make_scorer(recall_score, average='weighted')
163 | }
164 |
165 | print("Random Forest")
166 | rf_classifier = RandomForestClassifier(n_estimators=50, max_depth=20, random_state=RF_RANDOM_STATE)
167 |
168 | if CR_VAL_TRAIN:
169 | print("Cross validating ...")
170 | sc = cross_validate(rf_classifier, X_train, Y_train, cv=5, scoring=scoring)
171 | print("Score:\n", sc)
172 |
173 | # print("Fit time: " % (sc['fit_time']))
174 | # print("Score time: " % (sc['score_time']))
175 |
176 | print("Precision: %0.8f (%0.8f)" % (sc['test_precision'].mean(), sc['test_precision'].std()))
177 | print("Recall: %0.8f (%0.8f)" % (sc['test_recall'].mean(), sc['test_recall'].std()))
178 | print("F1_score: %0.8f (%0.8f)" % (sc['test_f1_score'].mean(), sc['test_f1_score'].std()))
179 | print("Accuracy: %0.8f (%0.8f)" % (sc['test_accuracy'].mean(), sc['test_accuracy'].std()))
180 |
181 | print("Fitting model ...")
182 | rf_classifier.fit(X_train, Y_train)
183 |
184 | Y_pred = rf_classifier.predict(X_test)
185 |
186 | nY_test = []
187 | nY_pred = []
188 | for i in range(len(Y_test)):
189 | nY_test += [Y_test[i]]
190 | nY_pred += [Y_pred[i]]
191 | Y_test = np.array(nY_test)
192 | Y_pred = np.array(nY_pred)
193 | print(Y_test.shape, Y_pred.shape)
194 |
195 | # %%
196 |
197 | classes = ["Benign", "FTP-Patator", "SSH-Patator", "DoS GoldenEye", "DoS Hulk", "DoS Slowhttptest", "DoS slowloris",
198 | "Heartbleed", "Web Attack - Brute Force", "Web Attack - XSS", "Web Attack - Sql Injection", "Infiltration",
199 | "Bot", "PortScan", "DDoS"]
200 |
201 | plot_confusion_matrix(Y_test, Y_pred, classes, save=True, name="RF_" + DATA_VERSION + "_" + time_id )
202 |
203 | prfs = precision_recall_fscore_support(Y_test, Y_pred, average='weighted')
204 | print("Precision, Recall, F-Score, Support:", prfs)
205 |
206 | # %%
207 |
208 | plot_confusion_matrix(Y_test, Y_pred, classes)
209 |
210 | prfs = precision_recall_fscore_support(Y_test, Y_pred, average='weighted')
211 | print("Precision, Recall, F-Score, Support:", prfs)
212 | with open("Scores/RF_" + DATA_VERSION + "_" + time_id + "_metrics_aggregated.txt", 'w') as out_file:
213 | out_file.write("Precision, Recall, F-Score, Support: " + str(prfs))
214 |
215 | Y_pred = list(Y_pred)
216 |
217 | np.save("Class-based_metrics/Y_test_" + DATA_VERSION + "_" + time_id, Y_test)
218 | np.save("Class-based_metrics/Y_pred_" + DATA_VERSION + "_" + time_id, Y_pred)
219 |
220 | class_based_metrics = classification_report(Y_test, Y_pred, target_names=classes, zero_division="warn", digits=4)
221 | print("Class based metrics:\r\n", class_based_metrics)
222 | with open("Scores/RF_" + DATA_VERSION + "_metrics_class_based_" + time_id + ".txt", 'w') as out_file:
223 | out_file.write(class_based_metrics)
224 |
225 | # This next part is to calculate the feature importance for a RF classifier. Comment this out if you're using MLP
226 |
227 | feature_importances = rf_classifier.feature_importances_
228 |
229 | result = class_feature_importance(X_test, Y_pred, feature_importances)
230 |
231 | print(json.dumps(result, indent=4))
232 |
233 | with open("FeatureImportance/feature_importance_full_dataset_" + DATA_VERSION + "_" + time_id + ".json", 'w') as f:
234 | json.dump(result, f)
235 |
236 | '''
237 | features_list = "Dst Port,Protocol,Flow Duration,Total Fwd Packet," \
238 | "Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max," \
239 | "Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max," \
240 | "Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s," \
241 | "Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std," \
242 | "Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min," \
243 | "Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length," \
244 | "Fwd Packets/s,Bwd Packets/s,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std," \
245 | "Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count," \
246 | "URG Flag Count,CWR Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Fwd Segment Size Avg," \
247 | "Bwd Segment Size Avg,Fwd Bytes/Bulk Avg,Fwd Packet/Bulk Avg,Fwd Bulk Rate Avg,Bwd Bytes/Bulk Avg," \
248 | "Bwd Packet/Bulk Avg,Bwd Bulk Rate Avg,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets," \
249 | "Subflow Bwd Bytes,FWD Init Win Bytes,Bwd Init Win Bytes,Fwd Act Data Pkts,Fwd Seg Size Min," \
250 | "Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label".split(',')
251 |
252 | label_dictionary = {
253 | 'BENIGN': '0',
254 | 'FTP-Patator': '1',
255 | 'SSH-Patator': '2',
256 | 'DoS GoldenEye': '3',
257 | 'DoS Hulk': '4',
258 | 'DoS Slowhttptest': '5',
259 | 'DoS slowloris': '6',
260 | 'Heartbleed': '7',
261 | 'Web Attack – Brute Force': '8',
262 | 'Web Attack – XSS': '9',
263 | 'Web Attack – Sql Injection': '10',
264 | 'Infiltration': '11',
265 | 'Bot': '12',
266 | 'PortScan': '13',
267 | 'DDoS': '14'
268 | }
269 | '''
270 |
--------------------------------------------------------------------------------
/Class-based_metrics/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | */
3 | !.gitignore
--------------------------------------------------------------------------------
/FeatureImportance/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | */
3 | !.gitignore
--------------------------------------------------------------------------------
/Figures/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | */
3 | !.gitignore
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 Gints Engelen (gints.engelen@kuleuven.be)
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 |
21 | # When using this code, please cite our paper:
22 | G. Engelen, V. Rimmer, W. Joosen, "Troubleshooting an Intrusion Detection Dataset: the CICIDS2017 Case Study", 2021 IEEE European Symposium on Security and Privacy Workshops (EuroS&PW), 2021.
23 |
24 | Our paper as well as its extended documentation can be found at https://downloads.distrinet-research.be/WTMC2021/
25 |
26 | # Contributors
27 |
28 | For labelling_CSV_flows.py:
29 | Jin Li
30 | Vera Rimmer
31 | Gints Engelen
32 |
33 | For all other code:
34 | Gints Engelen
35 | Vera Rimmer
--------------------------------------------------------------------------------
/LabelledDataset/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | */
3 | !.gitignore
--------------------------------------------------------------------------------
/MakeDataNumpyFriendly.py:
--------------------------------------------------------------------------------
1 | '''
2 | This should be run after all CSV files are fully labelled.
3 | The script does two things:
4 | 1. It removes flow id, source ip, destination ip, source port, and timestamp as features
5 | 2. It converts all labels to numerical labels
6 | '''
7 | import copy
8 | import csv
9 | import pandas as pd
10 | import numpy as np
11 |
12 | dataset_directory = 'LabelledDataset'
13 | saved_numpy_name = 'full_dataset_no_artefacts_with_payload_filter.npy'
14 |
15 |
16 | def importCsvAsDict(path):
17 | print('Importing from ', path)
18 | csvfile = csv.DictReader(open(path), delimiter=',')
19 | return [x for x in csvfile]
20 |
21 |
22 | def convertToNumericalLabels(flows_list_of_dict):
23 | print('Relabelling flows')
24 |
25 | label_dictionary = {
26 | 'BENIGN': '0',
27 | 'FTP-Patator': '1',
28 | 'SSH-Patator': '2',
29 | 'DoS GoldenEye': '3',
30 | 'DoS Hulk': '4',
31 | 'DoS Slowhttptest': '5',
32 | 'DoS slowloris': '6',
33 | 'Heartbleed': '7',
34 | 'Web Attack - Brute Force': '8',
35 | 'Web Attack - XSS': '9',
36 | 'Web Attack - Sql Injection': '10',
37 | 'Infiltration': '11',
38 | 'Bot': '12',
39 | 'PortScan': '13',
40 | 'DDoS': '14',
41 | # IMPORTANT NOTE: For our experiments, we treated all "X - Attempted" flows as BENIGN. If you want to keep the
42 | # "X - Attempted" flows separate, please change the values corresponding to the keys below
43 | 'FTP-Patator - Attempted' : '0',
44 | 'SSH-Patator - Attempted' : '0',
45 | 'DoS GoldenEye - Attempted' : '0',
46 | 'DoS Hulk - Attempted' : '0',
47 | 'DoS Slowhttptest - Attempted' : '0',
48 | 'DoS slowloris - Attempted' : '0',
49 | 'Heartbleed - Attempted' : '0',
50 | 'Web Attack - Brute Force - Attempted' : '0',
51 | 'Web Attack - XSS - Attempted' : '0',
52 | 'Web Attack - Sql Injection - Attempted' : '0',
53 | 'Infiltration - Attempted' : '0',
54 | 'Bot - Attempted' : '0',
55 | # Note that PortScan doesn't have any 'Attempted' flows because it doesn't rely on a payload transfer for its
56 | # effectiveness
57 | 'DDoS - Attempted' : '0'
58 | }
59 |
60 | for (index, row) in enumerate(flows_list_of_dict):
61 | current_label = row['Label']
62 | flows_list_of_dict[index]['Label'] = label_dictionary[current_label]
63 |
64 |
65 | def listOfDictToNumpyArray(list_of_dict):
66 | dataframe = pd.DataFrame(list_of_dict)
67 | numpy_string_array = dataframe.values
68 | # See point 1 in the description at the top of the file
69 | trimmed_values = np.concatenate((numpy_string_array[:, 4:6], numpy_string_array[:, 7:]), axis=1)
70 |     return trimmed_values.astype(float)  # np.float was removed in newer NumPy releases
71 |
72 |
73 | print("monday")
74 | monday_dict = importCsvAsDict(dataset_directory + '/Monday-WorkingHours.pcap_REVI.csv')
75 | convertToNumericalLabels(monday_dict)
76 | monday_numpy_array = listOfDictToNumpyArray(monday_dict)
77 |
78 | print("tuesday")
79 | tuesday_dict = importCsvAsDict(dataset_directory + '/Tuesday-WorkingHours.pcap_REVI.csv')
80 | convertToNumericalLabels(tuesday_dict)
81 | tuesday_numpy_array = listOfDictToNumpyArray(tuesday_dict)
82 |
83 | print("wednesday")
84 | wednesday_dict = importCsvAsDict(dataset_directory + '/Wednesday-WorkingHours.pcap_REVI.csv')
85 | convertToNumericalLabels(wednesday_dict)
86 | wednesday_numpy_array = listOfDictToNumpyArray(wednesday_dict)
87 |
88 | print("thursday")
89 | thursday_dict = importCsvAsDict(dataset_directory + '/Thursday-WorkingHours.pcap_REVI.csv')
90 | convertToNumericalLabels(thursday_dict)
91 | thursday_numpy_array = listOfDictToNumpyArray(thursday_dict)
92 |
93 | print("friday")
94 | friday_dict = importCsvAsDict(dataset_directory + '/Friday-WorkingHours.pcap_REVI.csv')
95 | convertToNumericalLabels(friday_dict)
96 | friday_numpy_array = listOfDictToNumpyArray(friday_dict)
97 |
98 | full_dataset = np.concatenate((monday_numpy_array, tuesday_numpy_array, wednesday_numpy_array, thursday_numpy_array,
99 | friday_numpy_array), axis=0)
100 |
101 | print("saving dataset")
102 | np.save('NumpyFriendlyData/' + saved_numpy_name, full_dataset)
103 |
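104 | # Optional sanity check (a sketch; assumes the np.save call above succeeded): reload
105 | # the saved array and report its shape and per-class label counts. As described at the
106 | # top of this file, the numerical label is stored in the last column.
107 | check = np.load('NumpyFriendlyData/' + saved_numpy_name)
108 | print("dataset shape:", check.shape)
109 | labels, counts = np.unique(check[:, -1], return_counts=True)
110 | print("label counts:", dict(zip(labels.astype(int), counts)))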
--------------------------------------------------------------------------------
/NumpyFriendlyData/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | */
3 | !.gitignore
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Troubleshooting an Intrusion Detection Dataset: the CICIDS2017 Case Study
2 |
3 | This repository contains the code used for our [paper](https://downloads.distrinet-research.be/WTMC2021/Resources/wtmc2021_Engelen_Troubleshooting.pdf).
4 | The code performs the labelling and benchmarking for the [CICIDS 2017 dataset](https://www.unb.ca/cic/datasets/ids-2017.html)
5 | after it has been processed by [our modified version of the CICFlowMeter tool](https://github.com/GintsEngelen/CICFlowMeter).
6 |
7 | Note that all of this is *research code*.
8 |
9 | If you use the code in this repository, please cite our paper:
10 |
11 | @inproceedings{engelen2021troubleshooting,
12 | title={Troubleshooting an Intrusion Detection Dataset: the CICIDS2017 Case Study},
13 | author={Engelen, Gints and Rimmer, Vera and Joosen, Wouter},
14 | booktitle={2021 IEEE Security and Privacy Workshops (SPW)},
15 | pages={7--12},
16 | year={2021},
17 | organization={IEEE}
18 | }
19 |
20 | An extended documentation of our paper can be found [here](https://downloads.distrinet-research.be/WTMC2021/).
21 |
22 | ## How to use this repository
23 |
24 | First, head over to the website of the [CICIDS 2017 dataset](https://www.unb.ca/cic/datasets/ids-2017.html) and download
25 | the raw version of the dataset (PCAP file format). There are 5 files in total, one for each day.
26 |
27 | Then, run our [our modified version of the CICFlowMeter tool](https://github.com/GintsEngelen/CICFlowMeter) on the data
28 | obtained in the previous step:
29 |
30 | 1. Start the CICFlowMeter tool
31 | 2. Under the "NetWork" menu option, select "Offline"
32 | 3. For "Pcap dir", choose the directory containing the 5 PCAP files of the CICIDS 2017 dataset
33 | 4. For "Output dir", choose the "UnlabelledDataset" directory of this WTCM2021-Code project.
34 | 5. Keep the default values for the "Flow TimeOut" and "Activity Timeout" parameters (120000000 and 5000000 respectively)
35 |
36 | This will generate 5 CSV files with the flows extracted from the raw PCAP files.
37 |
38 | After this, verify the `TIME_DIFFERENCE`, `INPUT_DIR`, `OUTPUT_DIR` and `PAYLOAD_FILTER_ACTIVE` attributes in the
39 | `labelling_CSV_flows.py` script, and then run it (no need to specify any command-line options). This will label all the
40 | flows in the CSV files generated by the CICFlowMeter tool.
41 |
42 | Then, run the `MakeDataNumpyFriendly.py` script, which will convert the labelled CSV files into a single numpy array.
43 | Note that, in our experiments, we chose to relabel all "Attempted" flows as BENIGN. If you wish to keep them separate,
44 | make sure to change the numerical labels in the `convertToNumericalLabels(flows_list_of_dict)` function.
45 |
46 | Finally, run the `Benchmarking_RF.py` script to perform benchmarking on the dataset using a Random Forest classifier.
47 | Random seeds and various other options can be specified in the first few lines of the script.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Scores/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | */
3 | !.gitignore
--------------------------------------------------------------------------------
/UnlabelledDataset/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | */
3 | !.gitignore
--------------------------------------------------------------------------------
/labelling_CSV_flows.py:
--------------------------------------------------------------------------------
1 | import csv
2 | from datetime import datetime
3 | from datetime import timedelta
4 |
5 | DAY_STR = [None, "Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]
6 |
7 | PRINT_STEPS = 3000
8 | DATE_FORMAT_INTERNAL = '%d/%m/%Y %I:%M:%S %p'
9 | DATE_FORMAT_DATASET = '%d/%m/%Y %I:%M:%S %p'
10 | # The CICIDS 2017 dataset was generated in New Brunswick, Canada. Running the CICFlowMeter tool on this data automatically
11 | # converts all timestamps in the data from the timezone of New Brunswick, Canada, to the timezone of the host running
12 | # the CICFlowMeter tool. The TIME_DIFFERENCE attribute specifies the time difference between these two timezones.
13 | # Specifically: TIME_DIFFERENCE = {CICFlowMeter host timezone} - {New Brunswick, Canada timezone};
14 | # e.g. a host in CEST (UTC+2) vs. New Brunswick in ADT (UTC-3) gives (+2) - (-3) = timedelta(hours=5).
14 | TIME_DIFFERENCE = timedelta(hours=5)
15 |
16 | INPUT_DIR = 'UnlabelledDataset/'
17 | OUTPUT_DIR = 'LabelledDataset/'
18 |
19 | # Some attack categories rely on transfer of a payload in order to be effective. When a malicious flow belongs to such a
20 | # category but doesn't contain a payload, setting this filter to True will label these flows as "X - Attempted" with "X"
21 | # the original attack class. Setting this filter to False will simply label the flow as part of the attack category.
22 | PAYLOAD_FILTER_ACTIVE = True
23 |
24 |
25 | # DATE_FORMAT_DATASET = '%d/%m/%Y %H:%M'
26 | # TIME_DIFFERENCE = timedelta(hours=0)
27 |
28 |
29 | def merge_label(day):
30 | day_str = DAY_STR[day] # 3-reorganize
31 | with open('G:\\Datasets\\CICIDS2017-PCAPs\\2-labeling\\' + day_str + '-WorkingHours.pcap_REVI.csv') as csv_flow:
32 | spamreader = csv.reader(csv_flow, delimiter=',', quotechar='|')
33 | next(spamreader)
34 | total = 0
35 | with open('G:\\Datasets\\CICIDS2017-PCAPs\\2-labeling\\' + day_str + '-WorkingHours.pcap_SeqInfo.txt',
36 | 'r') as txt_input:
37 | with open('G:\\Datasets\\CICIDS2017-PCAPs\\2-labeling\\' + day_str + '-WorkingHours.pcap_SeqInfoLabel.txt',
38 | 'w') as txt_output:
39 | for row_seq in txt_input:
40 | txt_row = row_seq.split(';')
41 | csv_row = next(spamreader)
42 | assert (txt_row[0] == csv_row[-1]) # same uid
43 |
44 | txt_row.insert(1, csv_row[-1]) # insert label into text file
45 | txt_output.write(';'.join(txt_row))
46 | txt_output.flush()
47 |
48 | total += 1
49 | print(day_str + " merged")
50 |
51 |
52 | def dataset_stat_attack(day, ver='ISCX'):
53 | day_str = DAY_STR[day]
54 | col = -1 # if ver == 'ISCX' else -2
55 | with open(OUTPUT_DIR + day_str + '-WorkingHours.pcap_' + ver + '.csv',
56 | newline='') as csv_flow:
57 | spamreader = csv.reader(csv_flow, delimiter=',', quotechar='|')
58 | next(spamreader)
59 | total = 0
60 | all_attacks = {}
61 | for row in spamreader:
62 | lbl_attack = row[col]
63 | if lbl_attack not in all_attacks:
64 | all_attacks[lbl_attack] = 1
65 | else:
66 | all_attacks[lbl_attack] += 1
67 | total += 1
68 | # if total % PRINT_STEPS == 0:
69 | # print('> ' + str(total))
70 | print(ver + ' Stat ' + day_str + ':')
71 | print(all_attacks)
72 | print('Total: ' + str(total))
73 |
74 |
75 | # row = a row in the CSV file, corresponding to one flow
76 | # attack_class = String name of the attack class
77 | # Returns a string of the attack class if it passes through the filter
78 | # Returns "X - Attempted" with X the attack_class if the flow is a TCP flow and does not contain any data transfer in
79 | # the forward direction.
80 | # Note that if the payload filter is not active, or the underlying protocol is not TCP, it returns the attack class
81 | # by default.
82 | def payload_filter(row, attack_class):
83 | # row[10] = total Length of payload bytes in Fwd direction
84 | # row[5] = Protocol, we only want TCP connections, 6 = TCP
85 | if PAYLOAD_FILTER_ACTIVE and int(row[5]) == 6:
86 | if float(row[10]) > 0.0:
87 | return attack_class
88 | else:
89 | return attack_class + " - Attempted"
90 | else:
91 | return attack_class
92 |
93 |
94 | def monday_benign(_):
95 | return "BENIGN"
96 |
97 |
98 | def tuesday_ftp_patator(row):
99 | t_start = datetime.strptime('04/07/2017 09:17:00 AM', DATE_FORMAT_INTERNAL)
100 | t_end = datetime.strptime('04/07/2017 10:30:00 AM', DATE_FORMAT_INTERNAL)
101 | attacker = '172.16.0.1'
102 | victim = '192.168.10.50'
103 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
104 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
105 | return payload_filter(row, "FTP-Patator")
106 | return None
107 |
108 |
109 | def tuesday_ssh_patator(row):
110 | t_start = datetime.strptime('04/07/2017 01:00:00 PM', DATE_FORMAT_INTERNAL)
111 | t_end = datetime.strptime('04/07/2017 04:11:00 PM', DATE_FORMAT_INTERNAL)
112 | attacker = '172.16.0.1'
113 | victim = '192.168.10.50'
114 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
115 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
116 | return payload_filter(row, "SSH-Patator")
117 | return None
118 |
119 |
120 | def wednesday_dos_slowloris(row):
121 | t_start = datetime.strptime('05/07/2017 02:23:00 AM', DATE_FORMAT_INTERNAL)
122 | t_end = datetime.strptime('05/07/2017 10:12:59 AM', DATE_FORMAT_INTERNAL)
123 | attacker = '172.16.0.1'
124 | victim = '192.168.10.50'
125 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
126 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
127 | return payload_filter(row, "DoS slowloris")
128 | return None
129 |
130 |
131 | def wednesday_dos_slowhttptest(row):
132 | t_start = datetime.strptime('05/07/2017 10:13:00 AM', DATE_FORMAT_INTERNAL)
133 | t_end = datetime.strptime('05/07/2017 10:38:00 AM', DATE_FORMAT_INTERNAL)
134 | attacker = '172.16.0.1'
135 | victim = '192.168.10.50'
136 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
137 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
138 | return payload_filter(row, "DoS Slowhttptest")
139 | return None
140 |
141 |
142 | def wednesday_dos_hulk(row):
143 | t_start = datetime.strptime('05/07/2017 10:39:00 AM', DATE_FORMAT_INTERNAL)
144 | t_end = datetime.strptime('05/07/2017 11:09:00 AM', DATE_FORMAT_INTERNAL)
145 | attacker = '172.16.0.1'
146 | victim = '192.168.10.50'
147 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
148 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
149 | return payload_filter(row, "DoS Hulk")
150 | return None
151 |
152 |
153 | def wednesday_dos_goldeneye(row):
154 | t_start = datetime.strptime('05/07/2017 11:10:00 AM', DATE_FORMAT_INTERNAL)
155 | t_end = datetime.strptime('05/07/2017 11:23:00 AM', DATE_FORMAT_INTERNAL)
156 | attacker = '172.16.0.1'
157 | victim = '192.168.10.50'
158 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
159 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
160 | return payload_filter(row, "DoS GoldenEye")
161 | return None
162 |
163 |
164 | def wednesday_heartbleed(row):
165 | t_start = datetime.strptime('05/07/2017 03:11:00 PM', DATE_FORMAT_INTERNAL)
166 | t_end = datetime.strptime('05/07/2017 03:33:00 PM', DATE_FORMAT_INTERNAL)
167 | attacker = '172.16.0.1'
168 | victim = '192.168.10.51'
169 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
170 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end and row[4] == '444':
171 | return payload_filter(row, "Heartbleed")
172 | return None
173 |
174 |
175 | def thursday_web_attack_brute_force(row):
176 | t_start = datetime.strptime('06/07/2017 09:10:00 AM', DATE_FORMAT_INTERNAL)
177 | t_end = datetime.strptime('06/07/2017 10:12:00 AM', DATE_FORMAT_INTERNAL)
178 | attacker = '172.16.0.1'
179 | victim = '192.168.10.50'
180 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
181 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
182 | return payload_filter(row, "Web Attack - Brute Force")
183 | return None
184 |
185 |
186 | def thursday_web_attack_xss(row):
187 | t_start = datetime.strptime('06/07/2017 10:13:00 AM', DATE_FORMAT_INTERNAL)
188 | t_end = datetime.strptime('06/07/2017 10:37:00 AM', DATE_FORMAT_INTERNAL)
189 | attacker = '172.16.0.1'
190 | victim = '192.168.10.50'
191 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
192 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
193 | return payload_filter(row, "Web Attack - XSS")
194 | return None
195 |
196 |
197 | def thursday_web_attack_sql_injection(row):
198 | t_start = datetime.strptime('06/07/2017 10:39:00 AM', DATE_FORMAT_INTERNAL)
199 | t_end = datetime.strptime('06/07/2017 10:45:00 AM', DATE_FORMAT_INTERNAL)
200 | attacker = '172.16.0.1'
201 | victim = '192.168.10.50'
202 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
203 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
204 | return payload_filter(row, "Web Attack - Sql Injection")
205 | return None
206 |
207 |
208 | def thursday_web_attack_infiltration(row):
209 | t_start = datetime.strptime('06/07/2017 02:15:00 PM', DATE_FORMAT_INTERNAL)
210 | t_end = datetime.strptime('06/07/2017 03:50:00 PM', DATE_FORMAT_INTERNAL)
211 | attacker = '192.168.10.8'
212 | victim = '205.174.165.73'
213 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
214 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
215 | return payload_filter(row, "Infiltration")
216 | return None
217 |
218 |
219 | def friday_botnet(row):
220 | t_start = datetime.strptime('07/07/2017 09:30:00 AM', DATE_FORMAT_INTERNAL)
221 | t_end = datetime.strptime('07/07/2017 12:59:59 PM', DATE_FORMAT_INTERNAL)
222 | cond_hosts = (row[1] == '205.174.165.73' or row[3] == '205.174.165.73') or (
223 | row[1] == '192.168.10.17' and row[3] == '52.7.235.158') or (
224 | row[1] == '192.168.10.12' and row[3] == '52.6.13.28')
225 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
226 | if t_start <= t_flow <= t_end and cond_hosts and (row[2] == '8080' or row[4] == '8080') and row[5] == '6':
227 | return payload_filter(row, "Bot")
228 | return None
229 |
230 |
231 | def friday_portscan(row):
232 | t_start = datetime.strptime('07/07/2017 12:30:00 PM', DATE_FORMAT_INTERNAL)
233 | t_end = datetime.strptime('07/07/2017 03:40:00 PM', DATE_FORMAT_INTERNAL)
234 | attacker = '172.16.0.1'
235 | victim = '192.168.10.50'
236 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
237 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
238 | return "PortScan"
239 | return None
240 |
241 |
242 | def friday_ddos(row):
243 | t_start = datetime.strptime('07/07/2017 03:40:00 PM', DATE_FORMAT_INTERNAL)
244 | t_end = datetime.strptime('07/07/2017 04:30:00 PM', DATE_FORMAT_INTERNAL)
245 | attacker = '172.16.0.1'
246 | victim = '192.168.10.50'
247 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
248 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
249 | return payload_filter(row, "DDoS")
250 | return None
251 |
252 |
253 | def dataset_labeling(day):
254 | day_str = [None, "Monday", "Tuesday", "Wednesday", "Thursday", "Friday"][day]
255 | day_filters = [None,
256 | [monday_benign],
257 | [tuesday_ftp_patator, tuesday_ssh_patator],
258 | [wednesday_dos_slowloris, wednesday_dos_slowhttptest, wednesday_dos_hulk, wednesday_dos_goldeneye,
259 | wednesday_heartbleed],
260 | [thursday_web_attack_brute_force, thursday_web_attack_xss, thursday_web_attack_sql_injection,
261 | thursday_web_attack_infiltration],
262 | [friday_botnet, friday_portscan, friday_ddos]][day]
263 | with open(INPUT_DIR + day_str + '-WorkingHours.pcap_Flow.csv',
264 | newline='') as csv_flow:
265 | with open(OUTPUT_DIR + day_str + '-WorkingHours.pcap_REVI.csv', 'w',
266 | newline='') as csv_revised:
267 | spamreader = csv.reader(csv_flow, delimiter=',', quotechar='|')
268 | spamwriter = csv.writer(csv_revised, delimiter=',', quotechar='|')
269 | header = next(spamreader)
270 | spamwriter.writerow(header)
271 |
272 | total = 0
273 | all_attacks = {}
274 | for row in spamreader:
275 | lbl = "BENIGN"
276 | for filter in day_filters:
277 | lbl_attack = filter(row)
278 | if lbl_attack:
279 | lbl = lbl_attack
280 | break
281 | row[-1] = lbl
282 |
283 | if lbl not in all_attacks:
284 | all_attacks[lbl] = 1
285 | else:
286 | all_attacks[lbl] += 1
287 |
288 | spamwriter.writerow(row)
289 | total += 1
290 | # if total % PRINT_STEPS == 0:
291 | # print('> ' + str(total))
292 | print('REVI Stat ' + day_str + ':')
293 | print(all_attacks)
294 | print('Total: ' + str(total))
295 |
296 |
297 | def show_all_stats():
298 | # dataset_stat_attack(5, 'ISCX')
299 | dataset_stat_attack(5, 'REVI')
300 |
301 |
302 | def label_all_datasets():
303 | for i in range(1, 6):
304 | dataset_labeling(i)
305 |
306 | for i in range(1, 6):
307 | # dataset_stat_attack(i, 'ISCX')
308 | dataset_stat_attack(i, 'REVI')
309 | print('\n')
310 |
311 |
312 | def merge_all_labels():
313 | for i in range(1, 6):
314 | merge_label(i)
315 |
316 |
317 | if __name__ == '__main__':
318 | label_all_datasets()
319 | # merge_all_labels()
320 |
--------------------------------------------------------------------------------