├── .DS_Store
├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   └── vcs.xml
├── Benchmarking_RF.py
├── Class-based_metrics
│   └── .gitignore
├── FeatureImportance
│   └── .gitignore
├── Figures
│   └── .gitignore
├── LICENSE
├── LabelledDataset
│   └── .gitignore
├── MakeDataNumpyFriendly.py
├── NumpyFriendlyData
│   └── .gitignore
├── README.md
├── Scores
│   └── .gitignore
├── UnlabelledDataset
│   └── .gitignore
└── labelling_CSV_flows.py

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GintsEngelen/WTMC2021-Code/14ee845f0d1c2f5d703d678233e25fb4d051e9d1/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | # Created by https://www.toptal.com/developers/gitignore/api/python,pycharm+iml
3 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,pycharm+iml
4 | 
5 | ### PyCharm+iml ###
6 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
7 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
8 | 
9 | # User-specific stuff
10 | .idea/**/workspace.xml
11 | .idea/**/tasks.xml
12 | .idea/**/usage.statistics.xml
13 | .idea/**/dictionaries
14 | .idea/**/shelf
15 | 
16 | # Generated files
17 | .idea/**/contentModel.xml
18 | 
19 | # Sensitive or high-churn files
20 | .idea/**/dataSources/
21 | .idea/**/dataSources.ids
22 | .idea/**/dataSources.local.xml
23 | .idea/**/sqlDataSources.xml
24 | .idea/**/dynamic.xml
25 | .idea/**/uiDesigner.xml
26 | .idea/**/dbnavigator.xml
27 | 
28 | # Gradle
29 | .idea/**/gradle.xml
30 | .idea/**/libraries
31 | 
32 | # Gradle and Maven with auto-import
33 | # When using Gradle or Maven with auto-import, you should exclude module files,
34 | # since they will be recreated, and may cause churn. Uncomment if using
35 | # auto-import.
36 | # .idea/artifacts 37 | # .idea/compiler.xml 38 | # .idea/jarRepositories.xml 39 | # .idea/modules.xml 40 | # .idea/*.iml 41 | # .idea/modules 42 | # *.iml 43 | # *.ipr 44 | 45 | # CMake 46 | cmake-build-*/ 47 | 48 | # Mongo Explorer plugin 49 | .idea/**/mongoSettings.xml 50 | 51 | # File-based project format 52 | *.iws 53 | 54 | # IntelliJ 55 | out/ 56 | 57 | # mpeltonen/sbt-idea plugin 58 | .idea_modules/ 59 | 60 | # JIRA plugin 61 | atlassian-ide-plugin.xml 62 | 63 | # Cursive Clojure plugin 64 | .idea/replstate.xml 65 | 66 | # Crashlytics plugin (for Android Studio and IntelliJ) 67 | com_crashlytics_export_strings.xml 68 | crashlytics.properties 69 | crashlytics-build.properties 70 | fabric.properties 71 | 72 | # Editor-based Rest Client 73 | .idea/httpRequests 74 | 75 | # Android studio 3.1+ serialized cache file 76 | .idea/caches/build_file_checksums.ser 77 | 78 | ### PyCharm+iml Patch ### 79 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 80 | 81 | *.iml 82 | modules.xml 83 | .idea/misc.xml 84 | *.ipr 85 | 86 | ### Python ### 87 | # Byte-compiled / optimized / DLL files 88 | __pycache__/ 89 | *.py[cod] 90 | *$py.class 91 | 92 | # C extensions 93 | *.so 94 | 95 | # Distribution / packaging 96 | .Python 97 | build/ 98 | develop-eggs/ 99 | dist/ 100 | downloads/ 101 | eggs/ 102 | .eggs/ 103 | lib/ 104 | lib64/ 105 | parts/ 106 | sdist/ 107 | var/ 108 | wheels/ 109 | pip-wheel-metadata/ 110 | share/python-wheels/ 111 | *.egg-info/ 112 | .installed.cfg 113 | *.egg 114 | MANIFEST 115 | 116 | # PyInstaller 117 | # Usually these files are written by a python script from a template 118 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 119 | *.manifest 120 | *.spec 121 | 122 | # Installer logs 123 | pip-log.txt 124 | pip-delete-this-directory.txt 125 | 126 | # Unit test / coverage reports 127 | htmlcov/ 128 | .tox/ 129 | .nox/ 130 | .coverage 131 | .coverage.* 132 | .cache 133 | nosetests.xml 134 | coverage.xml 135 | *.cover 136 | *.py,cover 137 | .hypothesis/ 138 | .pytest_cache/ 139 | pytestdebug.log 140 | 141 | # Translations 142 | *.mo 143 | *.pot 144 | 145 | # Django stuff: 146 | *.log 147 | local_settings.py 148 | db.sqlite3 149 | db.sqlite3-journal 150 | 151 | # Flask stuff: 152 | instance/ 153 | .webassets-cache 154 | 155 | # Scrapy stuff: 156 | .scrapy 157 | 158 | # Sphinx documentation 159 | docs/_build/ 160 | doc/_build/ 161 | 162 | # PyBuilder 163 | target/ 164 | 165 | # Jupyter Notebook 166 | .ipynb_checkpoints 167 | 168 | # IPython 169 | profile_default/ 170 | ipython_config.py 171 | 172 | # pyenv 173 | .python-version 174 | 175 | # pipenv 176 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 177 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 178 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 179 | # install all needed dependencies. 180 | #Pipfile.lock 181 | 182 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 183 | __pypackages__/ 184 | 185 | # Celery stuff 186 | celerybeat-schedule 187 | celerybeat.pid 188 | 189 | # SageMath parsed files 190 | *.sage.py 191 | 192 | # Environments 193 | .env 194 | .venv 195 | env/ 196 | venv/ 197 | ENV/ 198 | env.bak/ 199 | venv.bak/ 200 | pythonenv* 201 | 202 | # Spyder project settings 203 | .spyderproject 204 | .spyproject 205 | 206 | # Rope project settings 207 | .ropeproject 208 | 209 | # mkdocs documentation 210 | /site 211 | 212 | # mypy 213 | .mypy_cache/ 214 | .dmypy.json 215 | dmypy.json 216 | 217 | # Pyre type checker 218 | .pyre/ 219 | 220 | # pytype static type analyzer 221 | .pytype/ 222 | 223 | # profiling data 224 | .prof 225 | 226 | # End of https://www.toptal.com/developers/gitignore/api/python,pycharm+iml 227 | 228 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Benchmarking_RF.py: -------------------------------------------------------------------------------- 1 | # %% 2 | 3 | # https://stackoverflow.com/questions/48484807/training-a-decision-tree-using-id3-algorithm-by-sklearn 4 | # https://scikit-learn.org/stable/modules/tree.html#tree 5 | # https://medium.com/@mohtedibf/indepth-parameter-tuning-for-decision-tree-6753118a03c3 6 | # https://medium.com/datadriveninvestor/tree-algorithms-id3-c4-5-c5-0-and-cart-413387342164 7 | # https://scikit-learn.org/stable/modules/cross_validation.html 8 | import json 9 | 10 | import matplotlib 11 | from datetime import datetime 12 | import pandas as pd, numpy as np 13 | import math 14 | from sklearn.model_selection import cross_val_predict, KFold, cross_val_score, train_test_split, learning_curve, \ 15 | cross_validate 16 | from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, \ 17 | make_scorer, recall_score, precision_recall_fscore_support 18 | from sklearn.preprocessing import MinMaxScaler, minmax_scale, scale 19 | import matplotlib.pyplot as plt 20 | from copy import deepcopy 21 | from sklearn.ensemble import RandomForestClassifier 22 | from sklearn.neural_network import MLPClassifier 23 | 24 | 25 | DITCH_DEST_PORT = True # Remove destination port! 
26 | MLP = False
27 | RF = True
28 | CR_VAL_TRAIN = False
29 | DATA_VERSION = "no_artefacts_with_payload_filter"
30 | # 3 random states for each dataset iteration: 42, 43, 44
31 | DATA_SPLIT_RANDOM_STATE = 44
32 | RF_RANDOM_STATE = 44
33 | 
34 | def gen_id():
35 |     return datetime.utcnow().strftime("%d-%m_%H-%M-%S")
36 | 
37 | 
38 | def plot_confusion_matrix(y_true, y_pred, classes,
39 |                           normalize=True,
40 |                           cmap=plt.cm.Reds,
41 |                           save=False,
42 |                           name=None):
43 |     # title =
44 | 
45 |     # Compute confusion matrix
46 |     cm2 = confusion_matrix(y_true, y_pred)
47 |     cm = cm2  # fall back to raw counts when normalize is False
48 |     if normalize:
49 |         cm = cm2.astype('float') / cm2.sum(axis=1)[:, np.newaxis]
50 |         print("Normalized confusion matrix")
51 |     else:
52 |         print('Confusion matrix, without normalization')
53 | 
54 |     # print(cm)
55 | 
56 |     fig, ax = plt.subplots(figsize=(9, 9))
57 |     im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
58 |     # ax.figure.colorbar(im, ax=ax)
59 |     # We want to show all ticks...
60 |     ax.set(xticks=np.arange(cm.shape[1]),
61 |            yticks=np.arange(cm.shape[0]),
62 |            # ... and label them with the respective list entries
63 |            # xticklabels=classes, yticklabels=classes,
64 |            # title=title,
65 |            # ylabel='True label',
66 |            # xlabel='Predicted label'
67 |            )
68 |     hfont = {"fontname": "serif"}
69 |     fontsize = "x-large"
70 |     # ax.set_xlabel('Predicted', fontsize=fontsize,**hfont),
71 |     # ax.set_ylabel('True', fontsize=fontsize,**hfont),
72 |     ax.set_xticklabels(classes, fontsize=fontsize, **hfont)
73 |     ax.set_yticklabels(classes, fontsize=fontsize, **hfont, fontweight='bold')
74 | 
75 |     # Rotate the tick labels and set their alignment.
76 |     plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
77 |              rotation_mode="anchor")
78 | 
79 |     # Loop over data dimensions and create text annotations.
80 |     fmt = '.2f' if normalize else 'd'
81 |     thresh = cm.max() / 2.
82 | for i in range(cm.shape[0]): 83 | for j in range(cm.shape[1]): 84 | ax.text(j, i, format(cm2[i, j], 'd'), # format(cm[i, j], fmt) 85 | ha="center", va="center", 86 | color="white" if cm[i, j] > thresh else "black") 87 | fig.tight_layout() 88 | 89 | if save: 90 | fig.savefig("Figures/" + name + ".pdf", dpi=400, 91 | bbox_inches='tight', pad_inches=0) 92 | 93 | plt.show() 94 | return ax 95 | 96 | 97 | # https://stackoverflow.com/questions/35249760/using-scikit-to-determine-contributions-of-each-feature-to-a-specific-class-pred 98 | def class_feature_importance(X, Y, feature_importances): 99 | N, M = X.shape 100 | X = scale(X) 101 | 102 | out = {} 103 | for c in set(Y): 104 | out[c] = dict( 105 | zip(range(N), np.mean(X[Y == c, :], axis=0) * feature_importances) 106 | ) 107 | 108 | return out 109 | 110 | 111 | def load_data(): 112 | print("Loading training data ...") 113 | full_dataset = np.load("NumpyFriendlyData/full_dataset_" + DATA_VERSION + ".npy") 114 | 115 | full_dataset = full_dataset[~np.isnan(full_dataset).any(axis=1)] 116 | full_dataset = full_dataset[~np.isinf(full_dataset).any(axis=1)] 117 | 118 | data_x = full_dataset[:, :-1] 119 | data_y = full_dataset[:, -1] 120 | 121 | print(np.unique(data_y, return_counts=True)) 122 | 123 | if DITCH_DEST_PORT: 124 | data_x = data_x[:, 1:] # Dest Port index = 0 125 | 126 | splits = train_test_split(data_x, data_y, test_size=0.25, stratify=data_y, 127 | random_state=DATA_SPLIT_RANDOM_STATE) 128 | return splits 129 | 130 | 131 | if __name__ == "__main__": 132 | 133 | time_id = gen_id() 134 | 135 | (X_train, X_test, Y_train, Y_test) = load_data() 136 | print(X_train.shape, X_test.shape) 137 | 138 | # from sklearn.tree import DecisionTreeClassifier 139 | # print("Decision Tree") 140 | # clf = DecisionTreeClassifier(criterion='entropy', random_state=0) 141 | 142 | if MLP: 143 | #X_train = minmax_scale(X_train) 144 | #X_test = minmax_scale(X_test) 145 | 146 | print("Applying minmax-scaling on train and test set") 147 | scaler = MinMaxScaler() 148 | scaler.fit(X_train) 149 | X_train = scaler.transform(X_train) 150 | X_test = scaler.transform(X_test) 151 | 152 | print("Multilayered Perceptron") 153 | mlp_classifier = MLPClassifier(hidden_layer_sizes=(156,78,39)) 154 | 155 | print("wrong script for MLP") 156 | exit(0) 157 | 158 | scoring = { 159 | 'accuracy': make_scorer(accuracy_score), 160 | 'precision': make_scorer(precision_score, average='weighted'), 161 | 'f1_score': make_scorer(f1_score, average='weighted'), 162 | 'recall': make_scorer(recall_score, average='weighted') 163 | } 164 | 165 | print("Random Forest") 166 | rf_classifier = RandomForestClassifier(n_estimators=50, max_depth=20, random_state=RF_RANDOM_STATE) 167 | 168 | if CR_VAL_TRAIN: 169 | print("Cross validating ...") 170 | sc = cross_validate(rf_classifier, X_train, Y_train, cv=5, scoring=scoring) 171 | print("Score:\n", sc) 172 | 173 | # print("Fit time: " % (sc['fit_time'])) 174 | # print("Score time: " % (sc['score_time'])) 175 | 176 | print("Precision: %0.8f (%0.8f)" % (sc['test_precision'].mean(), sc['test_precision'].std())) 177 | print("Recall: %0.8f (%0.8f)" % (sc['test_recall'].mean(), sc['test_recall'].std())) 178 | print("F1_score: %0.8f (%0.8f)" % (sc['test_f1_score'].mean(), sc['test_f1_score'].std())) 179 | print("Accuracy: %0.8f (%0.8f)" % (sc['test_accuracy'].mean(), sc['test_accuracy'].std())) 180 | 181 | print("Fitting model ...") 182 | rf_classifier.fit(X_train, Y_train) 183 | 184 | Y_pred = rf_classifier.predict(X_test) 185 | 186 | nY_test = [] 187 | nY_pred 
= [] 188 | for i in range(len(Y_test)): 189 | nY_test += [Y_test[i]] 190 | nY_pred += [Y_pred[i]] 191 | Y_test = np.array(nY_test) 192 | Y_pred = np.array(nY_pred) 193 | print(Y_test.shape, Y_pred.shape) 194 | 195 | # %% 196 | 197 | classes = ["Benign", "FTP-Patator", "SSH-Patator", "DoS GoldenEye", "DoS Hulk", "DoS Slowhttptest", "DoS slowloris", 198 | "Heartbleed", "Web Attack - Brute Force", "Web Attack - XSS", "Web Attack - Sql Injection", "Infiltration", 199 | "Bot", "PortScan", "DDoS"] 200 | 201 | plot_confusion_matrix(Y_test, Y_pred, classes, save=True, name="RF_" + DATA_VERSION + "_" + time_id ) 202 | 203 | prfs = precision_recall_fscore_support(Y_test, Y_pred, average='weighted') 204 | print("Precision, Recall, F-Score, Support:", prfs) 205 | 206 | # %% 207 | 208 | plot_confusion_matrix(Y_test, Y_pred, classes) 209 | 210 | prfs = precision_recall_fscore_support(Y_test, Y_pred, average='weighted') 211 | print("Precision, Recall, F-Score, Support:", prfs) 212 | with open("Scores/RF_" + DATA_VERSION + "_" + time_id + "_metrics_aggregated.txt", 'w') as out_file: 213 | out_file.write("Precision, Recall, F-Score, Support: " + str(prfs)) 214 | 215 | Y_pred = list(Y_pred) 216 | 217 | np.save("Class-based_metrics/Y_test_" + DATA_VERSION + "_" + time_id, Y_test) 218 | np.save("Class-based_metrics/Y_pred_" + DATA_VERSION + "_" + time_id, Y_pred) 219 | 220 | class_based_metrics = classification_report(Y_test, Y_pred, target_names=classes, zero_division="warn", digits=4) 221 | print("Class based metrics:\r\n", class_based_metrics) 222 | with open("Scores/RF_" + DATA_VERSION + "_metrics_class_based_" + time_id + ".txt", 'w') as out_file: 223 | out_file.write(class_based_metrics) 224 | 225 | # This next part is to calculate the feature importance for a RF classifier. 
Comment this out if you're using MLP 226 | 227 | feature_importances = rf_classifier.feature_importances_ 228 | 229 | result = class_feature_importance(X_test, Y_pred, feature_importances) 230 | 231 | print(json.dumps(result, indent=4)) 232 | 233 | with open("FeatureImportance/feature_importance_full_dataset_" + DATA_VERSION + "_" + time_id + ".json", 'w') as f: 234 | json.dump(result, f) 235 | 236 | ''' 237 | features_list = "Dst Port,Protocol,Flow Duration,Total Fwd Packet," \ 238 | "Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max," \ 239 | "Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max," \ 240 | "Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s," \ 241 | "Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std," \ 242 | "Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min," \ 243 | "Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length," \ 244 | "Fwd Packets/s,Bwd Packets/s,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std," \ 245 | "Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count," \ 246 | "URG Flag Count,CWR Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Fwd Segment Size Avg," \ 247 | "Bwd Segment Size Avg,Fwd Bytes/Bulk Avg,Fwd Packet/Bulk Avg,Fwd Bulk Rate Avg,Bwd Bytes/Bulk Avg," \ 248 | "Bwd Packet/Bulk Avg,Bwd Bulk Rate Avg,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets," \ 249 | "Subflow Bwd Bytes,FWD Init Win Bytes,Bwd Init Win Bytes,Fwd Act Data Pkts,Fwd Seg Size Min," \ 250 | "Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label".split(',') 251 | 252 | label_dictionary = { 253 | 'BENIGN': '0', 254 | 'FTP-Patator': '1', 255 | 'SSH-Patator': '2', 256 | 'DoS GoldenEye': '3', 257 | 'DoS Hulk': '4', 258 | 'DoS Slowhttptest': '5', 259 | 'DoS slowloris': '6', 260 | 'Heartbleed': '7', 261 | 'Web Attack – Brute Force': '8', 262 | 'Web Attack – XSS': '9', 263 | 'Web Attack – Sql Injection': '10', 264 | 'Infiltration': '11', 265 | 'Bot': '12', 266 | 'PortScan': '13', 267 | 'DDoS': '14' 268 | } 269 | ''' 270 | -------------------------------------------------------------------------------- /Class-based_metrics/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | */ 3 | !.gitignore -------------------------------------------------------------------------------- /FeatureImportance/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | */ 3 | !.gitignore -------------------------------------------------------------------------------- /Figures/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | */ 3 | !.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Gints Engelen (gints.engelen@kuleuven.be) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies 
of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | 21 | # When using this code, please cite our paper: 22 | G. Engelen, V. Rimmer, W. Joosen, "Troubleshooting an Intrusion Detection Dataset: the CICIDS2017 Case Study", 2021 IEEE European Symposium on Security and Privacy Workshops (EuroS&PW), 2021. 23 | 24 | Our paper as well as its extended documentation can be found at https://downloads.distrinet-research.be/WTMC2021/ 25 | 26 | # Contributors 27 | 28 | For labelling_CSV_flows.py: 29 | Jin Li 30 | Vera Rimmer 31 | Gints Engelen 32 | 33 | For all other code: 34 | Gints Engelen 35 | Vera Rimmer -------------------------------------------------------------------------------- /LabelledDataset/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | */ 3 | !.gitignore -------------------------------------------------------------------------------- /MakeDataNumpyFriendly.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This Should be run after all CSV files are fully labelled. 3 | The script does two things: 4 | 1. It removes flow id, source ip, destination ip, source port, and timestamp as features 5 | 2. It converts all labels to numerical labels 6 | ''' 7 | import copy 8 | import csv 9 | import pandas as pd 10 | import numpy as np 11 | 12 | dataset_directory = 'LabelledDataset' 13 | saved_numpy_name = 'full_dataset_no_artefacts_with_payload_filter.npy' 14 | 15 | 16 | def importCsvAsDict(path): 17 | print('Importing from ', path) 18 | csvfile = csv.DictReader(open(path), delimiter=',') 19 | return [x for x in csvfile] 20 | 21 | 22 | def convertToNumericalLabels(flows_list_of_dict): 23 | print('Relabelling flows') 24 | 25 | label_dictionary = { 26 | 'BENIGN': '0', 27 | 'FTP-Patator': '1', 28 | 'SSH-Patator': '2', 29 | 'DoS GoldenEye': '3', 30 | 'DoS Hulk': '4', 31 | 'DoS Slowhttptest': '5', 32 | 'DoS slowloris': '6', 33 | 'Heartbleed': '7', 34 | 'Web Attack - Brute Force': '8', 35 | 'Web Attack - XSS': '9', 36 | 'Web Attack - Sql Injection': '10', 37 | 'Infiltration': '11', 38 | 'Bot': '12', 39 | 'PortScan': '13', 40 | 'DDoS': '14', 41 | # IMPORTANT NOTE: For our experiments, we treated all "X - Attempted" flows as BENIGN. 
If you want to keep the
42 |         # "X - Attempted" flows separate, please change the values corresponding to the keys below
43 |         'FTP-Patator - Attempted' : '0',
44 |         'SSH-Patator - Attempted' : '0',
45 |         'DoS GoldenEye - Attempted' : '0',
46 |         'DoS Hulk - Attempted' : '0',
47 |         'DoS Slowhttptest - Attempted' : '0',
48 |         'DoS slowloris - Attempted' : '0',
49 |         'Heartbleed - Attempted' : '0',
50 |         'Web Attack - Brute Force - Attempted' : '0',
51 |         'Web Attack - XSS - Attempted' : '0',
52 |         'Web Attack - Sql Injection - Attempted' : '0',
53 |         'Infiltration - Attempted' : '0',
54 |         'Bot - Attempted' : '0',
55 |         # Note that PortScan doesn't have any 'Attempted' flows because it doesn't rely on a payload transfer for its
56 |         # effectiveness
57 |         'DDoS - Attempted' : '0'
58 |     }
59 | 
60 |     for (index, row) in enumerate(flows_list_of_dict):
61 |         current_label = row['Label']
62 |         flows_list_of_dict[index]['Label'] = label_dictionary[current_label]
63 | 
64 | 
65 | def listOfDictToNumpyArray(list_of_dict):
66 |     dataframe = pd.DataFrame(list_of_dict)
67 |     numpy_string_array = dataframe.values
68 |     # See point 1 in the description at the top of the file
69 |     trimmed_values = np.concatenate((numpy_string_array[:, 4:6], numpy_string_array[:, 7:]), axis=1)
70 |     return trimmed_values.astype(float)  # np.float was removed in recent NumPy versions; the builtin float is equivalent here
71 | 
72 | 
73 | print("monday")
74 | monday_dict = importCsvAsDict(dataset_directory + '/Monday-WorkingHours.pcap_REVI.csv')
75 | convertToNumericalLabels(monday_dict)
76 | monday_numpy_array = listOfDictToNumpyArray(monday_dict)
77 | 
78 | print("tuesday")
79 | tuesday_dict = importCsvAsDict(dataset_directory + '/Tuesday-WorkingHours.pcap_REVI.csv')
80 | convertToNumericalLabels(tuesday_dict)
81 | tuesday_numpy_array = listOfDictToNumpyArray(tuesday_dict)
82 | 
83 | print("wednesday")
84 | wednesday_dict = importCsvAsDict(dataset_directory + '/Wednesday-WorkingHours.pcap_REVI.csv')
85 | convertToNumericalLabels(wednesday_dict)
86 | wednesday_numpy_array = listOfDictToNumpyArray(wednesday_dict)
87 | 
88 | print("thursday")
89 | thursday_dict = importCsvAsDict(dataset_directory + '/Thursday-WorkingHours.pcap_REVI.csv')
90 | convertToNumericalLabels(thursday_dict)
91 | thursday_numpy_array = listOfDictToNumpyArray(thursday_dict)
92 | 
93 | print("friday")
94 | friday_dict = importCsvAsDict(dataset_directory + '/Friday-WorkingHours.pcap_REVI.csv')
95 | convertToNumericalLabels(friday_dict)
96 | friday_numpy_array = listOfDictToNumpyArray(friday_dict)
97 | 
98 | full_dataset = np.concatenate((monday_numpy_array, tuesday_numpy_array, wednesday_numpy_array, thursday_numpy_array,
99 |                                friday_numpy_array), axis=0)
100 | 
101 | print("saving dataset")
102 | np.save('NumpyFriendlyData/' + saved_numpy_name, full_dataset)
103 | 
--------------------------------------------------------------------------------
/NumpyFriendlyData/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | */
3 | !.gitignore
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Troubleshooting an Intrusion Detection Dataset: the CICIDS2017 Case Study
2 | 
3 | This repository contains the code used for our [paper](https://downloads.distrinet-research.be/WTMC2021/Resources/wtmc2021_Engelen_Troubleshooting.pdf).
4 | The code performs the labelling and benchmarking for the [CICIDS 2017 dataset](https://www.unb.ca/cic/datasets/ids-2017.html)
5 | after it has been processed by [our modified version of the CICFlowMeter tool](https://github.com/GintsEngelen/CICFlowMeter).
6 | 
7 | Note that all of this is *research code*.
8 | 
9 | If you use the code in this repository, please cite our paper:
10 | 
11 |     @inproceedings{engelen2021troubleshooting,
12 |       title={Troubleshooting an Intrusion Detection Dataset: the CICIDS2017 Case Study},
13 |       author={Engelen, Gints and Rimmer, Vera and Joosen, Wouter},
14 |       booktitle={2021 IEEE Security and Privacy Workshops (SPW)},
15 |       pages={7--12},
16 |       year={2021},
17 |       organization={IEEE}
18 |     }
19 | 
20 | An extended documentation of our paper can be found [here](https://downloads.distrinet-research.be/WTMC2021/).
21 | 
22 | ## How to use this repository
23 | 
24 | First, head over to the website of the [CICIDS 2017 dataset](https://www.unb.ca/cic/datasets/ids-2017.html) and download
25 | the raw version of the dataset (PCAP file format). There are 5 files in total, one for each day.
26 | 
27 | Then, run [our modified version of the CICFlowMeter tool](https://github.com/GintsEngelen/CICFlowMeter) on the data
28 | obtained in the previous step:
29 | 
30 | 1. Start the CICFlowMeter tool
31 | 2. Under the "NetWork" menu option, select "Offline"
32 | 3. For "Pcap dir", choose the directory containing the 5 PCAP files of the CICIDS 2017 dataset
33 | 4. For "Output dir", choose the "UnlabelledDataset" directory of this WTMC2021-Code project.
34 | 5. Keep the default values for the "Flow TimeOut" and "Activity Timeout" parameters (120000000 and 5000000 respectively)
35 | 
36 | This will generate 5 CSV files with the flows extracted from the raw PCAP files.
37 | 
38 | After this, verify the `TIME_DIFFERENCE`, `INPUT_DIR`, `OUTPUT_DIR` and `PAYLOAD_FILTER_ACTIVE` attributes in the
39 | `labelling_CSV_flows.py` script, and then run it (no need to specify any command-line options). This will label all the
40 | flows in the CSV files generated by the CICFlowMeter tool.
41 | 
42 | Then, run the `MakeDataNumpyFriendly.py` script, which will convert the labelled CSV files into a single numpy array.
43 | Note that, in our experiments, we chose to relabel all "Attempted" flows as BENIGN. If you wish to keep them separate,
44 | make sure to change the numerical labels in the `convertToNumericalLabels(flows_list_of_dict)` function.
45 | 
46 | Finally, run the `Benchmarking_RF.py` script to perform benchmarking on the dataset using a Random Forest classifier.
47 | Random seeds and various other options can be specified in the first few lines of the script.
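48 | 
49 | As an optional sanity check after running `MakeDataNumpyFriendly.py`, you can load the saved array and inspect the
50 | class distribution before benchmarking. The snippet below is only a minimal sketch: it assumes the default
51 | `DATA_VERSION` ("no_artefacts_with_payload_filter") used by `Benchmarking_RF.py`, and the numerical labels assigned
52 | in `convertToNumericalLabels(flows_list_of_dict)` (0 = BENIGN, since "Attempted" flows are relabelled as BENIGN).
53 | 
54 |     import numpy as np
55 | 
56 |     # The last column holds the numerical label; all other columns are flow features
57 |     full_dataset = np.load("NumpyFriendlyData/full_dataset_no_artefacts_with_payload_filter.npy")
58 |     features, labels = full_dataset[:, :-1], full_dataset[:, -1]
59 |     print("Flows x features:", features.shape)
60 |     for label, count in zip(*np.unique(labels, return_counts=True)):
61 |         print(int(label), int(count))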
-------------------------------------------------------------------------------- /Scores/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | */ 3 | !.gitignore -------------------------------------------------------------------------------- /UnlabelledDataset/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | */ 3 | !.gitignore -------------------------------------------------------------------------------- /labelling_CSV_flows.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from datetime import datetime 3 | from datetime import timedelta 4 | 5 | DAY_STR = [None, "Monday", "Tuesday", "Wednesday", "Thursday", "Friday"] 6 | 7 | PRINT_STEPS = 3000 8 | DATE_FORMAT_INTERNAL = '%d/%m/%Y %I:%M:%S %p' 9 | DATE_FORMAT_DATASET = '%d/%m/%Y %I:%M:%S %p' 10 | # The CICIDS 2017 dataset was generated in New Brunswick, Canada. Running the CICFlowMeter tool on this data automatically 11 | # converts all timestamps in the data from the timezone of New Brunswick, Canada, to the timezone of the host running 12 | # the CICFlowMeter tool. The TIME_DIFFERENCE attribute specifies the time difference between these two timezones. 13 | # specifically: TIME_DIFFERENCE = {CICFlowMeter host timezone} - {New Brunswick, Canada timezone} 14 | TIME_DIFFERENCE = timedelta(hours=5) 15 | 16 | INPUT_DIR = 'UnlabelledDataset/' 17 | OUTPUT_DIR = 'LabelledDataset/' 18 | 19 | # Some attack categories rely on transfer of a payload in order to be effective. When a malicious flow belongs to such a 20 | # category but doesn't contain a payload, setting this filter to True will label these flows as "X - Attempted" with "X" 21 | # the original attack class. Setting this filter to False will simply label the flow as part of the attack category. 
22 | PAYLOAD_FILTER_ACTIVE = True 23 | 24 | 25 | # DATE_FORMAT_DATASET = '%d/%m/%Y %H:%M' 26 | # TIME_DIFFERENCE = timedelta(hours=0) 27 | 28 | 29 | def merge_label(day): 30 | day_str = DAY_STR[day] # 3-reorganize 31 | with open('G:\\Datasets\\CICIDS2017-PCAPs\\2-labeling\\' + day_str + '-WorkingHours.pcap_REVI.csv') as csv_flow: 32 | spamreader = csv.reader(csv_flow, delimiter=',', quotechar='|') 33 | next(spamreader) 34 | total = 0 35 | with open('G:\\Datasets\\CICIDS2017-PCAPs\\2-labeling\\' + day_str + '-WorkingHours.pcap_SeqInfo.txt', 36 | 'r') as txt_input: 37 | with open('G:\\Datasets\\CICIDS2017-PCAPs\\2-labeling\\' + day_str + '-WorkingHours.pcap_SeqInfoLabel.txt', 38 | 'w') as txt_output: 39 | for row_seq in txt_input: 40 | txt_row = row_seq.split(';') 41 | csv_row = next(spamreader) 42 | assert (txt_row[0] == csv_row[-1]) # same uid 43 | 44 | txt_row.insert(1, csv_row[-1]) # insert label into text file 45 | txt_output.write(';'.join(txt_row)) 46 | txt_output.flush() 47 | 48 | total += 1 49 | print(day_str + " merged") 50 | 51 | 52 | def dataset_stat_attack(day, ver='ISCX'): 53 | day_str = DAY_STR[day] 54 | col = -1 # if ver == 'ISCX' else -2 55 | with open(OUTPUT_DIR + day_str + '-WorkingHours.pcap_' + ver + '.csv', 56 | newline='') as csv_flow: 57 | spamreader = csv.reader(csv_flow, delimiter=',', quotechar='|') 58 | next(spamreader) 59 | total = 0 60 | all_attacks = {} 61 | for row in spamreader: 62 | lbl_attack = row[col] 63 | if lbl_attack not in all_attacks: 64 | all_attacks[lbl_attack] = 1 65 | else: 66 | all_attacks[lbl_attack] += 1 67 | total += 1 68 | # if total % PRINT_STEPS == 0: 69 | # print('> ' + str(total)) 70 | print(ver + ' Stat ' + day_str + ':') 71 | print(all_attacks) 72 | print('Total: ' + str(total)) 73 | 74 | 75 | # row = a row in the CSV file, corresponding to one flow 76 | # attack_class = String name of the attack class 77 | # Returns a string of the attack class if it passes through the filter 78 | # Returns "X - Attempted" with X the attack_class if the flow is a TCP flow and does not contain any data transfer in 79 | # the forward direction. 80 | # Note that if the payload filter is not active, or the underlying protocol is not TCP, it returns the attack class 81 | # by default. 
82 | def payload_filter(row, attack_class): 83 | # row[10] = total Length of payload bytes in Fwd direction 84 | # row[5] = Protocol, we only want TCP connections, 6 = TCP 85 | if PAYLOAD_FILTER_ACTIVE and int(row[5]) == 6: 86 | if float(row[10]) > 0.0: 87 | return attack_class 88 | else: 89 | return attack_class + " - Attempted" 90 | else: 91 | return attack_class 92 | 93 | 94 | def monday_benign(_): 95 | return "BENIGN" 96 | 97 | 98 | def tuesday_ftp_patator(row): 99 | t_start = datetime.strptime('04/07/2017 09:17:00 AM', DATE_FORMAT_INTERNAL) 100 | t_end = datetime.strptime('04/07/2017 10:30:00 AM', DATE_FORMAT_INTERNAL) 101 | attacker = '172.16.0.1' 102 | victim = '192.168.10.50' 103 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE 104 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end: 105 | return payload_filter(row, "FTP-Patator") 106 | return None 107 | 108 | 109 | def tuesday_ssh_patator(row): 110 | t_start = datetime.strptime('04/07/2017 01:00:00 PM', DATE_FORMAT_INTERNAL) 111 | t_end = datetime.strptime('04/07/2017 04:11:00 PM', DATE_FORMAT_INTERNAL) 112 | attacker = '172.16.0.1' 113 | victim = '192.168.10.50' 114 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE 115 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end: 116 | return payload_filter(row, "SSH-Patator") 117 | return None 118 | 119 | 120 | def wednesday_dos_slowloris(row): 121 | t_start = datetime.strptime('05/07/2017 02:23:00 AM', DATE_FORMAT_INTERNAL) 122 | t_end = datetime.strptime('05/07/2017 10:12:59 AM', DATE_FORMAT_INTERNAL) 123 | attacker = '172.16.0.1' 124 | victim = '192.168.10.50' 125 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE 126 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end: 127 | return payload_filter(row, "DoS slowloris") 128 | return None 129 | 130 | 131 | def wednesday_dos_slowhttptest(row): 132 | t_start = datetime.strptime('05/07/2017 10:13:00 AM', DATE_FORMAT_INTERNAL) 133 | t_end = datetime.strptime('05/07/2017 10:38:00 AM', DATE_FORMAT_INTERNAL) 134 | attacker = '172.16.0.1' 135 | victim = '192.168.10.50' 136 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE 137 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end: 138 | return payload_filter(row, "DoS Slowhttptest") 139 | return None 140 | 141 | 142 | def wednesday_dos_hulk(row): 143 | t_start = datetime.strptime('05/07/2017 10:39:00 AM', DATE_FORMAT_INTERNAL) 144 | t_end = datetime.strptime('05/07/2017 11:09:00 AM', DATE_FORMAT_INTERNAL) 145 | attacker = '172.16.0.1' 146 | victim = '192.168.10.50' 147 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE 148 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end: 149 | return payload_filter(row, "DoS Hulk") 150 | return None 151 | 152 | 153 | def wednesday_dos_goldeneye(row): 154 | t_start = datetime.strptime('05/07/2017 11:10:00 AM', DATE_FORMAT_INTERNAL) 155 | t_end = datetime.strptime('05/07/2017 11:23:00 AM', DATE_FORMAT_INTERNAL) 156 | attacker = '172.16.0.1' 157 | victim = '192.168.10.50' 158 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE 159 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end: 160 | return payload_filter(row, "DoS GoldenEye") 161 | return None 162 | 163 | 164 | def wednesday_heartbleed(row): 165 | t_start = datetime.strptime('05/07/2017 03:11:00 PM', 
DATE_FORMAT_INTERNAL) 166 | t_end = datetime.strptime('05/07/2017 03:33:00 PM', DATE_FORMAT_INTERNAL) 167 | attacker = '172.16.0.1' 168 | victim = '192.168.10.51' 169 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE 170 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end and row[4] == '444': 171 | return payload_filter(row, "Heartbleed") 172 | return None 173 | 174 | 175 | def thursday_web_attack_brute_force(row): 176 | t_start = datetime.strptime('06/07/2017 09:10:00 AM', DATE_FORMAT_INTERNAL) 177 | t_end = datetime.strptime('06/07/2017 10:12:00 AM', DATE_FORMAT_INTERNAL) 178 | attacker = '172.16.0.1' 179 | victim = '192.168.10.50' 180 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE 181 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end: 182 | return payload_filter(row, "Web Attack - Brute Force") 183 | return None 184 | 185 | 186 | def thursday_web_attack_xss(row): 187 | t_start = datetime.strptime('06/07/2017 10:13:00 AM', DATE_FORMAT_INTERNAL) 188 | t_end = datetime.strptime('06/07/2017 10:37:00 AM', DATE_FORMAT_INTERNAL) 189 | attacker = '172.16.0.1' 190 | victim = '192.168.10.50' 191 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE 192 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end: 193 | return payload_filter(row, "Web Attack - XSS") 194 | return None 195 | 196 | 197 | def thursday_web_attack_sql_injection(row): 198 | t_start = datetime.strptime('06/07/2017 10:39:00 AM', DATE_FORMAT_INTERNAL) 199 | t_end = datetime.strptime('06/07/2017 10:45:00 AM', DATE_FORMAT_INTERNAL) 200 | attacker = '172.16.0.1' 201 | victim = '192.168.10.50' 202 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE 203 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end: 204 | return payload_filter(row, "Web Attack - Sql Injection") 205 | return None 206 | 207 | 208 | def thursday_web_attack_infiltration(row): 209 | t_start = datetime.strptime('06/07/2017 02:15:00 PM', DATE_FORMAT_INTERNAL) 210 | t_end = datetime.strptime('06/07/2017 03:50:00 PM', DATE_FORMAT_INTERNAL) 211 | attacker = '192.168.10.8' 212 | victim = '205.174.165.73' 213 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE 214 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end: 215 | return payload_filter(row, "Infiltration") 216 | return None 217 | 218 | 219 | def friday_botnet(row): 220 | t_start = datetime.strptime('07/07/2017 09:30:00 AM', DATE_FORMAT_INTERNAL) 221 | t_end = datetime.strptime('07/07/2017 12:59:59 PM', DATE_FORMAT_INTERNAL) 222 | cond_hosts = (row[1] == '205.174.165.73' or row[3] == '205.174.165.73') or ( 223 | row[1] == '192.168.10.17' and row[3] == '52.7.235.158') or ( 224 | row[1] == '192.168.10.12' and row[3] == '52.6.13.28') 225 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE 226 | if t_start <= t_flow <= t_end and cond_hosts and (row[2] == '8080' or row[4] == '8080') and row[5] == '6': 227 | return payload_filter(row, "Bot") 228 | return None 229 | 230 | 231 | def friday_portscan(row): 232 | t_start = datetime.strptime('07/07/2017 12:30:00 PM', DATE_FORMAT_INTERNAL) 233 | t_end = datetime.strptime('07/07/2017 03:40:00 PM', DATE_FORMAT_INTERNAL) 234 | attacker = '172.16.0.1' 235 | victim = '192.168.10.50' 236 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE 237 | if row[1] == attacker and row[3] == victim and t_start 
<= t_flow <= t_end: 238 | return "PortScan" 239 | return None 240 | 241 | 242 | def friday_ddos(row): 243 | t_start = datetime.strptime('07/07/2017 03:40:00 PM', DATE_FORMAT_INTERNAL) 244 | t_end = datetime.strptime('07/07/2017 04:30:00 PM', DATE_FORMAT_INTERNAL) 245 | attacker = '172.16.0.1' 246 | victim = '192.168.10.50' 247 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE 248 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end: 249 | return payload_filter(row, "DDoS") 250 | return None 251 | 252 | 253 | def dataset_labeling(day): 254 | day_str = [None, "Monday", "Tuesday", "Wednesday", "Thursday", "Friday"][day] 255 | day_filters = [None, 256 | [monday_benign], 257 | [tuesday_ftp_patator, tuesday_ssh_patator], 258 | [wednesday_dos_slowloris, wednesday_dos_slowhttptest, wednesday_dos_hulk, wednesday_dos_goldeneye, 259 | wednesday_heartbleed], 260 | [thursday_web_attack_brute_force, thursday_web_attack_xss, thursday_web_attack_sql_injection, 261 | thursday_web_attack_infiltration], 262 | [friday_botnet, friday_portscan, friday_ddos]][day] 263 | with open(INPUT_DIR + day_str + '-WorkingHours.pcap_Flow.csv', 264 | newline='') as csv_flow: 265 | with open(OUTPUT_DIR + day_str + '-WorkingHours.pcap_REVI.csv', 'w', 266 | newline='') as csv_revised: 267 | spamreader = csv.reader(csv_flow, delimiter=',', quotechar='|') 268 | spamwriter = csv.writer(csv_revised, delimiter=',', quotechar='|') 269 | header = next(spamreader) 270 | spamwriter.writerow(header) 271 | 272 | total = 0 273 | all_attacks = {} 274 | for row in spamreader: 275 | lbl = "BENIGN" 276 | for filter in day_filters: 277 | lbl_attack = filter(row) 278 | if lbl_attack: 279 | lbl = lbl_attack 280 | break 281 | row[-1] = lbl 282 | 283 | if lbl not in all_attacks: 284 | all_attacks[lbl] = 1 285 | else: 286 | all_attacks[lbl] += 1 287 | 288 | spamwriter.writerow(row) 289 | total += 1 290 | # if total % PRINT_STEPS == 0: 291 | # print('> ' + str(total)) 292 | print('REVI Stat ' + day_str + ':') 293 | print(all_attacks) 294 | print('Total: ' + str(total)) 295 | 296 | 297 | def show_all_stats(): 298 | # dataset_stat_attack(5, 'ISCX') 299 | dataset_stat_attack(5, 'REVI') 300 | 301 | 302 | def label_all_datasets(): 303 | for i in range(1, 6): 304 | dataset_labeling(i) 305 | 306 | for i in range(1, 6): 307 | # dataset_stat_attack(i, 'ISCX') 308 | dataset_stat_attack(i, 'REVI') 309 | print('\n') 310 | 311 | 312 | def merge_all_labels(): 313 | for i in range(1, 6): 314 | merge_label(i) 315 | 316 | 317 | if __name__ == '__main__': 318 | label_all_datasets() 319 | # merge_all_labels() 320 | --------------------------------------------------------------------------------