├── .DS_Store
├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   └── vcs.xml
├── Benchmarking_RF.py
├── Class-based_metrics
│   └── .gitignore
├── FeatureImportance
│   └── .gitignore
├── Figures
│   └── .gitignore
├── LICENSE
├── LabelledDataset
│   └── .gitignore
├── MakeDataNumpyFriendly.py
├── NumpyFriendlyData
│   └── .gitignore
├── README.md
├── Scores
│   └── .gitignore
├── UnlabelledDataset
│   └── .gitignore
└── labelling_CSV_flows.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GintsEngelen/WTMC2021-Code/14ee845f0d1c2f5d703d678233e25fb4d051e9d1/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Created by https://www.toptal.com/developers/gitignore/api/python,pycharm+iml
3 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,pycharm+iml
4 |
5 | ### PyCharm+iml ###
6 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
7 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
8 |
9 | # User-specific stuff
10 | .idea/**/workspace.xml
11 | .idea/**/tasks.xml
12 | .idea/**/usage.statistics.xml
13 | .idea/**/dictionaries
14 | .idea/**/shelf
15 |
16 | # Generated files
17 | .idea/**/contentModel.xml
18 |
19 | # Sensitive or high-churn files
20 | .idea/**/dataSources/
21 | .idea/**/dataSources.ids
22 | .idea/**/dataSources.local.xml
23 | .idea/**/sqlDataSources.xml
24 | .idea/**/dynamic.xml
25 | .idea/**/uiDesigner.xml
26 | .idea/**/dbnavigator.xml
27 |
28 | # Gradle
29 | .idea/**/gradle.xml
30 | .idea/**/libraries
31 |
32 | # Gradle and Maven with auto-import
33 | # When using Gradle or Maven with auto-import, you should exclude module files,
34 | # since they will be recreated, and may cause churn. Uncomment if using
35 | # auto-import.
36 | # .idea/artifacts
37 | # .idea/compiler.xml
38 | # .idea/jarRepositories.xml
39 | # .idea/modules.xml
40 | # .idea/*.iml
41 | # .idea/modules
42 | # *.iml
43 | # *.ipr
44 |
45 | # CMake
46 | cmake-build-*/
47 |
48 | # Mongo Explorer plugin
49 | .idea/**/mongoSettings.xml
50 |
51 | # File-based project format
52 | *.iws
53 |
54 | # IntelliJ
55 | out/
56 |
57 | # mpeltonen/sbt-idea plugin
58 | .idea_modules/
59 |
60 | # JIRA plugin
61 | atlassian-ide-plugin.xml
62 |
63 | # Cursive Clojure plugin
64 | .idea/replstate.xml
65 |
66 | # Crashlytics plugin (for Android Studio and IntelliJ)
67 | com_crashlytics_export_strings.xml
68 | crashlytics.properties
69 | crashlytics-build.properties
70 | fabric.properties
71 |
72 | # Editor-based Rest Client
73 | .idea/httpRequests
74 |
75 | # Android studio 3.1+ serialized cache file
76 | .idea/caches/build_file_checksums.ser
77 |
78 | ### PyCharm+iml Patch ###
79 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023
80 |
81 | *.iml
82 | modules.xml
83 | .idea/misc.xml
84 | *.ipr
85 |
86 | ### Python ###
87 | # Byte-compiled / optimized / DLL files
88 | __pycache__/
89 | *.py[cod]
90 | *$py.class
91 |
92 | # C extensions
93 | *.so
94 |
95 | # Distribution / packaging
96 | .Python
97 | build/
98 | develop-eggs/
99 | dist/
100 | downloads/
101 | eggs/
102 | .eggs/
103 | lib/
104 | lib64/
105 | parts/
106 | sdist/
107 | var/
108 | wheels/
109 | pip-wheel-metadata/
110 | share/python-wheels/
111 | *.egg-info/
112 | .installed.cfg
113 | *.egg
114 | MANIFEST
115 |
116 | # PyInstaller
117 | # Usually these files are written by a python script from a template
118 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
119 | *.manifest
120 | *.spec
121 |
122 | # Installer logs
123 | pip-log.txt
124 | pip-delete-this-directory.txt
125 |
126 | # Unit test / coverage reports
127 | htmlcov/
128 | .tox/
129 | .nox/
130 | .coverage
131 | .coverage.*
132 | .cache
133 | nosetests.xml
134 | coverage.xml
135 | *.cover
136 | *.py,cover
137 | .hypothesis/
138 | .pytest_cache/
139 | pytestdebug.log
140 |
141 | # Translations
142 | *.mo
143 | *.pot
144 |
145 | # Django stuff:
146 | *.log
147 | local_settings.py
148 | db.sqlite3
149 | db.sqlite3-journal
150 |
151 | # Flask stuff:
152 | instance/
153 | .webassets-cache
154 |
155 | # Scrapy stuff:
156 | .scrapy
157 |
158 | # Sphinx documentation
159 | docs/_build/
160 | doc/_build/
161 |
162 | # PyBuilder
163 | target/
164 |
165 | # Jupyter Notebook
166 | .ipynb_checkpoints
167 |
168 | # IPython
169 | profile_default/
170 | ipython_config.py
171 |
172 | # pyenv
173 | .python-version
174 |
175 | # pipenv
176 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
177 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
178 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
179 | # install all needed dependencies.
180 | #Pipfile.lock
181 |
182 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
183 | __pypackages__/
184 |
185 | # Celery stuff
186 | celerybeat-schedule
187 | celerybeat.pid
188 |
189 | # SageMath parsed files
190 | *.sage.py
191 |
192 | # Environments
193 | .env
194 | .venv
195 | env/
196 | venv/
197 | ENV/
198 | env.bak/
199 | venv.bak/
200 | pythonenv*
201 |
202 | # Spyder project settings
203 | .spyderproject
204 | .spyproject
205 |
206 | # Rope project settings
207 | .ropeproject
208 |
209 | # mkdocs documentation
210 | /site
211 |
212 | # mypy
213 | .mypy_cache/
214 | .dmypy.json
215 | dmypy.json
216 |
217 | # Pyre type checker
218 | .pyre/
219 |
220 | # pytype static type analyzer
221 | .pytype/
222 |
223 | # profiling data
224 | .prof
225 |
226 | # End of https://www.toptal.com/developers/gitignore/api/python,pycharm+iml
227 |
228 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/Benchmarking_RF.py:
--------------------------------------------------------------------------------
1 | # %%
2 |
3 | # https://stackoverflow.com/questions/48484807/training-a-decision-tree-using-id3-algorithm-by-sklearn
4 | # https://scikit-learn.org/stable/modules/tree.html#tree
5 | # https://medium.com/@mohtedibf/indepth-parameter-tuning-for-decision-tree-6753118a03c3
6 | # https://medium.com/datadriveninvestor/tree-algorithms-id3-c4-5-c5-0-and-cart-413387342164
7 | # https://scikit-learn.org/stable/modules/cross_validation.html
8 | import json
9 |
10 | import matplotlib
11 | from datetime import datetime
12 | import pandas as pd, numpy as np
13 | import math
14 | from sklearn.model_selection import cross_val_predict, KFold, cross_val_score, train_test_split, learning_curve, \
15 | cross_validate
16 | from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, \
17 | make_scorer, recall_score, precision_recall_fscore_support
18 | from sklearn.preprocessing import MinMaxScaler, minmax_scale, scale
19 | import matplotlib.pyplot as plt
20 | from copy import deepcopy
21 | from sklearn.ensemble import RandomForestClassifier
22 | from sklearn.neural_network import MLPClassifier
23 |
24 |
25 | DITCH_DEST_PORT = True # Remove destination port!
26 | MLP = False
27 | RF = True
28 | CR_VAL_TRAIN = False
29 | DATA_VERSION = "no_artefacts_with_payload_filter"
30 | # 3 random states for each dataset iteration: 42, 43, 44
31 | DATA_SPLIT_RANDOM_STATE = 44
32 | RF_RANDOM_STATE = 44
33 |
34 | def gen_id():
35 |     return datetime.utcnow().strftime("%d-%m_%H-%M-%S")
36 |
37 |
38 | def plot_confusion_matrix(y_true, y_pred, classes,
39 | normalize=True,
40 | cmap=plt.cm.Reds,
41 | save=False,
42 | name=None):
43 | # title =
44 |
45 | # Compute confusion matrix
46 | cm2 = confusion_matrix(y_true, y_pred)
47 |
48 | if normalize:
49 | cm = cm2.astype('float') / cm2.sum(axis=1)[:, np.newaxis]
50 | print("Normalized confusion matrix")
51 | else:
52 |         print('Confusion matrix, without normalization'); cm = cm2  # keep cm defined for the plotting code below
53 |
54 | # print(cm)
55 |
56 | fig, ax = plt.subplots(figsize=(9, 9))
57 | im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
58 | # ax.figure.colorbar(im, ax=ax)
59 | # We want to show all ticks...
60 | ax.set(xticks=np.arange(cm.shape[1]),
61 | yticks=np.arange(cm.shape[0]),
62 | # ... and label them with the respective list entries
63 | # xticklabels=classes, yticklabels=classes,
64 | # title=title,
65 | # ylabel='True label',
66 | # xlabel='Predicted label'
67 | )
68 | hfont = {"fontname": "serif"}
69 | fontsize = "x-large"
70 | # ax.set_xlabel('Predicted', fontsize=fontsize,**hfont),
71 | # ax.set_ylabel('True', fontsize=fontsize,**hfont),
72 | ax.set_xticklabels(classes, fontsize=fontsize, **hfont)
73 | ax.set_yticklabels(classes, fontsize=fontsize, **hfont, fontweight='bold')
74 |
75 | # Rotate the tick labels and set their alignment.
76 | plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
77 | rotation_mode="anchor")
78 |
79 | # Loop over data dimensions and create text annotations.
80 | fmt = '.2f' if normalize else 'd'
81 | thresh = cm.max() / 2.
82 | for i in range(cm.shape[0]):
83 | for j in range(cm.shape[1]):
84 | ax.text(j, i, format(cm2[i, j], 'd'), # format(cm[i, j], fmt)
85 | ha="center", va="center",
86 | color="white" if cm[i, j] > thresh else "black")
87 | fig.tight_layout()
88 |
89 | if save:
90 | fig.savefig("Figures/" + name + ".pdf", dpi=400,
91 | bbox_inches='tight', pad_inches=0)
92 |
93 | plt.show()
94 | return ax
95 |
96 |
97 | # https://stackoverflow.com/questions/35249760/using-scikit-to-determine-contributions-of-each-feature-to-a-specific-class-pred
98 | def class_feature_importance(X, Y, feature_importances):
99 | N, M = X.shape
100 | X = scale(X)
101 |
102 | out = {}
103 | for c in set(Y):
104 | out[c] = dict(
105 | zip(range(N), np.mean(X[Y == c, :], axis=0) * feature_importances)
106 | )
107 |
108 | return out
109 |
110 |
111 | def load_data():
112 | print("Loading training data ...")
113 | full_dataset = np.load("NumpyFriendlyData/full_dataset_" + DATA_VERSION + ".npy")
114 |
115 | full_dataset = full_dataset[~np.isnan(full_dataset).any(axis=1)]
116 | full_dataset = full_dataset[~np.isinf(full_dataset).any(axis=1)]
117 |
118 | data_x = full_dataset[:, :-1]
119 | data_y = full_dataset[:, -1]
120 |
121 | print(np.unique(data_y, return_counts=True))
122 |
123 | if DITCH_DEST_PORT:
124 | data_x = data_x[:, 1:] # Dest Port index = 0
125 |
126 | splits = train_test_split(data_x, data_y, test_size=0.25, stratify=data_y,
127 | random_state=DATA_SPLIT_RANDOM_STATE)
128 | return splits
129 |
130 |
131 | if __name__ == "__main__":
132 |
133 | time_id = gen_id()
134 |
135 | (X_train, X_test, Y_train, Y_test) = load_data()
136 | print(X_train.shape, X_test.shape)
137 |
138 | # from sklearn.tree import DecisionTreeClassifier
139 | # print("Decision Tree")
140 | # clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
141 |
142 | if MLP:
143 | #X_train = minmax_scale(X_train)
144 | #X_test = minmax_scale(X_test)
145 |
146 | print("Applying minmax-scaling on train and test set")
147 | scaler = MinMaxScaler()
148 | scaler.fit(X_train)
149 | X_train = scaler.transform(X_train)
150 | X_test = scaler.transform(X_test)
151 |
152 | print("Multilayered Perceptron")
153 | mlp_classifier = MLPClassifier(hidden_layer_sizes=(156,78,39))
154 |
155 | print("wrong script for MLP")
156 | exit(0)
157 |
158 | scoring = {
159 | 'accuracy': make_scorer(accuracy_score),
160 | 'precision': make_scorer(precision_score, average='weighted'),
161 | 'f1_score': make_scorer(f1_score, average='weighted'),
162 | 'recall': make_scorer(recall_score, average='weighted')
163 | }
164 |
165 | print("Random Forest")
166 | rf_classifier = RandomForestClassifier(n_estimators=50, max_depth=20, random_state=RF_RANDOM_STATE)
167 |
168 | if CR_VAL_TRAIN:
169 | print("Cross validating ...")
170 | sc = cross_validate(rf_classifier, X_train, Y_train, cv=5, scoring=scoring)
171 | print("Score:\n", sc)
172 |
173 | # print("Fit time: " % (sc['fit_time']))
174 | # print("Score time: " % (sc['score_time']))
175 |
176 | print("Precision: %0.8f (%0.8f)" % (sc['test_precision'].mean(), sc['test_precision'].std()))
177 | print("Recall: %0.8f (%0.8f)" % (sc['test_recall'].mean(), sc['test_recall'].std()))
178 | print("F1_score: %0.8f (%0.8f)" % (sc['test_f1_score'].mean(), sc['test_f1_score'].std()))
179 | print("Accuracy: %0.8f (%0.8f)" % (sc['test_accuracy'].mean(), sc['test_accuracy'].std()))
180 |
181 | print("Fitting model ...")
182 | rf_classifier.fit(X_train, Y_train)
183 |
184 | Y_pred = rf_classifier.predict(X_test)
185 |
186 | nY_test = []
187 | nY_pred = []
188 | for i in range(len(Y_test)):
189 | nY_test += [Y_test[i]]
190 | nY_pred += [Y_pred[i]]
191 | Y_test = np.array(nY_test)
192 | Y_pred = np.array(nY_pred)
193 | print(Y_test.shape, Y_pred.shape)
194 |
195 | # %%
196 |
197 | classes = ["Benign", "FTP-Patator", "SSH-Patator", "DoS GoldenEye", "DoS Hulk", "DoS Slowhttptest", "DoS slowloris",
198 | "Heartbleed", "Web Attack - Brute Force", "Web Attack - XSS", "Web Attack - Sql Injection", "Infiltration",
199 | "Bot", "PortScan", "DDoS"]
200 |
201 | plot_confusion_matrix(Y_test, Y_pred, classes, save=True, name="RF_" + DATA_VERSION + "_" + time_id )
202 |
203 | prfs = precision_recall_fscore_support(Y_test, Y_pred, average='weighted')
204 | print("Precision, Recall, F-Score, Support:", prfs)
205 |
206 | # %%
207 |
208 | plot_confusion_matrix(Y_test, Y_pred, classes)
209 |
210 | prfs = precision_recall_fscore_support(Y_test, Y_pred, average='weighted')
211 | print("Precision, Recall, F-Score, Support:", prfs)
212 | with open("Scores/RF_" + DATA_VERSION + "_" + time_id + "_metrics_aggregated.txt", 'w') as out_file:
213 | out_file.write("Precision, Recall, F-Score, Support: " + str(prfs))
214 |
215 | Y_pred = list(Y_pred)
216 |
217 | np.save("Class-based_metrics/Y_test_" + DATA_VERSION + "_" + time_id, Y_test)
218 | np.save("Class-based_metrics/Y_pred_" + DATA_VERSION + "_" + time_id, Y_pred)
219 |
220 | class_based_metrics = classification_report(Y_test, Y_pred, target_names=classes, zero_division="warn", digits=4)
221 | print("Class based metrics:\r\n", class_based_metrics)
222 | with open("Scores/RF_" + DATA_VERSION + "_metrics_class_based_" + time_id + ".txt", 'w') as out_file:
223 | out_file.write(class_based_metrics)
224 |
225 | # This next part is to calculate the feature importance for a RF classifier. Comment this out if you're using MLP
226 |
227 | feature_importances = rf_classifier.feature_importances_
228 |
229 | result = class_feature_importance(X_test, Y_pred, feature_importances)
230 |
231 | print(json.dumps(result, indent=4))
232 |
233 | with open("FeatureImportance/feature_importance_full_dataset_" + DATA_VERSION + "_" + time_id + ".json", 'w') as f:
234 | json.dump(result, f)
235 |
236 | '''
237 | features_list = "Dst Port,Protocol,Flow Duration,Total Fwd Packet," \
238 | "Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max," \
239 | "Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max," \
240 | "Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s," \
241 | "Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std," \
242 | "Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min," \
243 | "Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length," \
244 | "Fwd Packets/s,Bwd Packets/s,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std," \
245 | "Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count," \
246 | "URG Flag Count,CWR Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Fwd Segment Size Avg," \
247 | "Bwd Segment Size Avg,Fwd Bytes/Bulk Avg,Fwd Packet/Bulk Avg,Fwd Bulk Rate Avg,Bwd Bytes/Bulk Avg," \
248 | "Bwd Packet/Bulk Avg,Bwd Bulk Rate Avg,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets," \
249 | "Subflow Bwd Bytes,FWD Init Win Bytes,Bwd Init Win Bytes,Fwd Act Data Pkts,Fwd Seg Size Min," \
250 | "Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label".split(',')
251 |
252 | label_dictionary = {
253 | 'BENIGN': '0',
254 | 'FTP-Patator': '1',
255 | 'SSH-Patator': '2',
256 | 'DoS GoldenEye': '3',
257 | 'DoS Hulk': '4',
258 | 'DoS Slowhttptest': '5',
259 | 'DoS slowloris': '6',
260 | 'Heartbleed': '7',
261 | 'Web Attack – Brute Force': '8',
262 | 'Web Attack – XSS': '9',
263 | 'Web Attack – Sql Injection': '10',
264 | 'Infiltration': '11',
265 | 'Bot': '12',
266 | 'PortScan': '13',
267 | 'DDoS': '14'
268 | }
269 | '''
270 |
--------------------------------------------------------------------------------
/Class-based_metrics/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | */
3 | !.gitignore
--------------------------------------------------------------------------------
/FeatureImportance/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | */
3 | !.gitignore
--------------------------------------------------------------------------------
/Figures/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | */
3 | !.gitignore
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 Gints Engelen (gints.engelen@kuleuven.be)
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 |
21 | # When using this code, please cite our paper:
22 | G. Engelen, V. Rimmer, W. Joosen, "Troubleshooting an Intrusion Detection Dataset: the CICIDS2017 Case Study", 2021 IEEE European Symposium on Security and Privacy Workshops (EuroS&PW), 2021.
23 |
24 | Our paper as well as its extended documentation can be found at https://downloads.distrinet-research.be/WTMC2021/
25 |
26 | # Contributors
27 |
28 | For labelling_CSV_flows.py:
29 | Jin Li
30 | Vera Rimmer
31 | Gints Engelen
32 |
33 | For all other code:
34 | Gints Engelen
35 | Vera Rimmer
--------------------------------------------------------------------------------
/LabelledDataset/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | */
3 | !.gitignore
--------------------------------------------------------------------------------
/MakeDataNumpyFriendly.py:
--------------------------------------------------------------------------------
1 | '''
2 | This should be run after all CSV files are fully labelled.
3 | The script does two things:
4 | 1. It removes flow id, source ip, destination ip, source port, and timestamp as features
5 | 2. It converts all labels to numerical labels
6 | '''
7 | import copy
8 | import csv
9 | import pandas as pd
10 | import numpy as np
11 |
12 | dataset_directory = 'LabelledDataset'
13 | saved_numpy_name = 'full_dataset_no_artefacts_with_payload_filter.npy'
14 |
15 |
16 | def importCsvAsDict(path):
17 | print('Importing from ', path)
18 | csvfile = csv.DictReader(open(path), delimiter=',')
19 | return [x for x in csvfile]
20 |
21 |
22 | def convertToNumericalLabels(flows_list_of_dict):
23 | print('Relabelling flows')
24 |
25 | label_dictionary = {
26 | 'BENIGN': '0',
27 | 'FTP-Patator': '1',
28 | 'SSH-Patator': '2',
29 | 'DoS GoldenEye': '3',
30 | 'DoS Hulk': '4',
31 | 'DoS Slowhttptest': '5',
32 | 'DoS slowloris': '6',
33 | 'Heartbleed': '7',
34 | 'Web Attack - Brute Force': '8',
35 | 'Web Attack - XSS': '9',
36 | 'Web Attack - Sql Injection': '10',
37 | 'Infiltration': '11',
38 | 'Bot': '12',
39 | 'PortScan': '13',
40 | 'DDoS': '14',
41 | # IMPORTANT NOTE: For our experiments, we treated all "X - Attempted" flows as BENIGN. If you want to keep the
42 | # "X - Attempted" flows separate, please change the values corresponding to the keys below
43 | 'FTP-Patator - Attempted' : '0',
44 | 'SSH-Patator - Attempted' : '0',
45 | 'DoS GoldenEye - Attempted' : '0',
46 | 'DoS Hulk - Attempted' : '0',
47 | 'DoS Slowhttptest - Attempted' : '0',
48 | 'DoS slowloris - Attempted' : '0',
49 | 'Heartbleed - Attempted' : '0',
50 | 'Web Attack - Brute Force - Attempted' : '0',
51 | 'Web Attack - XSS - Attempted' : '0',
52 | 'Web Attack - Sql Injection - Attempted' : '0',
53 | 'Infiltration - Attempted' : '0',
54 | 'Bot - Attempted' : '0',
55 | # Note that PortScan doesn't have any 'Attempted' flows because it doesn't rely on a payload transfer for its
56 | # effectiveness
57 | 'DDoS - Attempted' : '0'
58 | }
59 |
60 | for (index, row) in enumerate(flows_list_of_dict):
61 | current_label = row['Label']
62 | flows_list_of_dict[index]['Label'] = label_dictionary[current_label]
63 |
64 |
65 | def listOfDictToNumpyArray(list_of_dict):
66 | dataframe = pd.DataFrame(list_of_dict)
67 | numpy_string_array = dataframe.values
68 | # See point 1 in the description at the top of the file
69 | trimmed_values = np.concatenate((numpy_string_array[:, 4:6], numpy_string_array[:, 7:]), axis=1)
70 |     return trimmed_values.astype(float)  # np.float was removed in newer NumPy releases
71 |
72 |
73 | print("monday")
74 | monday_dict = importCsvAsDict(dataset_directory + '/Monday-WorkingHours.pcap_REVI.csv')
75 | convertToNumericalLabels(monday_dict)
76 | monday_numpy_array = listOfDictToNumpyArray(monday_dict)
77 |
78 | print("tuesday")
79 | tuesday_dict = importCsvAsDict(dataset_directory + '/Tuesday-WorkingHours.pcap_REVI.csv')
80 | convertToNumericalLabels(tuesday_dict)
81 | tuesday_numpy_array = listOfDictToNumpyArray(tuesday_dict)
82 |
83 | print("wednesday")
84 | wednesday_dict = importCsvAsDict(dataset_directory + '/Wednesday-WorkingHours.pcap_REVI.csv')
85 | convertToNumericalLabels(wednesday_dict)
86 | wednesday_numpy_array = listOfDictToNumpyArray(wednesday_dict)
87 |
88 | print("thursday")
89 | thursday_dict = importCsvAsDict(dataset_directory + '/Thursday-WorkingHours.pcap_REVI.csv')
90 | convertToNumericalLabels(thursday_dict)
91 | thursday_numpy_array = listOfDictToNumpyArray(thursday_dict)
92 |
93 | print("friday")
94 | friday_dict = importCsvAsDict(dataset_directory + '/Friday-WorkingHours.pcap_REVI.csv')
95 | convertToNumericalLabels(friday_dict)
96 | friday_numpy_array = listOfDictToNumpyArray(friday_dict)
97 |
98 | full_dataset = np.concatenate((monday_numpy_array, tuesday_numpy_array, wednesday_numpy_array, thursday_numpy_array,
99 | friday_numpy_array), axis=0)
100 |
101 | print("saving dataset")
102 | np.save('NumpyFriendlyData/' + saved_numpy_name, full_dataset)
103 |
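104 | # Optional sanity check (a sketch; assumes the np.save call above succeeded): reload
105 | # the saved array and report its shape and per-class label counts. As described at the
106 | # top of this file, the numerical label is stored in the last column.
107 | check = np.load('NumpyFriendlyData/' + saved_numpy_name)
108 | print("dataset shape:", check.shape)
109 | labels, counts = np.unique(check[:, -1], return_counts=True)
110 | print("label counts:", dict(zip(labels.astype(int), counts)))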
--------------------------------------------------------------------------------
/NumpyFriendlyData/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | */
3 | !.gitignore
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Troubleshooting an Intrusion Detection Dataset: the CICIDS2017 Case Study
2 |
3 | This repository contains the code used for our [paper](https://downloads.distrinet-research.be/WTMC2021/Resources/wtmc2021_Engelen_Troubleshooting.pdf).
4 | The code performs the labelling and benchmarking for the [CICIDS 2017 dataset](https://www.unb.ca/cic/datasets/ids-2017.html)
5 | after it has been processed by [our modified version of the CICFlowMeter tool](https://github.com/GintsEngelen/CICFlowMeter).
6 |
7 | Note that all of this is *research code*.
8 |
9 | If you use the code in this repository, please cite our paper:
10 |
11 | @inproceedings{engelen2021troubleshooting,
12 | title={Troubleshooting an Intrusion Detection Dataset: the CICIDS2017 Case Study},
13 | author={Engelen, Gints and Rimmer, Vera and Joosen, Wouter},
14 | booktitle={2021 IEEE Security and Privacy Workshops (SPW)},
15 | pages={7--12},
16 | year={2021},
17 | organization={IEEE}
18 | }
19 |
20 | An extended documentation of our paper can be found [here](https://downloads.distrinet-research.be/WTMC2021/).
21 |
22 | ## How to use this repository
23 |
24 | First, head over to the website of the [CICIDS 2017 dataset](https://www.unb.ca/cic/datasets/ids-2017.html) and download
25 | the raw version of the dataset (PCAP file format). There are 5 files in total, one for each day.
26 |
27 | Then, run our [our modified version of the CICFlowMeter tool](https://github.com/GintsEngelen/CICFlowMeter) on the data
28 | obtained in the previous step:
29 |
30 | 1. Start the CICFlowMeter tool
31 | 2. Under the "NetWork" menu option, select "Offline"
32 | 3. For "Pcap dir", choose the directory containing the 5 PCAP files of the CICIDS 2017 dataset
33 | 4. For "Output dir", choose the "UnlabelledDataset" directory of this WTCM2021-Code project.
34 | 5. Keep the default values for the "Flow TimeOut" and "Activity Timeout" parameters (120000000 and 5000000 respectively)
35 |
36 | This will generate 5 CSV files with the flows extracted from the raw PCAP files.
37 |
38 | After this, verify the `TIME_DIFFERENCE`, `INPUT_DIR`, `OUTPUT_DIR` and `PAYLOAD_FILTER_ACTIVE` attributes in the
39 | `labelling_CSV_flows.py` script, and then run it (no need to specify any command-line options). This will label all the
40 | flows in the CSV files generated by the CICFlowMeter tool.
41 |
42 | Then, run the `MakeDataNumpyFriendly.py` script, which will convert the labelled CSV files into a single numpy array.
43 | Note that, in our experiments, we chose to relabel all "Attempted" flows as BENIGN. If you wish to keep them separate,
44 | make sure to change the numerical labels in the `convertToNumericalLabels(flows_list_of_dict)` function.
45 |
46 | Finally, run the `Benchmarking_RF.py` script to perform benchmarking on the dataset using a Random Forest classifier.
47 | Random seeds and various other options can be specified in the first few lines of the script.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Scores/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | */
3 | !.gitignore
--------------------------------------------------------------------------------
/UnlabelledDataset/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | */
3 | !.gitignore
--------------------------------------------------------------------------------
/labelling_CSV_flows.py:
--------------------------------------------------------------------------------
1 | import csv
2 | from datetime import datetime
3 | from datetime import timedelta
4 |
5 | DAY_STR = [None, "Monday", "Tuesday", "Wednesday", "Thursday", "Friday"]
6 |
7 | PRINT_STEPS = 3000
8 | DATE_FORMAT_INTERNAL = '%d/%m/%Y %I:%M:%S %p'
9 | DATE_FORMAT_DATASET = '%d/%m/%Y %I:%M:%S %p'
10 | # The CICIDS 2017 dataset was generated in New Brunswick, Canada. Running the CICFlowMeter tool on this data automatically
11 | # converts all timestamps in the data from the timezone of New Brunswick, Canada, to the timezone of the host running
12 | # the CICFlowMeter tool. The TIME_DIFFERENCE attribute specifies the time difference between these two timezones.
13 | # Specifically: TIME_DIFFERENCE = {CICFlowMeter host timezone} - {New Brunswick, Canada timezone};
14 | # e.g. a host in CEST (UTC+2) vs. New Brunswick in ADT (UTC-3) gives (+2) - (-3) = timedelta(hours=5).
14 | TIME_DIFFERENCE = timedelta(hours=5)
15 |
16 | INPUT_DIR = 'UnlabelledDataset/'
17 | OUTPUT_DIR = 'LabelledDataset/'
18 |
19 | # Some attack categories rely on transfer of a payload in order to be effective. When a malicious flow belongs to such a
20 | # category but doesn't contain a payload, setting this filter to True will label these flows as "X - Attempted" with "X"
21 | # the original attack class. Setting this filter to False will simply label the flow as part of the attack category.
22 | PAYLOAD_FILTER_ACTIVE = True
23 |
24 |
25 | # DATE_FORMAT_DATASET = '%d/%m/%Y %H:%M'
26 | # TIME_DIFFERENCE = timedelta(hours=0)
27 |
28 |
29 | def merge_label(day):
30 | day_str = DAY_STR[day] # 3-reorganize
31 | with open('G:\\Datasets\\CICIDS2017-PCAPs\\2-labeling\\' + day_str + '-WorkingHours.pcap_REVI.csv') as csv_flow:
32 | spamreader = csv.reader(csv_flow, delimiter=',', quotechar='|')
33 | next(spamreader)
34 | total = 0
35 | with open('G:\\Datasets\\CICIDS2017-PCAPs\\2-labeling\\' + day_str + '-WorkingHours.pcap_SeqInfo.txt',
36 | 'r') as txt_input:
37 | with open('G:\\Datasets\\CICIDS2017-PCAPs\\2-labeling\\' + day_str + '-WorkingHours.pcap_SeqInfoLabel.txt',
38 | 'w') as txt_output:
39 | for row_seq in txt_input:
40 | txt_row = row_seq.split(';')
41 | csv_row = next(spamreader)
42 | assert (txt_row[0] == csv_row[-1]) # same uid
43 |
44 | txt_row.insert(1, csv_row[-1]) # insert label into text file
45 | txt_output.write(';'.join(txt_row))
46 | txt_output.flush()
47 |
48 | total += 1
49 | print(day_str + " merged")
50 |
51 |
52 | def dataset_stat_attack(day, ver='ISCX'):
53 | day_str = DAY_STR[day]
54 | col = -1 # if ver == 'ISCX' else -2
55 | with open(OUTPUT_DIR + day_str + '-WorkingHours.pcap_' + ver + '.csv',
56 | newline='') as csv_flow:
57 | spamreader = csv.reader(csv_flow, delimiter=',', quotechar='|')
58 | next(spamreader)
59 | total = 0
60 | all_attacks = {}
61 | for row in spamreader:
62 | lbl_attack = row[col]
63 | if lbl_attack not in all_attacks:
64 | all_attacks[lbl_attack] = 1
65 | else:
66 | all_attacks[lbl_attack] += 1
67 | total += 1
68 | # if total % PRINT_STEPS == 0:
69 | # print('> ' + str(total))
70 | print(ver + ' Stat ' + day_str + ':')
71 | print(all_attacks)
72 | print('Total: ' + str(total))
73 |
74 |
75 | # row = a row in the CSV file, corresponding to one flow
76 | # attack_class = String name of the attack class
77 | # Returns a string of the attack class if it passes through the filter
78 | # Returns "X - Attempted" with X the attack_class if the flow is a TCP flow and does not contain any data transfer in
79 | # the forward direction.
80 | # Note that if the payload filter is not active, or the underlying protocol is not TCP, it returns the attack class
81 | # by default.
82 | def payload_filter(row, attack_class):
83 | # row[10] = total Length of payload bytes in Fwd direction
84 | # row[5] = Protocol, we only want TCP connections, 6 = TCP
85 | if PAYLOAD_FILTER_ACTIVE and int(row[5]) == 6:
86 | if float(row[10]) > 0.0:
87 | return attack_class
88 | else:
89 | return attack_class + " - Attempted"
90 | else:
91 | return attack_class
92 |
93 |
94 | def monday_benign(_):
95 | return "BENIGN"
96 |
97 |
98 | def tuesday_ftp_patator(row):
99 | t_start = datetime.strptime('04/07/2017 09:17:00 AM', DATE_FORMAT_INTERNAL)
100 | t_end = datetime.strptime('04/07/2017 10:30:00 AM', DATE_FORMAT_INTERNAL)
101 | attacker = '172.16.0.1'
102 | victim = '192.168.10.50'
103 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
104 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
105 | return payload_filter(row, "FTP-Patator")
106 | return None
107 |
108 |
109 | def tuesday_ssh_patator(row):
110 | t_start = datetime.strptime('04/07/2017 01:00:00 PM', DATE_FORMAT_INTERNAL)
111 | t_end = datetime.strptime('04/07/2017 04:11:00 PM', DATE_FORMAT_INTERNAL)
112 | attacker = '172.16.0.1'
113 | victim = '192.168.10.50'
114 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
115 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
116 | return payload_filter(row, "SSH-Patator")
117 | return None
118 |
119 |
120 | def wednesday_dos_slowloris(row):
121 | t_start = datetime.strptime('05/07/2017 02:23:00 AM', DATE_FORMAT_INTERNAL)
122 | t_end = datetime.strptime('05/07/2017 10:12:59 AM', DATE_FORMAT_INTERNAL)
123 | attacker = '172.16.0.1'
124 | victim = '192.168.10.50'
125 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
126 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
127 | return payload_filter(row, "DoS slowloris")
128 | return None
129 |
130 |
131 | def wednesday_dos_slowhttptest(row):
132 | t_start = datetime.strptime('05/07/2017 10:13:00 AM', DATE_FORMAT_INTERNAL)
133 | t_end = datetime.strptime('05/07/2017 10:38:00 AM', DATE_FORMAT_INTERNAL)
134 | attacker = '172.16.0.1'
135 | victim = '192.168.10.50'
136 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
137 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
138 | return payload_filter(row, "DoS Slowhttptest")
139 | return None
140 |
141 |
142 | def wednesday_dos_hulk(row):
143 | t_start = datetime.strptime('05/07/2017 10:39:00 AM', DATE_FORMAT_INTERNAL)
144 | t_end = datetime.strptime('05/07/2017 11:09:00 AM', DATE_FORMAT_INTERNAL)
145 | attacker = '172.16.0.1'
146 | victim = '192.168.10.50'
147 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
148 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
149 | return payload_filter(row, "DoS Hulk")
150 | return None
151 |
152 |
153 | def wednesday_dos_goldeneye(row):
154 | t_start = datetime.strptime('05/07/2017 11:10:00 AM', DATE_FORMAT_INTERNAL)
155 | t_end = datetime.strptime('05/07/2017 11:23:00 AM', DATE_FORMAT_INTERNAL)
156 | attacker = '172.16.0.1'
157 | victim = '192.168.10.50'
158 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
159 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
160 | return payload_filter(row, "DoS GoldenEye")
161 | return None
162 |
163 |
164 | def wednesday_heartbleed(row):
165 | t_start = datetime.strptime('05/07/2017 03:11:00 PM', DATE_FORMAT_INTERNAL)
166 | t_end = datetime.strptime('05/07/2017 03:33:00 PM', DATE_FORMAT_INTERNAL)
167 | attacker = '172.16.0.1'
168 | victim = '192.168.10.51'
169 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
170 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end and row[4] == '444':
171 | return payload_filter(row, "Heartbleed")
172 | return None
173 |
174 |
175 | def thursday_web_attack_brute_force(row):
176 | t_start = datetime.strptime('06/07/2017 09:10:00 AM', DATE_FORMAT_INTERNAL)
177 | t_end = datetime.strptime('06/07/2017 10:12:00 AM', DATE_FORMAT_INTERNAL)
178 | attacker = '172.16.0.1'
179 | victim = '192.168.10.50'
180 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
181 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
182 | return payload_filter(row, "Web Attack - Brute Force")
183 | return None
184 |
185 |
186 | def thursday_web_attack_xss(row):
187 | t_start = datetime.strptime('06/07/2017 10:13:00 AM', DATE_FORMAT_INTERNAL)
188 | t_end = datetime.strptime('06/07/2017 10:37:00 AM', DATE_FORMAT_INTERNAL)
189 | attacker = '172.16.0.1'
190 | victim = '192.168.10.50'
191 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
192 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
193 | return payload_filter(row, "Web Attack - XSS")
194 | return None
195 |
196 |
197 | def thursday_web_attack_sql_injection(row):
198 | t_start = datetime.strptime('06/07/2017 10:39:00 AM', DATE_FORMAT_INTERNAL)
199 | t_end = datetime.strptime('06/07/2017 10:45:00 AM', DATE_FORMAT_INTERNAL)
200 | attacker = '172.16.0.1'
201 | victim = '192.168.10.50'
202 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
203 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
204 | return payload_filter(row, "Web Attack - Sql Injection")
205 | return None
206 |
207 |
208 | def thursday_web_attack_infiltration(row):
209 | t_start = datetime.strptime('06/07/2017 02:15:00 PM', DATE_FORMAT_INTERNAL)
210 | t_end = datetime.strptime('06/07/2017 03:50:00 PM', DATE_FORMAT_INTERNAL)
211 | attacker = '192.168.10.8'
212 | victim = '205.174.165.73'
213 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
214 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
215 | return payload_filter(row, "Infiltration")
216 | return None
217 |
218 |
219 | def friday_botnet(row):
220 | t_start = datetime.strptime('07/07/2017 09:30:00 AM', DATE_FORMAT_INTERNAL)
221 | t_end = datetime.strptime('07/07/2017 12:59:59 PM', DATE_FORMAT_INTERNAL)
222 | cond_hosts = (row[1] == '205.174.165.73' or row[3] == '205.174.165.73') or (
223 | row[1] == '192.168.10.17' and row[3] == '52.7.235.158') or (
224 | row[1] == '192.168.10.12' and row[3] == '52.6.13.28')
225 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
226 | if t_start <= t_flow <= t_end and cond_hosts and (row[2] == '8080' or row[4] == '8080') and row[5] == '6':
227 | return payload_filter(row, "Bot")
228 | return None
229 |
230 |
231 | def friday_portscan(row):
232 | t_start = datetime.strptime('07/07/2017 12:30:00 PM', DATE_FORMAT_INTERNAL)
233 | t_end = datetime.strptime('07/07/2017 03:40:00 PM', DATE_FORMAT_INTERNAL)
234 | attacker = '172.16.0.1'
235 | victim = '192.168.10.50'
236 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
237 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
238 | return "PortScan"
239 | return None
240 |
241 |
242 | def friday_ddos(row):
243 | t_start = datetime.strptime('07/07/2017 03:40:00 PM', DATE_FORMAT_INTERNAL)
244 | t_end = datetime.strptime('07/07/2017 04:30:00 PM', DATE_FORMAT_INTERNAL)
245 | attacker = '172.16.0.1'
246 | victim = '192.168.10.50'
247 | t_flow = datetime.strptime(row[6], DATE_FORMAT_DATASET) - TIME_DIFFERENCE
248 | if row[1] == attacker and row[3] == victim and t_start <= t_flow <= t_end:
249 | return payload_filter(row, "DDoS")
250 | return None
251 |
252 |
253 | def dataset_labeling(day):
254 | day_str = [None, "Monday", "Tuesday", "Wednesday", "Thursday", "Friday"][day]
255 | day_filters = [None,
256 | [monday_benign],
257 | [tuesday_ftp_patator, tuesday_ssh_patator],
258 | [wednesday_dos_slowloris, wednesday_dos_slowhttptest, wednesday_dos_hulk, wednesday_dos_goldeneye,
259 | wednesday_heartbleed],
260 | [thursday_web_attack_brute_force, thursday_web_attack_xss, thursday_web_attack_sql_injection,
261 | thursday_web_attack_infiltration],
262 | [friday_botnet, friday_portscan, friday_ddos]][day]
263 | with open(INPUT_DIR + day_str + '-WorkingHours.pcap_Flow.csv',
264 | newline='') as csv_flow:
265 | with open(OUTPUT_DIR + day_str + '-WorkingHours.pcap_REVI.csv', 'w',
266 | newline='') as csv_revised:
267 | spamreader = csv.reader(csv_flow, delimiter=',', quotechar='|')
268 | spamwriter = csv.writer(csv_revised, delimiter=',', quotechar='|')
269 | header = next(spamreader)
270 | spamwriter.writerow(header)
271 |
272 | total = 0
273 | all_attacks = {}
274 | for row in spamreader:
275 | lbl = "BENIGN"
276 | for filter in day_filters:
277 | lbl_attack = filter(row)
278 | if lbl_attack:
279 | lbl = lbl_attack
280 | break
281 | row[-1] = lbl
282 |
283 | if lbl not in all_attacks:
284 | all_attacks[lbl] = 1
285 | else:
286 | all_attacks[lbl] += 1
287 |
288 | spamwriter.writerow(row)
289 | total += 1
290 | # if total % PRINT_STEPS == 0:
291 | # print('> ' + str(total))
292 | print('REVI Stat ' + day_str + ':')
293 | print(all_attacks)
294 | print('Total: ' + str(total))
295 |
296 |
297 | def show_all_stats():
298 | # dataset_stat_attack(5, 'ISCX')
299 | dataset_stat_attack(5, 'REVI')
300 |
301 |
302 | def label_all_datasets():
303 | for i in range(1, 6):
304 | dataset_labeling(i)
305 |
306 | for i in range(1, 6):
307 | # dataset_stat_attack(i, 'ISCX')
308 | dataset_stat_attack(i, 'REVI')
309 | print('\n')
310 |
311 |
312 | def merge_all_labels():
313 | for i in range(1, 6):
314 | merge_label(i)
315 |
316 |
317 | if __name__ == '__main__':
318 | label_all_datasets()
319 | # merge_all_labels()
320 |
--------------------------------------------------------------------------------