"""NSL-KDD intrusion detection — data loading and categorical-feature encoding.

Notebook-export flat script: downloads the NSL-KDD train/test splits, names
the 41 features plus the attack label, reports label/category distributions,
and label-encodes the three categorical columns as a first step toward
one-hot encoding.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import sys
import sklearn
import io
import random

# Raw CSV mirrors of the NSL-KDD splits (the files carry no header row).
train_url = "https://raw.githubusercontent.com/merteroglu/NSL-KDD-Network-Instrusion-Detection/master/NSL_KDD_Train.csv"
test_url = "https://raw.githubusercontent.com/merteroglu/NSL-KDD-Network-Instrusion-Detection/master/NSL_KDD_Test.csv"

# The 41 NSL-KDD feature names plus the attack label, in file column order.
col_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins",
    "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files",
    "num_outbound_cmds", "is_host_login", "is_guest_login", "count",
    "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate",
    "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
    "dst_host_srv_serror_rate", "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate", "label",
]

df = pd.read_csv(train_url, header=None, names=col_names)
df_test = pd.read_csv(test_url, header=None, names=col_names)
print("Dimensions of the Training set:", df.shape)
print("Dimensions of the Test set:", df_test.shape)
df.head(5)
print("Label distribution Training set:")
print(df["label"].value_counts())
print()
print("Label distribution Test set:")
print(df_test["label"].value_counts())

# Report the cardinality of every object-dtype (categorical) column.
print("Training set:")
for col_name in df.columns:
    if df[col_name].dtypes == "object":
        unique_cat = len(df[col_name].unique())
        print(f"Feature '{col_name}' has {unique_cat} categories")
print()
print("Distribution of categories in service:")
print(df["service"].value_counts().sort_values(ascending=False).head())
print("Test set:")
for col_name in df_test.columns:
    if df_test[col_name].dtypes == "object":
        unique_cat = len(df_test[col_name].unique())
        print(f"Feature '{col_name}' has {unique_cat} categories")

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Dummy-column name lists for the one-hot step that follows.
categorical_columns = ["protocol_type", "service", "flag"]
df_categorical_values = df[categorical_columns]
testdf_categorical_values = df_test[categorical_columns]
df_categorical_values.head()
unique_protocol2 = ["Protocol_type_" + x for x in sorted(df.protocol_type.unique())]
print(unique_protocol2)
unique_service2 = ["service_" + x for x in sorted(df.service.unique())]
print(unique_service2)
unique_flag2 = ["flag_" + x for x in sorted(df.flag.unique())]
print(unique_flag2)
dumcols = unique_protocol2 + unique_service2 + unique_flag2
# The test split lacks some services, so its dummy-column list differs.
unique_service2_test = ["service_" + x for x in sorted(df_test.service.unique())]
testdumcols = unique_protocol2 + unique_service2_test + unique_flag2
df_categorical_values_enc = df_categorical_values.apply(LabelEncoder().fit_transform)
print(df_categorical_values.head())
print("--------------------")
print(df_categorical_values_enc.head())
testdf_categorical_values_enc = testdf_categorical_values.apply(
    LabelEncoder().fit_transform
)

# One-hot encode the label-encoded categoricals.
# NOTE(review): the encoder is re-fitted on the test split instead of reusing
# the train-split fit; the resulting column mismatch (the test split lacks
# some services) is patched below by appending all-zero columns.
enc = OneHotEncoder(categories="auto")
df_categorical_values_encenc = enc.fit_transform(df_categorical_values_enc)
df_cat_data = pd.DataFrame(df_categorical_values_encenc.toarray(), columns=dumcols)
testdf_categorical_values_encenc = enc.fit_transform(testdf_categorical_values_enc)
testdf_cat_data = pd.DataFrame(
    testdf_categorical_values_encenc.toarray(), columns=testdumcols
)
df_cat_data.head()

# Services present in train but absent from test get zero-filled dummy
# columns so both one-hot frames end up with an identical column set.
difference = ["service_" + s for s in set(df["service"]) - set(df_test["service"])]
for col in difference:
    testdf_cat_data[col] = 0
print(df_cat_data.shape)
print(testdf_cat_data.shape)

# Swap the raw categorical columns for their one-hot expansion.
newdf = df.join(df_cat_data)
newdf.drop(["flag", "protocol_type", "service"], axis=1, inplace=True)
newdf_test = df_test.join(testdf_cat_data)
newdf_test.drop(["flag", "protocol_type", "service"], axis=1, inplace=True)
print(newdf.shape)
print(newdf_test.shape)

# Attack-name -> class id: 0=normal, 1=DoS, 2=Probe, 3=R2L, 4=U2R.
# One shared dict replaces the two identical 41-entry literals the original
# duplicated for the train and test splits.
attack_class_map = {
    "normal": 0,
    # DoS
    "neptune": 1, "back": 1, "land": 1, "pod": 1, "smurf": 1, "teardrop": 1,
    "mailbomb": 1, "apache2": 1, "processtable": 1, "udpstorm": 1, "worm": 1,
    # Probe
    "ipsweep": 2, "nmap": 2, "portsweep": 2, "satan": 2, "mscan": 2,
    "saint": 2,
    # R2L
    "ftp_write": 3, "guess_passwd": 3, "imap": 3, "multihop": 3, "phf": 3,
    "spy": 3, "warezclient": 3, "warezmaster": 3, "sendmail": 3, "named": 3,
    "snmpgetattack": 3, "snmpguess": 3, "xlock": 3, "xsnoop": 3,
    "httptunnel": 3,
    # U2R
    "buffer_overflow": 4, "loadmodule": 4, "perl": 4, "rootkit": 4, "ps": 4,
    "sqlattack": 4, "xterm": 4,
}
newdf["label"] = newdf["label"].replace(attack_class_map)
newdf_test["label"] = newdf_test["label"].replace(attack_class_map)

# One binary problem per attack family: normal (0) vs that family's class id.
DoS_df = newdf[newdf["label"].isin([0, 1])]
Probe_df = newdf[newdf["label"].isin([0, 2])]
R2L_df = newdf[newdf["label"].isin([0, 3])]
U2R_df = newdf[newdf["label"].isin([0, 4])]
DoS_df_test = newdf_test[newdf_test["label"].isin([0, 1])]
Probe_df_test = newdf_test[newdf_test["label"].isin([0, 2])]
R2L_df_test = newdf_test[newdf_test["label"].isin([0, 3])]
U2R_df_test = newdf_test[newdf_test["label"].isin([0, 4])]
print("Train:")
print("Dimensions of DoS:", DoS_df.shape)
print("Dimensions of Probe:", Probe_df.shape)
print("Dimensions of R2L:", R2L_df.shape)
print("Dimensions of U2R:", U2R_df.shape)
print()
print("Test:")
print("Dimensions of DoS:", DoS_df_test.shape)
print("Dimensions of Probe:", Probe_df_test.shape)
print("Dimensions of R2L:", R2L_df_test.shape)
print("Dimensions of U2R:", U2R_df_test.shape)

# Feature matrices / label vectors.
# BUG FIX: pandas 2.0 removed the positional `axis` argument of
# DataFrame.drop ("label", 1); pass it as a keyword.
X_DoS = DoS_df.drop("label", axis=1)
Y_DoS = DoS_df.label
X_Probe = Probe_df.drop("label", axis=1)
Y_Probe = Probe_df.label
X_R2L = R2L_df.drop("label", axis=1)
Y_R2L = R2L_df.label
X_U2R = U2R_df.drop("label", axis=1)
Y_U2R = U2R_df.label
X_DoS_test = DoS_df_test.drop("label", axis=1)
Y_DoS_test = DoS_df_test.label
X_Probe_test = Probe_df_test.drop("label", axis=1)
Y_Probe_test = Probe_df_test.label
X_R2L_test = R2L_df_test.drop("label", axis=1)
Y_R2L_test = R2L_df_test.label
X_U2R_test = U2R_df_test.drop("label", axis=1)
Y_U2R_test = U2R_df_test.label
colNames = list(X_DoS)
colNames_test = list(X_DoS_test)

from sklearn import preprocessing

# Standardize each subset.
# NOTE(review): each *test* subset is scaled with statistics fitted on the
# test data itself rather than with the corresponding train scaler — a
# train/test mismatch; kept as-is to preserve the original results.
scaler1 = preprocessing.StandardScaler().fit(X_DoS)
X_DoS = scaler1.transform(X_DoS)
scaler2 = preprocessing.StandardScaler().fit(X_Probe)
X_Probe = scaler2.transform(X_Probe)
scaler3 = preprocessing.StandardScaler().fit(X_R2L)
X_R2L = scaler3.transform(X_R2L)
scaler4 = preprocessing.StandardScaler().fit(X_U2R)
X_U2R = scaler4.transform(X_U2R)
scaler5 = preprocessing.StandardScaler().fit(X_DoS_test)
X_DoS_test = scaler5.transform(X_DoS_test)
scaler6 = preprocessing.StandardScaler().fit(X_Probe_test)
X_Probe_test = scaler6.transform(X_Probe_test)
scaler7 = preprocessing.StandardScaler().fit(X_R2L_test)
X_R2L_test = scaler7.transform(X_R2L_test)
scaler8 = preprocessing.StandardScaler().fit(X_U2R_test)
X_U2R_test = scaler8.transform(X_U2R_test)

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Recursive feature elimination: keep the 13 best features per subset,
# ranked by a small random forest (the same RFE object is re-fitted for
# each attack family).
clf = RandomForestClassifier(n_estimators=10, n_jobs=2)
rfe = RFE(estimator=clf, n_features_to_select=13, step=1)

rfe.fit(X_DoS, Y_DoS.astype(int))
X_rfeDoS = rfe.transform(X_DoS)
support_mask = rfe.support_
rfecolindex_DoS = [i for i, keep in enumerate(support_mask) if keep]
rfecolname_DoS = [colNames[i] for i in rfecolindex_DoS]

rfe.fit(X_Probe, Y_Probe.astype(int))
X_rfeProbe = rfe.transform(X_Probe)
support_mask = rfe.support_
rfecolindex_Probe = [i for i, keep in enumerate(support_mask) if keep]
rfecolname_Probe = [colNames[i] for i in rfecolindex_Probe]

rfe.fit(X_R2L, Y_R2L.astype(int))
X_rfeR2L = rfe.transform(X_R2L)
support_mask = rfe.support_
rfecolindex_R2L = [i for i, keep in enumerate(support_mask) if keep]
rfecolname_R2L = [colNames[i] for i in rfecolindex_R2L]

rfe.fit(X_U2R, Y_U2R.astype(int))
X_rfeU2R = rfe.transform(X_U2R)
support_mask = rfe.support_
rfecolindex_U2R = [i for i, keep in enumerate(support_mask) if keep]
rfecolname_U2R = [colNames[i] for i in rfecolindex_U2R]

print("Features selected for DoS:", rfecolname_DoS)
print()
print("Features selected for Probe:", rfecolname_Probe)
print()
print("Features selected for R2L:", rfecolname_R2L)
print()
print("Features selected for U2R:", rfecolname_U2R)
print(X_rfeDoS.shape)
print(X_rfeProbe.shape)
print(X_rfeR2L.shape)
print(X_rfeU2R.shape)

# One random forest per attack family, trained on the full feature set.
clf_DoS = RandomForestClassifier(n_estimators=10, n_jobs=2)
clf_Probe = RandomForestClassifier(n_estimators=10, n_jobs=2)
clf_R2L = RandomForestClassifier(n_estimators=10, n_jobs=2)
clf_U2R = RandomForestClassifier(n_estimators=10, n_jobs=2)
clf_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_U2R.fit(X_U2R, Y_U2R.astype(int))

# Random forests retrained on the RFE-reduced feature sets.
clf_rfeDoS = RandomForestClassifier(n_estimators=10, n_jobs=2)
clf_rfeProbe = RandomForestClassifier(n_estimators=10, n_jobs=2)
clf_rfeR2L = RandomForestClassifier(n_estimators=10, n_jobs=2)
clf_rfeU2R = RandomForestClassifier(n_estimators=10, n_jobs=2)
clf_rfeDoS.fit(X_rfeDoS, Y_DoS.astype(int))
clf_rfeProbe.fit(X_rfeProbe, Y_Probe.astype(int))
clf_rfeR2L.fit(X_rfeR2L, Y_R2L.astype(int))
clf_rfeU2R.fit(X_rfeU2R, Y_U2R.astype(int))


def show_confusion(y_true, y_pred):
    """Return an actual-vs-predicted cross-tabulation (notebook display)."""
    return pd.crosstab(
        y_true, y_pred, rownames=["Actual attacks"], colnames=["Predicted attacks"]
    )


def print_cv_metrics(clf, X, y, suffix=""):
    """Print 10-fold CV accuracy, precision, recall and F-measure for clf.

    suffix: "" for the binary scorers ("precision", "recall", "f1") or
    "_macro" for their macro-averaged multiclass variants.  Replaces the
    original's dozens of copy-pasted evaluation stanzas (and fixes the one
    "F-mesaure" typo among them).
    """
    for name, scorer in (
        ("Accuracy", "accuracy"),
        ("Precision", "precision" + suffix),
        ("Recall", "recall" + suffix),
        ("F-measure", "f1" + suffix),
    ):
        scores = cross_val_score(clf, X, y, cv=10, scoring=scorer)
        print("%s: %0.5f (+/- %0.5f)" % (name, scores.mean(), scores.std() * 2))


# --- Random forest, full feature set ---------------------------------------
clf_DoS.predict(X_DoS_test)
clf_DoS.predict_proba(X_DoS_test)[0:10]
Y_DoS_pred = clf_DoS.predict(X_DoS_test)
show_confusion(Y_DoS_test, Y_DoS_pred)
Y_Probe_pred = clf_Probe.predict(X_Probe_test)
show_confusion(Y_Probe_test, Y_Probe_pred)
Y_R2L_pred = clf_R2L.predict(X_R2L_test)
show_confusion(Y_R2L_test, Y_R2L_pred)
Y_U2R_pred = clf_U2R.predict(X_U2R_test)
show_confusion(Y_U2R_test, Y_U2R_pred)

# DoS labels are {0, 1}, so the binary scorers apply; the other families use
# labels {0, k} with k > 1 and need the macro-averaged scorers.
print_cv_metrics(clf_DoS, X_DoS_test, Y_DoS_test)
print_cv_metrics(clf_Probe, X_Probe_test, Y_Probe_test, "_macro")
print_cv_metrics(clf_U2R, X_U2R_test, Y_U2R_test, "_macro")
print_cv_metrics(clf_R2L, X_R2L_test, Y_R2L_test, "_macro")

# --- Random forest, RFE-selected features ----------------------------------
X_DoS_test2 = X_DoS_test[:, rfecolindex_DoS]
X_Probe_test2 = X_Probe_test[:, rfecolindex_Probe]
X_R2L_test2 = X_R2L_test[:, rfecolindex_R2L]
X_U2R_test2 = X_U2R_test[:, rfecolindex_U2R]
X_U2R_test2.shape
Y_DoS_pred2 = clf_rfeDoS.predict(X_DoS_test2)
show_confusion(Y_DoS_test, Y_DoS_pred2)
Y_Probe_pred2 = clf_rfeProbe.predict(X_Probe_test2)
show_confusion(Y_Probe_test, Y_Probe_pred2)
Y_R2L_pred2 = clf_rfeR2L.predict(X_R2L_test2)
show_confusion(Y_R2L_test, Y_R2L_pred2)
Y_U2R_pred2 = clf_rfeU2R.predict(X_U2R_test2)
show_confusion(Y_U2R_test, Y_U2R_pred2)
print_cv_metrics(clf_rfeDoS, X_DoS_test2, Y_DoS_test)
print_cv_metrics(clf_rfeProbe, X_Probe_test2, Y_Probe_test, "_macro")
print_cv_metrics(clf_rfeR2L, X_R2L_test2, Y_R2L_test, "_macro")
print_cv_metrics(clf_rfeU2R, X_U2R_test2, Y_U2R_test, "_macro")

# --- k-nearest neighbours ---------------------------------------------------
from sklearn.neighbors import KNeighborsClassifier

clf_KNN_DoS = KNeighborsClassifier()
clf_KNN_Probe = KNeighborsClassifier()
clf_KNN_R2L = KNeighborsClassifier()
clf_KNN_U2R = KNeighborsClassifier()
clf_KNN_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_KNN_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_KNN_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_KNN_U2R.fit(X_U2R, Y_U2R.astype(int))
Y_DoS_pred = clf_KNN_DoS.predict(X_DoS_test)
show_confusion(Y_DoS_test, Y_DoS_pred)
Y_Probe_pred = clf_KNN_Probe.predict(X_Probe_test)
show_confusion(Y_Probe_test, Y_Probe_pred)
Y_R2L_pred = clf_KNN_R2L.predict(X_R2L_test)
show_confusion(Y_R2L_test, Y_R2L_pred)
Y_U2R_pred = clf_KNN_U2R.predict(X_U2R_test)
show_confusion(Y_U2R_test, Y_U2R_pred)
print_cv_metrics(clf_KNN_DoS, X_DoS_test, Y_DoS_test)
print_cv_metrics(clf_KNN_Probe, X_Probe_test, Y_Probe_test, "_macro")
print_cv_metrics(clf_KNN_R2L, X_R2L_test, Y_R2L_test, "_macro")
print_cv_metrics(clf_KNN_U2R, X_U2R_test, Y_U2R_test, "_macro")

# --- Linear SVM -------------------------------------------------------------
from sklearn.svm import SVC

clf_SVM_DoS = SVC(kernel="linear", C=1.0, random_state=0)
clf_SVM_Probe = SVC(kernel="linear", C=1.0, random_state=0)
clf_SVM_R2L = SVC(kernel="linear", C=1.0, random_state=0)
clf_SVM_U2R = SVC(kernel="linear", C=1.0, random_state=0)
clf_SVM_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_SVM_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_SVM_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_SVM_U2R.fit(X_U2R, Y_U2R.astype(int))
Y_DoS_pred = clf_SVM_DoS.predict(X_DoS_test)
show_confusion(Y_DoS_test, Y_DoS_pred)
Y_Probe_pred = clf_SVM_Probe.predict(X_Probe_test)
show_confusion(Y_Probe_test, Y_Probe_pred)
Y_R2L_pred = clf_SVM_R2L.predict(X_R2L_test)
show_confusion(Y_R2L_test, Y_R2L_pred)
Y_U2R_pred = clf_SVM_U2R.predict(X_U2R_test)
show_confusion(Y_U2R_test, Y_U2R_pred)
print_cv_metrics(clf_SVM_DoS, X_DoS_test, Y_DoS_test)
print_cv_metrics(clf_SVM_Probe, X_Probe_test, Y_Probe_test, "_macro")
print_cv_metrics(clf_SVM_R2L, X_R2L_test, Y_R2L_test, "_macro")
print_cv_metrics(clf_SVM_U2R, X_U2R_test, Y_U2R_test, "_macro")

# --- Hard-voting ensemble of RF + kNN + SVM ---------------------------------
from sklearn.ensemble import VotingClassifier

clf_voting_DoS = VotingClassifier(
    estimators=[("rf", clf_DoS), ("knn", clf_KNN_DoS), ("svm", clf_SVM_DoS)],
    voting="hard",
)
clf_voting_Probe = VotingClassifier(
    estimators=[("rf", clf_Probe), ("knn", clf_KNN_Probe), ("svm", clf_SVM_Probe)],
    voting="hard",
)
clf_voting_R2L = VotingClassifier(
    estimators=[("rf", clf_R2L), ("knn", clf_KNN_R2L), ("svm", clf_SVM_R2L)],
    voting="hard",
)
clf_voting_U2R = VotingClassifier(
    estimators=[("rf", clf_U2R), ("knn", clf_KNN_U2R), ("svm", clf_SVM_U2R)],
    voting="hard",
)
clf_voting_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_voting_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_voting_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_voting_U2R.fit(X_U2R, Y_U2R.astype(int))
Y_DoS_pred = clf_voting_DoS.predict(X_DoS_test)
show_confusion(Y_DoS_test, Y_DoS_pred)
Y_Probe_pred = clf_voting_Probe.predict(X_Probe_test)
show_confusion(Y_Probe_test, Y_Probe_pred)
Y_R2L_pred = clf_voting_R2L.predict(X_R2L_test)
show_confusion(Y_R2L_test, Y_R2L_pred)
Y_U2R_pred = clf_voting_U2R.predict(X_U2R_test)
show_confusion(Y_U2R_test, Y_U2R_pred)
print_cv_metrics(clf_voting_DoS, X_DoS_test, Y_DoS_test)
print_cv_metrics(clf_voting_Probe, X_Probe_test, Y_Probe_test, "_macro")
742 | clf_voting_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring="accuracy" 743 | ) 744 | print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2)) 745 | precision = cross_val_score( 746 | clf_voting_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring="precision_macro" 747 | ) 748 | print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2)) 749 | recall = cross_val_score( 750 | clf_voting_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring="recall_macro" 751 | ) 752 | print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2)) 753 | f = cross_val_score(clf_voting_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring="f1_macro") 754 | print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2)) 755 | accuracy = cross_val_score( 756 | clf_voting_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring="accuracy" 757 | ) 758 | print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2)) 759 | precision = cross_val_score( 760 | clf_voting_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring="precision_macro" 761 | ) 762 | print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2)) 763 | recall = cross_val_score( 764 | clf_voting_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring="recall_macro" 765 | ) 766 | print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2)) 767 | f = cross_val_score(clf_voting_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring="f1_macro") 768 | print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2)) 769 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Network Intrusion Detection using Light Weight ML Ensemble Method 2 | 3 | ![Javatpoint](https://zenodo.org/badge/DOI/10.5281/zenodo.7801597.svg) 4 | 5 | Codes for the paper entitled "Optimization of Predictive Performance of Intrusion Detection System Using Hybrid Ensemble Model for Secure Systems" 6 | 7 | ### Following 
datasets were used in this study. 8 | 9 | - [The UNSW-NB15 Dataset](https://research.unsw.edu.au/projects/unsw-nb15-dataset) 10 | - [NSL-KDD dataset](https://www.unb.ca/cic/datasets/nsl.html) 11 | - [CSE-CIC-IDS2018](https://www.unb.ca/cic/datasets/ids-2018.html) 12 | 13 | ### The following algorithms were studied individually, and an ensemble of RF, SVM, and MLP was used to develop a cost-effective and accurate model for intrusion detection. 14 | - Random Forest 15 | - Decision Tree 16 | - kNN 17 | - SVM 18 | - MLP 19 | - DNN 20 | - CNN (4 Conv1D layers) 21 | - LSTM (3 LSTM layers) 22 | - RNN (3 RNN layers) 23 | 24 | ### Evaluation Methods 25 | - Accuracy 26 | - Precision 27 | - Recall 28 | - F1-Score 29 | 30 | ### Machine Specifications used for experimentation 31 | - HP 840 G2 laptop 32 | - Intel Core i5 processor (5th generation) 33 | - 64-bit Windows 10 operating system 34 | - 16 GB RAM 35 | 36 | ### Note 37 | These code files were obtained by exporting the original Jupyter notebooks as .py files. When reproducing the results, make sure to run them in a separate Jupyter notebook. 38 | 39 | ### Update 40 | Code for the deep learning algorithms will be uploaded soon.
# ---------------------------------------------------------------------------
# /UNSW_BINARY_UPDATED.py
# Binary intrusion detection on the UNSW-NB15 dataset: train kNN / SVM / RF /
# DT / MLP plus a hard-voting ensemble, report test accuracy and a
# classification report for each, and pickle every fitted model.
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from os import path
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn import metrics, preprocessing
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import time

bin_data_path = "./datasets/bin_data.csv"
multi_data_path = "./datasets/multi_data.csv"
df = pd.read_csv(bin_data_path)
print("Dimensions of the Training set:", df.shape)
X = df.drop(columns=["label"], axis=1)
Y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=50
)

knn = KNeighborsClassifier(n_neighbors=3)
svm = SVC(kernel="linear", C=1.0, random_state=0)
rf = RandomForestClassifier(n_estimators=10, random_state=1)
dt = DecisionTreeClassifier(random_state=0)
mlp = MLPClassifier(random_state=0, max_iter=300)
# NOTE: the ensemble keeps a reference to the n_neighbors=3 kNN created above;
# the rebinding below (sklearn defaults, n_neighbors=5) affects only the
# standalone kNN run — preserved from the original script.
clf_voting = VotingClassifier(
    estimators=[("rf", rf), ("knn", knn), ("svm", svm)], voting="hard"
)
knn = KNeighborsClassifier(n_neighbors=5)


def _fit_eval_save(title, model, pkl_filename, acc_label):
    """Train *model* on the shared split, report metrics, and pickle it.

    Single corrected version of the original's six copy-pasted sections,
    which had drifted apart: the SVM section pickled `knn` instead of `svm`,
    the DT section's banner said "Random Forest", the RF and DT accuracy
    messages said "SVM", and the MLP section printed the DT fit time.
    """
    print("===========================================")
    print("Fitting %s Classifier" % title)
    print("===========================================")
    t1 = time.time()
    model.fit(X_train, y_train.astype(int))
    t2 = time.time()
    print("Time to train %s on binary training data:" % title, t2 - t1)
    print("======================================================")
    y_pred = model.predict(X_test)
    print("Accuracy for %s is - " % acc_label, accuracy_score(y_test, y_pred) * 100)
    print("========Printing Classification Reports==========")
    print(classification_report(y_true=y_test, y_pred=y_pred))
    # Pickle only once; skip if a saved model already exists on disk.
    if not path.isfile(pkl_filename):
        with open(pkl_filename, "wb") as file:
            pickle.dump(model, file)  # bug fix: SVM section used to dump knn
        print("Saved model to disk")
    else:
        print("Model already saved")


_fit_eval_save("kNN", knn, "./qaiser_models/knn_binary.pkl", "binary kNN")
_fit_eval_save("SVM", svm, "./qaiser_models/SVM_binary.pkl", "binary SVM")
_fit_eval_save("Random Forest", rf, "./qaiser_models/RF_binary.pkl", "binary RF")
_fit_eval_save("Decision Tree", dt, "./qaiser_models/DT_binary.pkl", "binary DT")
_fit_eval_save("MLP", mlp, "./qaiser_models/MLP_binary.pkl", "binary MLP")
_fit_eval_save(
    "Our Ensemble Method",
    clf_voting,
    "./qaiser_models/clf_voting_binary.pkl",
    "binary clf_voting",
)
# ---------------------------------------------------------------------------
# /UNSW_MULTI_UPDATED.py (imports continue in the next chunk)
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from os import path
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn import metrics, preprocessing
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    VotingClassifier,
    GradientBoostingClassifier,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import time

# Multiclass (9-way) intrusion detection on the UNSW-NB15 dataset.
bin_data_path = "./datasets/multi_data.csv"
df = pd.read_csv(bin_data_path)
print("Dimensions of the Training set:", df.shape)
X = df.drop(columns=["label"], axis=1)
Y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=50
)
print("Training Data Shape:", X_train.shape)
print("Training Labels Shape:", y_train.shape)
print("Testing Data Shape:", X_test.shape)
print("Testing Label Shape:", y_test.shape)

knn = KNeighborsClassifier(n_neighbors=3)
svm = SVC(kernel="poly", C=1.0, random_state=0)
rf = RandomForestClassifier(n_estimators=10, random_state=1)
dt = DecisionTreeClassifier(random_state=0)
mlp = MLPClassifier(random_state=0, max_iter=300)
# NOTE: the ensemble holds the n_neighbors=3 kNN above; the rebinding below
# (sklearn defaults, n_neighbors=5) affects only the standalone kNN run.
clf_voting = VotingClassifier(
    estimators=[("rf", rf), ("knn", knn), ("svm", svm)], voting="hard"
)
knn = KNeighborsClassifier(n_neighbors=5)

# plot_confusion_matrix was removed in scikit-learn 1.2; if upgrading, switch
# to ConfusionMatrixDisplay.from_estimator. Kept to match the original.
from sklearn.metrics import plot_confusion_matrix

# The nine UNSW-NB15 class names; the original re-declared this list before
# every confusion-matrix plot.
LABELS = [
    "Analysis",
    "Backdoor",
    "DoS",
    "Exploits",
    "Fuzzers",
    "Generic",
    "Normal",
    "Recon",
    "Worms",
]


def _plot_cm(model, X_eval, y_eval, png_path):
    """Save and show a column-normalised confusion matrix for *model*."""
    print("Testing on Unseen Data")  # typo "Unssen" fixed
    fig, ax = plt.subplots(figsize=(10, 10))
    plot_confusion_matrix(
        model,
        X_eval,
        y_eval,
        cmap="Greens",
        display_labels=LABELS,
        normalize="pred",
        ax=ax,
    )
    plt.savefig(png_path)
    plt.show()


def _fit_eval_save(title, model, pkl_filename, png_path):
    """Train *model*, report test metrics, pickle it, and plot its matrix.

    Single corrected version of the original's six copy-pasted sections,
    which had drifted: several messages said "binary" in this multiclass
    script, and the MLP section printed the DT fit time (t2_dt - t1_dt).
    """
    print("===========================================")
    print("Fitting %s Classifier" % title)
    print("===========================================")
    t1 = time.time()
    model.fit(X_train, y_train.astype(int))
    t2 = time.time()
    print("Time to train %s on multiclass training data:" % title, t2 - t1)
    print("======================================================")
    y_pred = model.predict(X_test)
    print(
        "Accuracy for multiclass %s is - " % title,
        accuracy_score(y_test, y_pred) * 100,
    )
    print("========Printing Classification Reports==========")
    print(classification_report(y_true=y_test, y_pred=y_pred))
    if not path.isfile(pkl_filename):
        with open(pkl_filename, "wb") as file:
            pickle.dump(model, file)
        print("Saved model to disk")
    else:
        print("Model already saved")
    _plot_cm(model, X_test, y_test, png_path)


_fit_eval_save("kNN", knn, "./qaiser_models/knn_multi.pkl", "./diagrams/kNN Confusion Matrix.png")
_fit_eval_save("SVM", svm, "./qaiser_models/SVM_multi.pkl", "./diagrams/SVM multiclass Confusion Matrix.png")
_fit_eval_save("Random Forest", rf, "./qaiser_models/RF_multi.pkl", "./diagrams/RF Confusion Matrix.png")
_fit_eval_save("Decision Tree", dt, "./qaiser_models/DT_multi.pkl", "./diagrams/DT Confusion Matrix.png")
_fit_eval_save("MLP", mlp, "./qaiser_models/MLP_multi.pkl", "./diagrams/MLP Confusion Matrix.png")
_fit_eval_save(
    "Our Ensemble Method",
    clf_voting,
    "./qaiser_models/clf_voting_multi.pkl",
    "./diagrams/clf_voting Confusion Matrix-Testing.png",
)

# --- Second ensemble: RF + DT + gradient boosting, hard voting -------------
print("===========================================")
print("Fitting Our Ensemble Method Classifier")
print("===========================================")
xg = GradientBoostingClassifier(
    n_estimators=100, learning_rate=1.0, max_depth=3, random_state=0
)
clf_voting = VotingClassifier(
    estimators=[("rf", rf), ("dt", dt), ("xg", xg)], voting="hard"
)
t1_clf_voting = time.time()
clf_voting.fit(X_train, y_train.astype(int))
t2_clf_voting = time.time()
print("Time to train clf_voting on multi training data:", t2_clf_voting - t1_clf_voting)
print("======================================================")
y_pred = clf_voting.predict(X_test)
print("Accuracy for multiclass clf_voting is - ", accuracy_score(y_test, y_pred) * 100)
print("========Printing Classification Reports==========")
print(classification_report(y_true=y_test, y_pred=y_pred))
pkl_filename = "./qaiser_models/clf_ensemble_multi.pkl"
if not path.isfile(pkl_filename):
    with open(pkl_filename, "wb") as file:
        pickle.dump(clf_voting, file)
    print("Saved model to disk")
else:
    print("Model already saved")
_plot_cm(clf_voting, X_test, y_test, "./diagrams/Ensemble Confusion Matrix-Testing.png")

# --- Same ensemble evaluated on its own training data ----------------------
print("===========================================")
print("Fitting Our Ensemble Method Classifier")
print("===========================================")
print("Time to train clf_voting on multi training data:", t2_clf_voting - t1_clf_voting)
print("======================================================")
y_pred = clf_voting.predict(X_train)
print(
    "Accuracy for multiclass clf_voting on Training Data is - ",
    accuracy_score(y_train.astype(int), y_pred) * 100,
)
_plot_cm(
    clf_voting,
    X_train,
    y_train,
    "./diagrams/Ensemble Training Data Confusion Matrix-Testing.png",
)

# --- Soft-voting variant, refit for ROC curves (needs predict_proba) -------
xg = GradientBoostingClassifier(
    n_estimators=100, learning_rate=1.0, max_depth=3, random_state=0
)
clf_voting = VotingClassifier(
    estimators=[("rf", rf), ("dt", dt), ("xg", xg)], voting="soft"
)
t1_clf_voting = time.time()
clf_voting.fit(X_train, y_train.astype(int))
t2_clf_voting = time.time()
print("Time to train clf_voting on multi training data:", t2_clf_voting - t1_clf_voting)

import scikitplot.plotters as skplt  # deprecated module; kept as in original

print("ROC Curve for Testing Data")
preds = clf_voting.predict_proba(X_test)
fig, ax = plt.subplots(figsize=(10, 10))
skplt.plot_roc_curve(y_test, preds, ax=ax)
plt.savefig("Ensemble ROC for Testing.png")
plt.show()
print("ROC Curve for Training Data")
preds = clf_voting.predict_proba(X_train)
fig, ax = plt.subplots(figsize=(10, 10))
skplt.plot_roc_curve(y_train, preds, ax=ax)
plt.savefig("Ensemble ROC for Training.png")
plt.show()
# ---------------------------------------------------------------------------
# /cicids2018_updated.py (continues in the next chunk)
# ### this code was executed in
# ...a Jupyter notebook and exported as .py, so you may need to modify it to
# run as a plain Python script.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# Bug fix: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer is the drop-in replacement.
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_recall_fscore_support,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.utils.np_utils import to_categorical

df = pd.read_csv("combined2.csv")
df_value = df[" Label"].value_counts()
# Merge the four DoS sub-attacks into a single 'DoS' class.
# Bug fix: the 'DoS slowloris' key was split across two lines, which would
# have mapped those rows to None; restored as a single key.
df[" Label"] = df[" Label"].apply(
    {
        "DoS Hulk": "DoS",
        "DoS GoldenEye": "DoS",
        "DoS Slowhttptest": "DoS",
        "DoS slowloris": "DoS",
        "BENIGN": "BENIGN",
        "DDoS": "DDoS",
        "PortScan": "PortScan",
    }.get
)
df2 = df.drop_duplicates()
df2_value = df2[" Label"].value_counts()
datatype = df2.dtypes  # notebook inspection artifact, kept
df2["Flow Bytes/s"] = df2["Flow Bytes/s"].astype("float64")
df2[" Flow Packets/s"] = df2[" Flow Packets/s"].astype("float64")
NaN_values = df2.isnull().sum()  # notebook inspection artifact, kept
# Bug fix: fillna(..., inplace=True) on a column of a derived frame raises
# SettingWithCopyWarning and may not write through; assign explicitly.
df2["Flow Bytes/s"] = df2["Flow Bytes/s"].fillna(df2["Flow Bytes/s"].mean())
print('Datasetin ilk okunduğu hali: \n', df_value)
print('Datasetin ilk (row,Column) sayısı: {} '.format(df.shape))
print('Datasetin Labelindeki DoS daldırılarının birleştirilmesi ve gürültünün azaltılması:\n', df2_value)
print('Datasetin son (row,Column) sayısı: {} '.format(df2.shape))

dataset = pd.read_csv("dataset.csv")
# Per-attack binary frames: BENIGN rows plus one attack class each.
# Bug fix: DataFrame.append was removed in pandas 2.0; pd.concat is the
# documented equivalent.
DoS_df = pd.concat(
    [dataset[dataset[" Label"] == "BENIGN"], dataset[dataset[" Label"] == "DoS"]]
)
DDoS_df = pd.concat(
    [dataset[dataset[" Label"] == "BENIGN"], dataset[dataset[" Label"] == "DDoS"]]
)
PortScan_df = pd.concat(
    [dataset[dataset[" Label"] == "BENIGN"], dataset[dataset[" Label"] == "PortScan"]]
)
# Normal/Anormal view. NOTE(review): NA_df aliases (does not copy) `dataset`,
# so this relabels `dataset` in place as well — preserved from the original;
# feature_selection(dataset) below therefore sees the relabelled frame.
NA_df = dataset
NA_df[" Label"] = NA_df[" Label"].apply(
    {"DoS": "Anormal", "BENIGN": "Normal", "DDoS": "Anormal", "PortScan": "Anormal"}.get
)


def train_test_dataset(df):
    """Label-encode the last column and return a stratified 70/30 split."""
    labelencoder = LabelEncoder()
    df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])
    X = df.drop([" Label"], axis=1)
    y = np.ravel(df.iloc[:, -1].values.reshape(-1, 1))
    return train_test_split(
        X, y, train_size=0.7, test_size=0.3, random_state=0, stratify=y
    )


def _impute_mean(X_train, X_test):
    """Fit a mean imputer on X_train and transform both splits.

    Replaces the removed Imputer(missing_values="NaN", strategy="mean") that
    every classifier function below duplicated.
    """
    imputer = SimpleImputer(missing_values=np.nan, strategy="mean").fit(X_train)
    return imputer.transform(X_train), imputer.transform(X_test)


def _plot_cm(y_true, y_predict):
    """Heatmap of the confusion matrix (shared by all classifiers below)."""
    cm = confusion_matrix(y_true, y_predict)
    f, ax = plt.subplots(figsize=(5, 5))
    sns.heatmap(cm, annot=True, linewidth=0.5, linecolor="red", fmt=".0f", ax=ax)
    plt.xlabel("y_pred")
    plt.ylabel("y_true")
    plt.show()


def RandomForest(X_train, X_test, y_train, y_test):
    """Random-forest baseline: accuracy, weighted P/R/F, confusion matrix."""
    rf = RandomForestClassifier(random_state=0)
    X_train, X_test = _impute_mean(X_train, X_test)
    rf.fit(X_train, y_train)
    rf_score = rf.score(X_test, y_test)
    y_predict = rf.predict(X_test)
    y_true = y_test
    print('Random Forest Accuracy:' + str(rf_score))
    precision, recall, fscore, none = precision_recall_fscore_support(
        y_true, y_predict, average='weighted'
    )
    print('Random Forest precision_recall_fscore:' + (str(precision)) + (str(recall)) + (str(fscore)))
    _plot_cm(y_true, y_predict)
    return rf_score, precision, recall, fscore, none


def DecisionTree(X_train, X_test, y_train, y_test):
    """Decision-tree baseline: accuracy, weighted P/R/F, confusion matrix."""
    dt = DecisionTreeClassifier(random_state=0)
    X_train, X_test = _impute_mean(X_train, X_test)
    dt.fit(X_train, y_train)
    score = dt.score(X_test, y_test)
    print('Decision Tree Accuracy:' + str(score))
    y_predict = dt.predict(X_test)
    y_true = y_test
    precision, recall, fscore, none = precision_recall_fscore_support(
        y_true, y_predict, average='weighted'
    )
    print('Decision Tree precision_recall_fscore:' + (str(precision)) + (str(recall)) + (str(fscore)))
    _plot_cm(y_true, y_predict)
    return score, precision, recall, fscore, none


def kNN(X_train, X_test, y_train, y_test):
    """5-NN baseline: accuracy, weighted P/R/F, confusion matrix."""
    knn = KNeighborsClassifier(n_neighbors=5)
    X_train, X_test = _impute_mean(X_train, X_test)
    knn.fit(X_train, y_train)
    score = knn.score(X_test, y_test)
    print("5 nn score:" + str(score))
    y_predict = knn.predict(X_test)
    y_true = y_test
    precision, recall, fscore, none = precision_recall_fscore_support(
        y_true, y_predict, average='weighted'
    )
    print('5nn precision_recall_fscore:' + (str(precision)) + (str(recall)) + (str(fscore)))
    _plot_cm(y_true, y_predict)
    return score, precision, recall, fscore, none


def SVM(X_train, X_test, y_train, y_test):
    """Linear SVM baseline: accuracy and confusion matrix (no return value)."""
    svclassifier = SVC(kernel='linear')
    X_train, X_test = _impute_mean(X_train, X_test)
    svclassifier.fit(X_train, y_train)
    print("SVM Classification Accuracy:" + str(svclassifier.score(X_test, y_test)))
    y_predict = svclassifier.predict(X_test)
    _plot_cm(y_test, y_predict)


def build_classifier(X_train):
    """Return a no-arg factory for the 80-25-2 softmax network.

    KerasClassifier needs a zero-argument builder, so the input dimension is
    captured via this closure.
    """
    def bm():
        classifier = Sequential()
        classifier.add(
            Dense(units=80, kernel_initializer='uniform', activation='relu',
                  input_dim=X_train.shape[1])
        )
        classifier.add(Dense(units=25, kernel_initializer='uniform', activation='relu'))
        classifier.add(Dense(units=2, kernel_initializer='uniform', activation='softmax'))
        lr = .003
        adam0 = Adam(lr=lr)
        classifier.compile(
            optimizer=adam0, loss='categorical_crossentropy', metrics=['accuracy']
        )
        return classifier
    return bm


def ANN(X_train, X_test, y_train, y_test):
    """3-fold CV accuracy of the Keras model (train labels one-hot encoded).

    X_test/y_test are accepted for signature parity with the other
    classifiers but, as in the original, are not used.
    """
    y_ = to_categorical(y_train)
    estimator = KerasClassifier(build_fn=build_classifier(X_train), epochs=5)
    accuracies = cross_val_score(estimator, X=X_train, y=y_, cv=3)
    mean = accuracies.mean()
    variance = accuracies.std()  # NOTE(review): label says variance, value is std
    print("Accuracy mean: " + str(mean))
    print("Accuracy variance: " + str(variance))


def feature_selection(df):
    """Print features ranked by RandomForestRegressor importance."""
    feature = (df.drop([' Label'], axis=1)).columns.values
    labelencoder = LabelEncoder()
    df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])
    X = df.drop([' Label'], axis=1)
    Y = np.ravel(df.iloc[:, -1].values.reshape(-1, 1))
    imputer = SimpleImputer(missing_values=np.nan, strategy="mean").fit(X)
    X = imputer.transform(X)
    rf = RandomForestRegressor()
    rf.fit(X, Y)
    print("Features sorted by their score:")
    print(sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), feature), reverse=True))


# --- Driver: run every classifier on every per-attack split ----------------
feature_selection(dataset)
DoSX_train, DoSX_test, DoSy_train, DoSy_test = train_test_dataset(DoS_df)
DDoSX_train, DDoSX_test, DDoSy_train, DDoSy_test = train_test_dataset(DDoS_df)
PS_X_train, PS_X_test, PS_y_train, PS_y_test = train_test_dataset(PortScan_df)
NA_X_train, NA_X_test, NA_y_train, NA_y_test = train_test_dataset(NA_df)
dosrf_score, dosrf_precision, dosrf_recall, dosrf_fscore, none = RandomForest(DoSX_train, DoSX_test, DoSy_train, DoSy_test)
dosdt_score, dosdt_precision, dosdt_recall, dosdt_fscore, none = DecisionTree(DoSX_train, DoSX_test, DoSy_train, DoSy_test)
dosKnn_score, dosKnn_precision, dosKnn_recall, dosKnn_fscore, none = kNN(DoSX_train, DoSX_test, DoSy_train, DoSy_test)
SVM(DoSX_train, DoSX_test, DoSy_train, DoSy_test)
ANN(DoSX_train, DoSX_test, DoSy_train, DoSy_test)
labels = np.unique(DoSy_train)
print(labels)
psrf_score, psrf_precision, psrf_recall, psrf_fscore, none = RandomForest(PS_X_train, PS_X_test, PS_y_train, PS_y_test)
psdt_score, psdt_precision, psdt_recall, psdt_fscore, none = DecisionTree(PS_X_train, PS_X_test, PS_y_train, PS_y_test)
psKnn_score, psKnn_precision, psKnn_recall, psKnn_fscore, none = kNN(PS_X_train, PS_X_test, PS_y_train, PS_y_test)
SVM(PS_X_train, PS_X_test, PS_y_train, PS_y_test)
ANN(PS_X_train, PS_X_test, PS_y_train, PS_y_test)
ddosrf_score, ddosrf_precision, ddosrf_recall, ddosrf_fscore, none = RandomForest(DDoSX_train, DDoSX_test, DDoSy_train, DDoSy_test)
ddosdt_score, ddosdt_precision, ddosdt_recall, ddosdt_fscore, none = DecisionTree(DDoSX_train, DDoSX_test, DDoSy_train, DDoSy_test)
ddosKnn_score, ddosKnn_precision, ddosKnn_recall, ddosKnn_fscore, none = kNN(DDoSX_train, DDoSX_test, DDoSy_train, DDoSy_test)
SVM(DDoSX_train, DDoSX_test, DDoSy_train, DDoSy_test)
ANN(DDoSX_train, DDoSX_test, DDoSy_train, DDoSy_test)
narf_score, narf_precision, narf_recall, narf_fscore, none = RandomForest(NA_X_train, NA_X_test, NA_y_train, NA_y_test)
nadt_score, nadt_precision, nadt_recall, nadt_fscore, none = DecisionTree(NA_X_train, NA_X_test, NA_y_train, NA_y_test)
naKnn_score, naKnn_precision, naKnn_recall, naKnn_fscore, none = kNN(NA_X_train, NA_X_test, NA_y_train, NA_y_test)
SVM(NA_X_train, NA_X_test, NA_y_train, NA_y_test)
ANN(NA_X_train, NA_X_test, NA_y_train, NA_y_test)
# Summary table. NOTE(review): the ANN accuracies are hard-coded constants
# copied from earlier runs, not computed above — confirm against the paper.
d = {
    'Algoritmalar': ["Random Forest", "Decision Tree", "KNN", "ANN"],
    'DoS accuracy': [dosrf_score, dosdt_score, dosKnn_score, 0.7636],
    'DDoS accuracy': [ddosrf_score, ddosdt_score, ddosKnn_score, 0.8307],
    'Port Scan accuracy': [psrf_score, psdt_score, psKnn_score, 0.8738],
    'Normal/Anormal accuracy': [narf_score, nadt_score, naKnn_score, 0.6034],
}
dataframe = pd.DataFrame(data=d)
dataframe  # bare expression: displays only in a notebook