"""NSL-KDD intrusion detection — data loading and categorical-feature encoding.

Notebook-export flat script: downloads the NSL-KDD train/test splits, names
the 41 features plus the attack label, reports label/category distributions,
and label-encodes the three categorical columns as a first step toward
one-hot encoding.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import sys
import sklearn
import io
import random

# Raw CSV mirrors of the NSL-KDD splits (the files carry no header row).
train_url = "https://raw.githubusercontent.com/merteroglu/NSL-KDD-Network-Instrusion-Detection/master/NSL_KDD_Train.csv"
test_url = "https://raw.githubusercontent.com/merteroglu/NSL-KDD-Network-Instrusion-Detection/master/NSL_KDD_Test.csv"

# The 41 NSL-KDD feature names plus the attack label, in file column order.
col_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins",
    "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files",
    "num_outbound_cmds", "is_host_login", "is_guest_login", "count",
    "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate",
    "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
    "dst_host_srv_serror_rate", "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate", "label",
]

df = pd.read_csv(train_url, header=None, names=col_names)
df_test = pd.read_csv(test_url, header=None, names=col_names)
print("Dimensions of the Training set:", df.shape)
print("Dimensions of the Test set:", df_test.shape)
df.head(5)
print("Label distribution Training set:")
print(df["label"].value_counts())
print()
print("Label distribution Test set:")
print(df_test["label"].value_counts())

# Report the cardinality of every object-dtype (categorical) column.
print("Training set:")
for col_name in df.columns:
    if df[col_name].dtypes == "object":
        unique_cat = len(df[col_name].unique())
        print(f"Feature '{col_name}' has {unique_cat} categories")
print()
print("Distribution of categories in service:")
print(df["service"].value_counts().sort_values(ascending=False).head())
print("Test set:")
for col_name in df_test.columns:
    if df_test[col_name].dtypes == "object":
        unique_cat = len(df_test[col_name].unique())
        print(f"Feature '{col_name}' has {unique_cat} categories")

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Dummy-column name lists for the one-hot step that follows.
categorical_columns = ["protocol_type", "service", "flag"]
df_categorical_values = df[categorical_columns]
testdf_categorical_values = df_test[categorical_columns]
df_categorical_values.head()
unique_protocol2 = ["Protocol_type_" + x for x in sorted(df.protocol_type.unique())]
print(unique_protocol2)
unique_service2 = ["service_" + x for x in sorted(df.service.unique())]
print(unique_service2)
unique_flag2 = ["flag_" + x for x in sorted(df.flag.unique())]
print(unique_flag2)
dumcols = unique_protocol2 + unique_service2 + unique_flag2
# The test split lacks some services, so its dummy-column list differs.
unique_service2_test = ["service_" + x for x in sorted(df_test.service.unique())]
testdumcols = unique_protocol2 + unique_service2_test + unique_flag2
df_categorical_values_enc = df_categorical_values.apply(LabelEncoder().fit_transform)
print(df_categorical_values.head())
print("--------------------")
print(df_categorical_values_enc.head())
testdf_categorical_values_enc = testdf_categorical_values.apply(
    LabelEncoder().fit_transform
)

# One-hot encode the label-encoded categoricals.
# NOTE(review): the encoder is re-fitted on the test split instead of reusing
# the train-split fit; the resulting column mismatch (the test split lacks
# some services) is patched below by appending all-zero columns.
enc = OneHotEncoder(categories="auto")
df_categorical_values_encenc = enc.fit_transform(df_categorical_values_enc)
df_cat_data = pd.DataFrame(df_categorical_values_encenc.toarray(), columns=dumcols)
testdf_categorical_values_encenc = enc.fit_transform(testdf_categorical_values_enc)
testdf_cat_data = pd.DataFrame(
    testdf_categorical_values_encenc.toarray(), columns=testdumcols
)
df_cat_data.head()

# Services present in train but absent from test get zero-filled dummy
# columns so both one-hot frames end up with an identical column set.
difference = ["service_" + s for s in set(df["service"]) - set(df_test["service"])]
for col in difference:
    testdf_cat_data[col] = 0
print(df_cat_data.shape)
print(testdf_cat_data.shape)

# Swap the raw categorical columns for their one-hot expansion.
newdf = df.join(df_cat_data)
newdf.drop(["flag", "protocol_type", "service"], axis=1, inplace=True)
newdf_test = df_test.join(testdf_cat_data)
newdf_test.drop(["flag", "protocol_type", "service"], axis=1, inplace=True)
print(newdf.shape)
print(newdf_test.shape)

# Attack-name -> class id: 0=normal, 1=DoS, 2=Probe, 3=R2L, 4=U2R.
# One shared dict replaces the two identical 41-entry literals the original
# duplicated for the train and test splits.
attack_class_map = {
    "normal": 0,
    # DoS
    "neptune": 1, "back": 1, "land": 1, "pod": 1, "smurf": 1, "teardrop": 1,
    "mailbomb": 1, "apache2": 1, "processtable": 1, "udpstorm": 1, "worm": 1,
    # Probe
    "ipsweep": 2, "nmap": 2, "portsweep": 2, "satan": 2, "mscan": 2,
    "saint": 2,
    # R2L
    "ftp_write": 3, "guess_passwd": 3, "imap": 3, "multihop": 3, "phf": 3,
    "spy": 3, "warezclient": 3, "warezmaster": 3, "sendmail": 3, "named": 3,
    "snmpgetattack": 3, "snmpguess": 3, "xlock": 3, "xsnoop": 3,
    "httptunnel": 3,
    # U2R
    "buffer_overflow": 4, "loadmodule": 4, "perl": 4, "rootkit": 4, "ps": 4,
    "sqlattack": 4, "xterm": 4,
}
newdf["label"] = newdf["label"].replace(attack_class_map)
newdf_test["label"] = newdf_test["label"].replace(attack_class_map)

# One binary problem per attack family: normal (0) vs that family's class id.
DoS_df = newdf[newdf["label"].isin([0, 1])]
Probe_df = newdf[newdf["label"].isin([0, 2])]
R2L_df = newdf[newdf["label"].isin([0, 3])]
U2R_df = newdf[newdf["label"].isin([0, 4])]
DoS_df_test = newdf_test[newdf_test["label"].isin([0, 1])]
Probe_df_test = newdf_test[newdf_test["label"].isin([0, 2])]
R2L_df_test = newdf_test[newdf_test["label"].isin([0, 3])]
U2R_df_test = newdf_test[newdf_test["label"].isin([0, 4])]
print("Train:")
print("Dimensions of DoS:", DoS_df.shape)
print("Dimensions of Probe:", Probe_df.shape)
print("Dimensions of R2L:", R2L_df.shape)
print("Dimensions of U2R:", U2R_df.shape)
print()
print("Test:")
print("Dimensions of DoS:", DoS_df_test.shape)
print("Dimensions of Probe:", Probe_df_test.shape)
print("Dimensions of R2L:", R2L_df_test.shape)
print("Dimensions of U2R:", U2R_df_test.shape)

# Feature matrices / label vectors.
# BUG FIX: pandas 2.0 removed the positional `axis` argument of
# DataFrame.drop ("label", 1); pass it as a keyword.
X_DoS = DoS_df.drop("label", axis=1)
Y_DoS = DoS_df.label
X_Probe = Probe_df.drop("label", axis=1)
Y_Probe = Probe_df.label
X_R2L = R2L_df.drop("label", axis=1)
Y_R2L = R2L_df.label
X_U2R = U2R_df.drop("label", axis=1)
Y_U2R = U2R_df.label
X_DoS_test = DoS_df_test.drop("label", axis=1)
Y_DoS_test = DoS_df_test.label
X_Probe_test = Probe_df_test.drop("label", axis=1)
Y_Probe_test = Probe_df_test.label
X_R2L_test = R2L_df_test.drop("label", axis=1)
Y_R2L_test = R2L_df_test.label
X_U2R_test = U2R_df_test.drop("label", axis=1)
Y_U2R_test = U2R_df_test.label
colNames = list(X_DoS)
colNames_test = list(X_DoS_test)

from sklearn import preprocessing

# Standardize each subset.
# NOTE(review): each *test* subset is scaled with statistics fitted on the
# test data itself rather than with the corresponding train scaler — a
# train/test mismatch; kept as-is to preserve the original results.
scaler1 = preprocessing.StandardScaler().fit(X_DoS)
X_DoS = scaler1.transform(X_DoS)
scaler2 = preprocessing.StandardScaler().fit(X_Probe)
X_Probe = scaler2.transform(X_Probe)
scaler3 = preprocessing.StandardScaler().fit(X_R2L)
X_R2L = scaler3.transform(X_R2L)
scaler4 = preprocessing.StandardScaler().fit(X_U2R)
X_U2R = scaler4.transform(X_U2R)
scaler5 = preprocessing.StandardScaler().fit(X_DoS_test)
X_DoS_test = scaler5.transform(X_DoS_test)
scaler6 = preprocessing.StandardScaler().fit(X_Probe_test)
X_Probe_test = scaler6.transform(X_Probe_test)
scaler7 = preprocessing.StandardScaler().fit(X_R2L_test)
X_R2L_test = scaler7.transform(X_R2L_test)
scaler8 = preprocessing.StandardScaler().fit(X_U2R_test)
X_U2R_test = scaler8.transform(X_U2R_test)

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Recursive feature elimination: keep the 13 best features per subset,
# ranked by a small random forest (the same RFE object is re-fitted for
# each attack family).
clf = RandomForestClassifier(n_estimators=10, n_jobs=2)
rfe = RFE(estimator=clf, n_features_to_select=13, step=1)

rfe.fit(X_DoS, Y_DoS.astype(int))
X_rfeDoS = rfe.transform(X_DoS)
support_mask = rfe.support_
rfecolindex_DoS = [i for i, keep in enumerate(support_mask) if keep]
rfecolname_DoS = [colNames[i] for i in rfecolindex_DoS]

rfe.fit(X_Probe, Y_Probe.astype(int))
X_rfeProbe = rfe.transform(X_Probe)
support_mask = rfe.support_
rfecolindex_Probe = [i for i, keep in enumerate(support_mask) if keep]
rfecolname_Probe = [colNames[i] for i in rfecolindex_Probe]

rfe.fit(X_R2L, Y_R2L.astype(int))
X_rfeR2L = rfe.transform(X_R2L)
support_mask = rfe.support_
rfecolindex_R2L = [i for i, keep in enumerate(support_mask) if keep]
rfecolname_R2L = [colNames[i] for i in rfecolindex_R2L]

rfe.fit(X_U2R, Y_U2R.astype(int))
X_rfeU2R = rfe.transform(X_U2R)
support_mask = rfe.support_
rfecolindex_U2R = [i for i, keep in enumerate(support_mask) if keep]
rfecolname_U2R = [colNames[i] for i in rfecolindex_U2R]

print("Features selected for DoS:", rfecolname_DoS)
print()
print("Features selected for Probe:", rfecolname_Probe)
print()
print("Features selected for R2L:", rfecolname_R2L)
print()
print("Features selected for U2R:", rfecolname_U2R)
print(X_rfeDoS.shape)
print(X_rfeProbe.shape)
print(X_rfeR2L.shape)
print(X_rfeU2R.shape)

# One random forest per attack family, trained on the full feature set.
clf_DoS = RandomForestClassifier(n_estimators=10, n_jobs=2)
clf_Probe = RandomForestClassifier(n_estimators=10, n_jobs=2)
clf_R2L = RandomForestClassifier(n_estimators=10, n_jobs=2)
clf_U2R = RandomForestClassifier(n_estimators=10, n_jobs=2)
clf_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_U2R.fit(X_U2R, Y_U2R.astype(int))

# Random forests retrained on the RFE-reduced feature sets.
clf_rfeDoS = RandomForestClassifier(n_estimators=10, n_jobs=2)
clf_rfeProbe = RandomForestClassifier(n_estimators=10, n_jobs=2)
clf_rfeR2L = RandomForestClassifier(n_estimators=10, n_jobs=2)
clf_rfeU2R = RandomForestClassifier(n_estimators=10, n_jobs=2)
clf_rfeDoS.fit(X_rfeDoS, Y_DoS.astype(int))
clf_rfeProbe.fit(X_rfeProbe, Y_Probe.astype(int))
clf_rfeR2L.fit(X_rfeR2L, Y_R2L.astype(int))
clf_rfeU2R.fit(X_rfeU2R, Y_U2R.astype(int))


def show_confusion(y_true, y_pred):
    """Return an actual-vs-predicted cross-tabulation (notebook display)."""
    return pd.crosstab(
        y_true, y_pred, rownames=["Actual attacks"], colnames=["Predicted attacks"]
    )


def print_cv_metrics(clf, X, y, suffix=""):
    """Print 10-fold CV accuracy, precision, recall and F-measure for clf.

    suffix: "" for the binary scorers ("precision", "recall", "f1") or
    "_macro" for their macro-averaged multiclass variants.  Replaces the
    original's dozens of copy-pasted evaluation stanzas (and fixes the one
    "F-mesaure" typo among them).
    """
    for name, scorer in (
        ("Accuracy", "accuracy"),
        ("Precision", "precision" + suffix),
        ("Recall", "recall" + suffix),
        ("F-measure", "f1" + suffix),
    ):
        scores = cross_val_score(clf, X, y, cv=10, scoring=scorer)
        print("%s: %0.5f (+/- %0.5f)" % (name, scores.mean(), scores.std() * 2))


# --- Random forest, full feature set ---------------------------------------
clf_DoS.predict(X_DoS_test)
clf_DoS.predict_proba(X_DoS_test)[0:10]
Y_DoS_pred = clf_DoS.predict(X_DoS_test)
show_confusion(Y_DoS_test, Y_DoS_pred)
Y_Probe_pred = clf_Probe.predict(X_Probe_test)
show_confusion(Y_Probe_test, Y_Probe_pred)
Y_R2L_pred = clf_R2L.predict(X_R2L_test)
show_confusion(Y_R2L_test, Y_R2L_pred)
Y_U2R_pred = clf_U2R.predict(X_U2R_test)
show_confusion(Y_U2R_test, Y_U2R_pred)

# DoS labels are {0, 1}, so the binary scorers apply; the other families use
# labels {0, k} with k > 1 and need the macro-averaged scorers.
print_cv_metrics(clf_DoS, X_DoS_test, Y_DoS_test)
print_cv_metrics(clf_Probe, X_Probe_test, Y_Probe_test, "_macro")
print_cv_metrics(clf_U2R, X_U2R_test, Y_U2R_test, "_macro")
print_cv_metrics(clf_R2L, X_R2L_test, Y_R2L_test, "_macro")

# --- Random forest, RFE-selected features ----------------------------------
X_DoS_test2 = X_DoS_test[:, rfecolindex_DoS]
X_Probe_test2 = X_Probe_test[:, rfecolindex_Probe]
X_R2L_test2 = X_R2L_test[:, rfecolindex_R2L]
X_U2R_test2 = X_U2R_test[:, rfecolindex_U2R]
X_U2R_test2.shape
Y_DoS_pred2 = clf_rfeDoS.predict(X_DoS_test2)
show_confusion(Y_DoS_test, Y_DoS_pred2)
Y_Probe_pred2 = clf_rfeProbe.predict(X_Probe_test2)
show_confusion(Y_Probe_test, Y_Probe_pred2)
Y_R2L_pred2 = clf_rfeR2L.predict(X_R2L_test2)
show_confusion(Y_R2L_test, Y_R2L_pred2)
Y_U2R_pred2 = clf_rfeU2R.predict(X_U2R_test2)
show_confusion(Y_U2R_test, Y_U2R_pred2)
print_cv_metrics(clf_rfeDoS, X_DoS_test2, Y_DoS_test)
print_cv_metrics(clf_rfeProbe, X_Probe_test2, Y_Probe_test, "_macro")
print_cv_metrics(clf_rfeR2L, X_R2L_test2, Y_R2L_test, "_macro")
print_cv_metrics(clf_rfeU2R, X_U2R_test2, Y_U2R_test, "_macro")

# --- k-nearest neighbours ---------------------------------------------------
from sklearn.neighbors import KNeighborsClassifier

clf_KNN_DoS = KNeighborsClassifier()
clf_KNN_Probe = KNeighborsClassifier()
clf_KNN_R2L = KNeighborsClassifier()
clf_KNN_U2R = KNeighborsClassifier()
clf_KNN_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_KNN_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_KNN_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_KNN_U2R.fit(X_U2R, Y_U2R.astype(int))
Y_DoS_pred = clf_KNN_DoS.predict(X_DoS_test)
show_confusion(Y_DoS_test, Y_DoS_pred)
Y_Probe_pred = clf_KNN_Probe.predict(X_Probe_test)
show_confusion(Y_Probe_test, Y_Probe_pred)
Y_R2L_pred = clf_KNN_R2L.predict(X_R2L_test)
show_confusion(Y_R2L_test, Y_R2L_pred)
Y_U2R_pred = clf_KNN_U2R.predict(X_U2R_test)
show_confusion(Y_U2R_test, Y_U2R_pred)
print_cv_metrics(clf_KNN_DoS, X_DoS_test, Y_DoS_test)
print_cv_metrics(clf_KNN_Probe, X_Probe_test, Y_Probe_test, "_macro")
print_cv_metrics(clf_KNN_R2L, X_R2L_test, Y_R2L_test, "_macro")
print_cv_metrics(clf_KNN_U2R, X_U2R_test, Y_U2R_test, "_macro")

# --- Linear SVM -------------------------------------------------------------
from sklearn.svm import SVC

clf_SVM_DoS = SVC(kernel="linear", C=1.0, random_state=0)
clf_SVM_Probe = SVC(kernel="linear", C=1.0, random_state=0)
clf_SVM_R2L = SVC(kernel="linear", C=1.0, random_state=0)
clf_SVM_U2R = SVC(kernel="linear", C=1.0, random_state=0)
clf_SVM_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_SVM_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_SVM_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_SVM_U2R.fit(X_U2R, Y_U2R.astype(int))
Y_DoS_pred = clf_SVM_DoS.predict(X_DoS_test)
show_confusion(Y_DoS_test, Y_DoS_pred)
Y_Probe_pred = clf_SVM_Probe.predict(X_Probe_test)
show_confusion(Y_Probe_test, Y_Probe_pred)
Y_R2L_pred = clf_SVM_R2L.predict(X_R2L_test)
show_confusion(Y_R2L_test, Y_R2L_pred)
Y_U2R_pred = clf_SVM_U2R.predict(X_U2R_test)
show_confusion(Y_U2R_test, Y_U2R_pred)
print_cv_metrics(clf_SVM_DoS, X_DoS_test, Y_DoS_test)
print_cv_metrics(clf_SVM_Probe, X_Probe_test, Y_Probe_test, "_macro")
print_cv_metrics(clf_SVM_R2L, X_R2L_test, Y_R2L_test, "_macro")
print_cv_metrics(clf_SVM_U2R, X_U2R_test, Y_U2R_test, "_macro")

# --- Hard-voting ensemble of RF + kNN + SVM ---------------------------------
from sklearn.ensemble import VotingClassifier

clf_voting_DoS = VotingClassifier(
    estimators=[("rf", clf_DoS), ("knn", clf_KNN_DoS), ("svm", clf_SVM_DoS)],
    voting="hard",
)
clf_voting_Probe = VotingClassifier(
    estimators=[("rf", clf_Probe), ("knn", clf_KNN_Probe), ("svm", clf_SVM_Probe)],
    voting="hard",
)
clf_voting_R2L = VotingClassifier(
    estimators=[("rf", clf_R2L), ("knn", clf_KNN_R2L), ("svm", clf_SVM_R2L)],
    voting="hard",
)
clf_voting_U2R = VotingClassifier(
    estimators=[("rf", clf_U2R), ("knn", clf_KNN_U2R), ("svm", clf_SVM_U2R)],
    voting="hard",
)
clf_voting_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_voting_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_voting_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_voting_U2R.fit(X_U2R, Y_U2R.astype(int))
Y_DoS_pred = clf_voting_DoS.predict(X_DoS_test)
show_confusion(Y_DoS_test, Y_DoS_pred)
Y_Probe_pred = clf_voting_Probe.predict(X_Probe_test)
show_confusion(Y_Probe_test, Y_Probe_pred)
Y_R2L_pred = clf_voting_R2L.predict(X_R2L_test)
show_confusion(Y_R2L_test, Y_R2L_pred)
Y_U2R_pred = clf_voting_U2R.predict(X_U2R_test)
show_confusion(Y_U2R_test, Y_U2R_pred)
print_cv_metrics(clf_voting_DoS, X_DoS_test, Y_DoS_test)
print_cv_metrics(clf_voting_Probe, X_Probe_test, Y_Probe_test, "_macro")
742 | clf_voting_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring="accuracy" 743 | ) 744 | print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2)) 745 | precision = cross_val_score( 746 | clf_voting_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring="precision_macro" 747 | ) 748 | print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2)) 749 | recall = cross_val_score( 750 | clf_voting_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring="recall_macro" 751 | ) 752 | print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2)) 753 | f = cross_val_score(clf_voting_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring="f1_macro") 754 | print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2)) 755 | accuracy = cross_val_score( 756 | clf_voting_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring="accuracy" 757 | ) 758 | print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2)) 759 | precision = cross_val_score( 760 | clf_voting_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring="precision_macro" 761 | ) 762 | print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2)) 763 | recall = cross_val_score( 764 | clf_voting_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring="recall_macro" 765 | ) 766 | print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2)) 767 | f = cross_val_score(clf_voting_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring="f1_macro") 768 | print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2)) 769 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Network Intrusion Detection using Light Weight ML Ensemble Method 2 | 3 | ![Javatpoint](https://zenodo.org/badge/DOI/10.5281/zenodo.7801597.svg) 4 | 5 | Codes for the paper entitled "Optimization of Predictive Performance of Intrusion Detection System Using Hybrid Ensemble Model for Secure Systems" 6 | 7 | ### Following 
datasets were used in this study. 8 | 9 | - [The UNSW-NB15 Dataset](https://research.unsw.edu.au/projects/unsw-nb15-dataset) 10 | - [NSL-KDD dataset](https://www.unb.ca/cic/datasets/nsl.html) 11 | - [CSE-CIC-IDS2018](https://www.unb.ca/cic/datasets/ids-2018.html) 12 | 13 | ### The following algorithms were studied individually, and an ensemble of RF, SVM, and MLP was used to develop a cost-effective and accurate model for intrusion detection. 14 | - Random Forest 15 | - Decision Tree 16 | - kNN 17 | - SVM 18 | - MLP 19 | - DNN 20 | - CNN (4 Conv1D layers) 21 | - LSTM (3 LSTM layers) 22 | - RNN (3 RNN layers) 23 | 24 | ### Evaluation Methods 25 | - Accuracy 26 | - Precision 27 | - Recall 28 | - F1-Score 29 | 30 | ### Machine Specifications used for experimentation 31 | - HP 840 G2 laptop 32 | - Intel Core i5 processor (5th generation) 33 | - 64-bit Windows 10 operating system 34 | - 16 GB RAM 35 | 36 | ### Note 37 | These code files were obtained by exporting the original Jupyter notebooks as .py files. When reproducing the results, make sure to run them in a separate Jupyter notebook. 38 | 39 | ### Update 40 | Code for the deep learning algorithms will be uploaded soon.
# ---------------------------------------------------------------------------
# /UNSW_BINARY_UPDATED.py
# Binary intrusion detection on the UNSW-NB15 dataset: train kNN / SVM / RF /
# DT / MLP plus a hard-voting ensemble, report test accuracy and a
# classification report for each, and pickle every fitted model.
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from os import path
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn import metrics, preprocessing
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import time

bin_data_path = "./datasets/bin_data.csv"
multi_data_path = "./datasets/multi_data.csv"
df = pd.read_csv(bin_data_path)
print("Dimensions of the Training set:", df.shape)
X = df.drop(columns=["label"], axis=1)
Y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=50
)

knn = KNeighborsClassifier(n_neighbors=3)
svm = SVC(kernel="linear", C=1.0, random_state=0)
rf = RandomForestClassifier(n_estimators=10, random_state=1)
dt = DecisionTreeClassifier(random_state=0)
mlp = MLPClassifier(random_state=0, max_iter=300)
# NOTE: the ensemble keeps a reference to the n_neighbors=3 kNN created above;
# the rebinding below (sklearn defaults, n_neighbors=5) affects only the
# standalone kNN run — preserved from the original script.
clf_voting = VotingClassifier(
    estimators=[("rf", rf), ("knn", knn), ("svm", svm)], voting="hard"
)
knn = KNeighborsClassifier(n_neighbors=5)


def _fit_eval_save(title, model, pkl_filename, acc_label):
    """Train *model* on the shared split, report metrics, and pickle it.

    Single corrected version of the original's six copy-pasted sections,
    which had drifted apart: the SVM section pickled `knn` instead of `svm`,
    the DT section's banner said "Random Forest", the RF and DT accuracy
    messages said "SVM", and the MLP section printed the DT fit time.
    """
    print("===========================================")
    print("Fitting %s Classifier" % title)
    print("===========================================")
    t1 = time.time()
    model.fit(X_train, y_train.astype(int))
    t2 = time.time()
    print("Time to train %s on binary training data:" % title, t2 - t1)
    print("======================================================")
    y_pred = model.predict(X_test)
    print("Accuracy for %s is - " % acc_label, accuracy_score(y_test, y_pred) * 100)
    print("========Printing Classification Reports==========")
    print(classification_report(y_true=y_test, y_pred=y_pred))
    # Pickle only once; skip if a saved model already exists on disk.
    if not path.isfile(pkl_filename):
        with open(pkl_filename, "wb") as file:
            pickle.dump(model, file)  # bug fix: SVM section used to dump knn
        print("Saved model to disk")
    else:
        print("Model already saved")


_fit_eval_save("kNN", knn, "./qaiser_models/knn_binary.pkl", "binary kNN")
_fit_eval_save("SVM", svm, "./qaiser_models/SVM_binary.pkl", "binary SVM")
_fit_eval_save("Random Forest", rf, "./qaiser_models/RF_binary.pkl", "binary RF")
_fit_eval_save("Decision Tree", dt, "./qaiser_models/DT_binary.pkl", "binary DT")
_fit_eval_save("MLP", mlp, "./qaiser_models/MLP_binary.pkl", "binary MLP")
_fit_eval_save(
    "Our Ensemble Method",
    clf_voting,
    "./qaiser_models/clf_voting_binary.pkl",
    "binary clf_voting",
)
# ---------------------------------------------------------------------------
# /UNSW_MULTI_UPDATED.py (imports continue in the next chunk)
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from os import path
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn import metrics, preprocessing
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    VotingClassifier,
    GradientBoostingClassifier,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import time

# Multiclass (9-way) intrusion detection on the UNSW-NB15 dataset.
bin_data_path = "./datasets/multi_data.csv"
df = pd.read_csv(bin_data_path)
print("Dimensions of the Training set:", df.shape)
X = df.drop(columns=["label"], axis=1)
Y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=50
)
print("Training Data Shape:", X_train.shape)
print("Training Labels Shape:", y_train.shape)
print("Testing Data Shape:", X_test.shape)
print("Testing Label Shape:", y_test.shape)

knn = KNeighborsClassifier(n_neighbors=3)
svm = SVC(kernel="poly", C=1.0, random_state=0)
rf = RandomForestClassifier(n_estimators=10, random_state=1)
dt = DecisionTreeClassifier(random_state=0)
mlp = MLPClassifier(random_state=0, max_iter=300)
# NOTE: the ensemble holds the n_neighbors=3 kNN above; the rebinding below
# (sklearn defaults, n_neighbors=5) affects only the standalone kNN run.
clf_voting = VotingClassifier(
    estimators=[("rf", rf), ("knn", knn), ("svm", svm)], voting="hard"
)
knn = KNeighborsClassifier(n_neighbors=5)

# plot_confusion_matrix was removed in scikit-learn 1.2; if upgrading, switch
# to ConfusionMatrixDisplay.from_estimator. Kept to match the original.
from sklearn.metrics import plot_confusion_matrix

# The nine UNSW-NB15 class names; the original re-declared this list before
# every confusion-matrix plot.
LABELS = [
    "Analysis",
    "Backdoor",
    "DoS",
    "Exploits",
    "Fuzzers",
    "Generic",
    "Normal",
    "Recon",
    "Worms",
]


def _plot_cm(model, X_eval, y_eval, png_path):
    """Save and show a column-normalised confusion matrix for *model*."""
    print("Testing on Unseen Data")  # typo "Unssen" fixed
    fig, ax = plt.subplots(figsize=(10, 10))
    plot_confusion_matrix(
        model,
        X_eval,
        y_eval,
        cmap="Greens",
        display_labels=LABELS,
        normalize="pred",
        ax=ax,
    )
    plt.savefig(png_path)
    plt.show()


def _fit_eval_save(title, model, pkl_filename, png_path):
    """Train *model*, report test metrics, pickle it, and plot its matrix.

    Single corrected version of the original's six copy-pasted sections,
    which had drifted: several messages said "binary" in this multiclass
    script, and the MLP section printed the DT fit time (t2_dt - t1_dt).
    """
    print("===========================================")
    print("Fitting %s Classifier" % title)
    print("===========================================")
    t1 = time.time()
    model.fit(X_train, y_train.astype(int))
    t2 = time.time()
    print("Time to train %s on multiclass training data:" % title, t2 - t1)
    print("======================================================")
    y_pred = model.predict(X_test)
    print(
        "Accuracy for multiclass %s is - " % title,
        accuracy_score(y_test, y_pred) * 100,
    )
    print("========Printing Classification Reports==========")
    print(classification_report(y_true=y_test, y_pred=y_pred))
    if not path.isfile(pkl_filename):
        with open(pkl_filename, "wb") as file:
            pickle.dump(model, file)
        print("Saved model to disk")
    else:
        print("Model already saved")
    _plot_cm(model, X_test, y_test, png_path)


_fit_eval_save("kNN", knn, "./qaiser_models/knn_multi.pkl", "./diagrams/kNN Confusion Matrix.png")
_fit_eval_save("SVM", svm, "./qaiser_models/SVM_multi.pkl", "./diagrams/SVM multiclass Confusion Matrix.png")
_fit_eval_save("Random Forest", rf, "./qaiser_models/RF_multi.pkl", "./diagrams/RF Confusion Matrix.png")
_fit_eval_save("Decision Tree", dt, "./qaiser_models/DT_multi.pkl", "./diagrams/DT Confusion Matrix.png")
_fit_eval_save("MLP", mlp, "./qaiser_models/MLP_multi.pkl", "./diagrams/MLP Confusion Matrix.png")
_fit_eval_save(
    "Our Ensemble Method",
    clf_voting,
    "./qaiser_models/clf_voting_multi.pkl",
    "./diagrams/clf_voting Confusion Matrix-Testing.png",
)

# --- Second ensemble: RF + DT + gradient boosting, hard voting -------------
print("===========================================")
print("Fitting Our Ensemble Method Classifier")
print("===========================================")
xg = GradientBoostingClassifier(
    n_estimators=100, learning_rate=1.0, max_depth=3, random_state=0
)
clf_voting = VotingClassifier(
    estimators=[("rf", rf), ("dt", dt), ("xg", xg)], voting="hard"
)
t1_clf_voting = time.time()
clf_voting.fit(X_train, y_train.astype(int))
t2_clf_voting = time.time()
print("Time to train clf_voting on multi training data:", t2_clf_voting - t1_clf_voting)
print("======================================================")
y_pred = clf_voting.predict(X_test)
print("Accuracy for multiclass clf_voting is - ", accuracy_score(y_test, y_pred) * 100)
print("========Printing Classification Reports==========")
print(classification_report(y_true=y_test, y_pred=y_pred))
pkl_filename = "./qaiser_models/clf_ensemble_multi.pkl"
if not path.isfile(pkl_filename):
    with open(pkl_filename, "wb") as file:
        pickle.dump(clf_voting, file)
    print("Saved model to disk")
else:
    print("Model already saved")
_plot_cm(clf_voting, X_test, y_test, "./diagrams/Ensemble Confusion Matrix-Testing.png")

# --- Same ensemble evaluated on its own training data ----------------------
print("===========================================")
print("Fitting Our Ensemble Method Classifier")
print("===========================================")
print("Time to train clf_voting on multi training data:", t2_clf_voting - t1_clf_voting)
print("======================================================")
y_pred = clf_voting.predict(X_train)
print(
    "Accuracy for multiclass clf_voting on Training Data is - ",
    accuracy_score(y_train.astype(int), y_pred) * 100,
)
_plot_cm(
    clf_voting,
    X_train,
    y_train,
    "./diagrams/Ensemble Training Data Confusion Matrix-Testing.png",
)

# --- Soft-voting variant, refit for ROC curves (needs predict_proba) -------
xg = GradientBoostingClassifier(
    n_estimators=100, learning_rate=1.0, max_depth=3, random_state=0
)
clf_voting = VotingClassifier(
    estimators=[("rf", rf), ("dt", dt), ("xg", xg)], voting="soft"
)
t1_clf_voting = time.time()
clf_voting.fit(X_train, y_train.astype(int))
t2_clf_voting = time.time()
print("Time to train clf_voting on multi training data:", t2_clf_voting - t1_clf_voting)

import scikitplot.plotters as skplt  # deprecated module; kept as in original

print("ROC Curve for Testing Data")
preds = clf_voting.predict_proba(X_test)
fig, ax = plt.subplots(figsize=(10, 10))
skplt.plot_roc_curve(y_test, preds, ax=ax)
plt.savefig("Ensemble ROC for Testing.png")
plt.show()
print("ROC Curve for Training Data")
preds = clf_voting.predict_proba(X_train)
fig, ax = plt.subplots(figsize=(10, 10))
skplt.plot_roc_curve(y_train, preds, ax=ax)
plt.savefig("Ensemble ROC for Training.png")
plt.show()
# ---------------------------------------------------------------------------
# /cicids2018_updated.py (continues in the next chunk)
# ### this code was executed in
# ...a Jupyter notebook and exported as .py, so you may need to modify it to
# run as a plain Python script.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# Bug fix: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer is the drop-in replacement.
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_recall_fscore_support,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.utils.np_utils import to_categorical

df = pd.read_csv("combined2.csv")
df_value = df[" Label"].value_counts()
# Merge the four DoS sub-attacks into a single 'DoS' class.
# Bug fix: the 'DoS slowloris' key was split across two lines, which would
# have mapped those rows to None; restored as a single key.
df[" Label"] = df[" Label"].apply(
    {
        "DoS Hulk": "DoS",
        "DoS GoldenEye": "DoS",
        "DoS Slowhttptest": "DoS",
        "DoS slowloris": "DoS",
        "BENIGN": "BENIGN",
        "DDoS": "DDoS",
        "PortScan": "PortScan",
    }.get
)
df2 = df.drop_duplicates()
df2_value = df2[" Label"].value_counts()
datatype = df2.dtypes  # notebook inspection artifact, kept
df2["Flow Bytes/s"] = df2["Flow Bytes/s"].astype("float64")
df2[" Flow Packets/s"] = df2[" Flow Packets/s"].astype("float64")
NaN_values = df2.isnull().sum()  # notebook inspection artifact, kept
# Bug fix: fillna(..., inplace=True) on a column of a derived frame raises
# SettingWithCopyWarning and may not write through; assign explicitly.
df2["Flow Bytes/s"] = df2["Flow Bytes/s"].fillna(df2["Flow Bytes/s"].mean())
print('Datasetin ilk okunduğu hali: \n', df_value)
print('Datasetin ilk (row,Column) sayısı: {} '.format(df.shape))
print('Datasetin Labelindeki DoS daldırılarının birleştirilmesi ve gürültünün azaltılması:\n', df2_value)
print('Datasetin son (row,Column) sayısı: {} '.format(df2.shape))

dataset = pd.read_csv("dataset.csv")
# Per-attack binary frames: BENIGN rows plus one attack class each.
# Bug fix: DataFrame.append was removed in pandas 2.0; pd.concat is the
# documented equivalent.
DoS_df = pd.concat(
    [dataset[dataset[" Label"] == "BENIGN"], dataset[dataset[" Label"] == "DoS"]]
)
DDoS_df = pd.concat(
    [dataset[dataset[" Label"] == "BENIGN"], dataset[dataset[" Label"] == "DDoS"]]
)
PortScan_df = pd.concat(
    [dataset[dataset[" Label"] == "BENIGN"], dataset[dataset[" Label"] == "PortScan"]]
)
# Normal/Anormal view. NOTE(review): NA_df aliases (does not copy) `dataset`,
# so this relabels `dataset` in place as well — preserved from the original;
# feature_selection(dataset) below therefore sees the relabelled frame.
NA_df = dataset
NA_df[" Label"] = NA_df[" Label"].apply(
    {"DoS": "Anormal", "BENIGN": "Normal", "DDoS": "Anormal", "PortScan": "Anormal"}.get
)


def train_test_dataset(df):
    """Label-encode the last column and return a stratified 70/30 split."""
    labelencoder = LabelEncoder()
    df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])
    X = df.drop([" Label"], axis=1)
    y = np.ravel(df.iloc[:, -1].values.reshape(-1, 1))
    return train_test_split(
        X, y, train_size=0.7, test_size=0.3, random_state=0, stratify=y
    )


def _impute_mean(X_train, X_test):
    """Fit a mean imputer on X_train and transform both splits.

    Replaces the removed Imputer(missing_values="NaN", strategy="mean") that
    every classifier function below duplicated.
    """
    imputer = SimpleImputer(missing_values=np.nan, strategy="mean").fit(X_train)
    return imputer.transform(X_train), imputer.transform(X_test)


def _plot_cm(y_true, y_predict):
    """Heatmap of the confusion matrix (shared by all classifiers below)."""
    cm = confusion_matrix(y_true, y_predict)
    f, ax = plt.subplots(figsize=(5, 5))
    sns.heatmap(cm, annot=True, linewidth=0.5, linecolor="red", fmt=".0f", ax=ax)
    plt.xlabel("y_pred")
    plt.ylabel("y_true")
    plt.show()


def RandomForest(X_train, X_test, y_train, y_test):
    """Random-forest baseline: accuracy, weighted P/R/F, confusion matrix."""
    rf = RandomForestClassifier(random_state=0)
    X_train, X_test = _impute_mean(X_train, X_test)
    rf.fit(X_train, y_train)
    rf_score = rf.score(X_test, y_test)
    y_predict = rf.predict(X_test)
    y_true = y_test
    print('Random Forest Accuracy:' + str(rf_score))
    precision, recall, fscore, none = precision_recall_fscore_support(
        y_true, y_predict, average='weighted'
    )
    print('Random Forest precision_recall_fscore:' + (str(precision)) + (str(recall)) + (str(fscore)))
    _plot_cm(y_true, y_predict)
    return rf_score, precision, recall, fscore, none


def DecisionTree(X_train, X_test, y_train, y_test):
    """Decision-tree baseline: accuracy, weighted P/R/F, confusion matrix."""
    dt = DecisionTreeClassifier(random_state=0)
    X_train, X_test = _impute_mean(X_train, X_test)
    dt.fit(X_train, y_train)
    score = dt.score(X_test, y_test)
    print('Decision Tree Accuracy:' + str(score))
    y_predict = dt.predict(X_test)
    y_true = y_test
    precision, recall, fscore, none = precision_recall_fscore_support(
        y_true, y_predict, average='weighted'
    )
    print('Decision Tree precision_recall_fscore:' + (str(precision)) + (str(recall)) + (str(fscore)))
    _plot_cm(y_true, y_predict)
    return score, precision, recall, fscore, none


def kNN(X_train, X_test, y_train, y_test):
    """5-NN baseline: accuracy, weighted P/R/F, confusion matrix."""
    knn = KNeighborsClassifier(n_neighbors=5)
    X_train, X_test = _impute_mean(X_train, X_test)
    knn.fit(X_train, y_train)
    score = knn.score(X_test, y_test)
    print("5 nn score:" + str(score))
    y_predict = knn.predict(X_test)
    y_true = y_test
    precision, recall, fscore, none = precision_recall_fscore_support(
        y_true, y_predict, average='weighted'
    )
    print('5nn precision_recall_fscore:' + (str(precision)) + (str(recall)) + (str(fscore)))
    _plot_cm(y_true, y_predict)
    return score, precision, recall, fscore, none


def SVM(X_train, X_test, y_train, y_test):
    """Linear SVM baseline: accuracy and confusion matrix (no return value)."""
    svclassifier = SVC(kernel='linear')
    X_train, X_test = _impute_mean(X_train, X_test)
    svclassifier.fit(X_train, y_train)
    print("SVM Classification Accuracy:" + str(svclassifier.score(X_test, y_test)))
    y_predict = svclassifier.predict(X_test)
    _plot_cm(y_test, y_predict)


def build_classifier(X_train):
    """Return a no-arg factory for the 80-25-2 softmax network.

    KerasClassifier needs a zero-argument builder, so the input dimension is
    captured via this closure.
    """
    def bm():
        classifier = Sequential()
        classifier.add(
            Dense(units=80, kernel_initializer='uniform', activation='relu',
                  input_dim=X_train.shape[1])
        )
        classifier.add(Dense(units=25, kernel_initializer='uniform', activation='relu'))
        classifier.add(Dense(units=2, kernel_initializer='uniform', activation='softmax'))
        lr = .003
        adam0 = Adam(lr=lr)
        classifier.compile(
            optimizer=adam0, loss='categorical_crossentropy', metrics=['accuracy']
        )
        return classifier
    return bm


def ANN(X_train, X_test, y_train, y_test):
    """3-fold CV accuracy of the Keras model (train labels one-hot encoded).

    X_test/y_test are accepted for signature parity with the other
    classifiers but, as in the original, are not used.
    """
    y_ = to_categorical(y_train)
    estimator = KerasClassifier(build_fn=build_classifier(X_train), epochs=5)
    accuracies = cross_val_score(estimator, X=X_train, y=y_, cv=3)
    mean = accuracies.mean()
    variance = accuracies.std()  # NOTE(review): label says variance, value is std
    print("Accuracy mean: " + str(mean))
    print("Accuracy variance: " + str(variance))


def feature_selection(df):
    """Print features ranked by RandomForestRegressor importance."""
    feature = (df.drop([' Label'], axis=1)).columns.values
    labelencoder = LabelEncoder()
    df.iloc[:, -1] = labelencoder.fit_transform(df.iloc[:, -1])
    X = df.drop([' Label'], axis=1)
    Y = np.ravel(df.iloc[:, -1].values.reshape(-1, 1))
    imputer = SimpleImputer(missing_values=np.nan, strategy="mean").fit(X)
    X = imputer.transform(X)
    rf = RandomForestRegressor()
    rf.fit(X, Y)
    print("Features sorted by their score:")
    print(sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), feature), reverse=True))


# --- Driver: run every classifier on every per-attack split ----------------
feature_selection(dataset)
DoSX_train, DoSX_test, DoSy_train, DoSy_test = train_test_dataset(DoS_df)
DDoSX_train, DDoSX_test, DDoSy_train, DDoSy_test = train_test_dataset(DDoS_df)
PS_X_train, PS_X_test, PS_y_train, PS_y_test = train_test_dataset(PortScan_df)
NA_X_train, NA_X_test, NA_y_train, NA_y_test = train_test_dataset(NA_df)
dosrf_score, dosrf_precision, dosrf_recall, dosrf_fscore, none = RandomForest(DoSX_train, DoSX_test, DoSy_train, DoSy_test)
dosdt_score, dosdt_precision, dosdt_recall, dosdt_fscore, none = DecisionTree(DoSX_train, DoSX_test, DoSy_train, DoSy_test)
dosKnn_score, dosKnn_precision, dosKnn_recall, dosKnn_fscore, none = kNN(DoSX_train, DoSX_test, DoSy_train, DoSy_test)
SVM(DoSX_train, DoSX_test, DoSy_train, DoSy_test)
ANN(DoSX_train, DoSX_test, DoSy_train, DoSy_test)
labels = np.unique(DoSy_train)
print(labels)
psrf_score, psrf_precision, psrf_recall, psrf_fscore, none = RandomForest(PS_X_train, PS_X_test, PS_y_train, PS_y_test)
psdt_score, psdt_precision, psdt_recall, psdt_fscore, none = DecisionTree(PS_X_train, PS_X_test, PS_y_train, PS_y_test)
psKnn_score, psKnn_precision, psKnn_recall, psKnn_fscore, none = kNN(PS_X_train, PS_X_test, PS_y_train, PS_y_test)
SVM(PS_X_train, PS_X_test, PS_y_train, PS_y_test)
ANN(PS_X_train, PS_X_test, PS_y_train, PS_y_test)
ddosrf_score, ddosrf_precision, ddosrf_recall, ddosrf_fscore, none = RandomForest(DDoSX_train, DDoSX_test, DDoSy_train, DDoSy_test)
ddosdt_score, ddosdt_precision, ddosdt_recall, ddosdt_fscore, none = DecisionTree(DDoSX_train, DDoSX_test, DDoSy_train, DDoSy_test)
ddosKnn_score, ddosKnn_precision, ddosKnn_recall, ddosKnn_fscore, none = kNN(DDoSX_train, DDoSX_test, DDoSy_train, DDoSy_test)
SVM(DDoSX_train, DDoSX_test, DDoSy_train, DDoSy_test)
ANN(DDoSX_train, DDoSX_test, DDoSy_train, DDoSy_test)
narf_score, narf_precision, narf_recall, narf_fscore, none = RandomForest(NA_X_train, NA_X_test, NA_y_train, NA_y_test)
nadt_score, nadt_precision, nadt_recall, nadt_fscore, none = DecisionTree(NA_X_train, NA_X_test, NA_y_train, NA_y_test)
naKnn_score, naKnn_precision, naKnn_recall, naKnn_fscore, none = kNN(NA_X_train, NA_X_test, NA_y_train, NA_y_test)
SVM(NA_X_train, NA_X_test, NA_y_train, NA_y_test)
ANN(NA_X_train, NA_X_test, NA_y_train, NA_y_test)
# Summary table. NOTE(review): the ANN accuracies are hard-coded constants
# copied from earlier runs, not computed above — confirm against the paper.
d = {
    'Algoritmalar': ["Random Forest", "Decision Tree", "KNN", "ANN"],
    'DoS accuracy': [dosrf_score, dosdt_score, dosKnn_score, 0.7636],
    'DDoS accuracy': [ddosrf_score, ddosdt_score, ddosKnn_score, 0.8307],
    'Port Scan accuracy': [psrf_score, psdt_score, psKnn_score, 0.8738],
    'Normal/Anormal accuracy': [narf_score, nadt_score, naKnn_score, 0.6034],
}
dataframe = pd.DataFrame(data=d)
dataframe  # bare expression: displays only in a notebook