├── DDoS_ML.py └── README.md /DDoS_ML.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """EECE 490-M3.ipynb 3 | 4 | Automatically generated by Colaboratory. 5 | 6 | Original file is located at 7 | https://colab.research.google.com/drive/1amfBeMoL-SOQ6pTMd8Sdc2hETtFsJkHE 8 | 9 | SVM 10 | 11 | # SVM 12 | """ 13 | 14 | import numpy as np 15 | import pandas as pd 16 | import pickle 17 | import joblib 18 | import matplotlib.pyplot as plt 19 | from numpy import loadtxt 20 | from sklearn.model_selection import train_test_split 21 | from sklearn.preprocessing import StandardScaler 22 | from sklearn.svm import SVC 23 | from sklearn.metrics import confusion_matrix, classification_report 24 | from sklearn.metrics import accuracy_score 25 | from sklearn.metrics import f1_score 26 | from sklearn.metrics import precision_score 27 | from tqdm import tqdm 28 | from xgboost import plot_importance 29 | from sklearn.metrics import confusion_matrix 30 | from sklearn.metrics import recall_score 31 | 32 | #run this if you want to upload the dataset from your google drive 33 | from google.colab import drive 34 | drive.mount("/content/gdrive") 35 | 36 | df= pd.concat(map(pd.read_csv, ['/content/gdrive/My Drive/metasploitable-2.csv', '/content/gdrive/My Drive/Normal_data.csv']), ignore_index=True) 37 | df["Src IP"] = [float(str(i).replace(".", "")) for i in df["Src IP"]] 38 | df["Dst IP"] = [float(str(i).replace(".", "")) for i in df["Dst IP"]] 39 | df["Flow ID"] = [float(str(i).replace(".", "").replace("-", "")) for i in df["Flow ID"]] 40 | df["Timestamp"] = [float(str(i).replace("/", "").replace(":", "").replace(" ","").replace("PM","")) for i in df["Timestamp"]] 41 | print(df) 42 | 43 | #run this if you want to upload dataset from local device 44 | from google.colab import files 45 | uploaded=files.upload() 46 | 47 | # Import the dataset 48 | df = pd.concact(map(pd.read_csv, ['metasploitable-2.csv','Normal_data.csv']), ignore_index=True) 49 | df["Src IP"] = [float(str(i).replace(".", "")) for i in df["Src IP"]] 50 | df["Dst IP"] = [float(str(i).replace(".", "")) for i in df["Dst IP"]] 51 | df["Flow ID"] = [float(str(i).replace(".", "").replace("-", "")) for i in df["Flow ID"]] 52 | df["Timestamp"] = [float(str(i).replace("/", "").replace(":", "").replace(" ","").replace("PM","")) for i in df["Timestamp"]] 53 | print(df) 54 | 55 | # Splitting dataset into features and label 56 | X = df.drop('Label', axis=1) 57 | y = df['Label'] 58 | 59 | # Splitting the dataset into the training set and the test set 60 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 61 | 62 | # Feature scaling 63 | scaler = StandardScaler() 64 | X_train = scaler.fit_transform(X_train) 65 | X_test = scaler.transform(X_test) 66 | 67 | # Fitting SVM with the training set 68 | SVM = SVC(kernel='linear', random_state=0) 69 | SVM.fit(X_train, y_train) 70 | 71 | # Testing the model by classifying the test set 72 | y_pred = SVM.predict(X_test) 73 | 74 | # Creating confusion matrix for evaluation 75 | cm = confusion_matrix(y_test, y_pred) 76 | cr = classification_report(y_test, y_pred) 77 | 78 | # Print out confusion matrix and report 79 | print(cm) 80 | print(cr) 81 | 82 | """NB 83 | 84 | # NB 85 | """ 86 | 87 | import numpy as np 88 | import pandas as pd 89 | import matplotlib.pyplot as plt 90 | from xgboost import plot_importance 91 | from numpy import loadtxt 92 | from sklearn.model_selection import train_test_split 93 | from sklearn.metrics import accuracy_score 94 | from sklearn.metrics import f1_score 95 | from sklearn.metrics import precision_score 96 | import pickle 97 | from tqdm import tqdm 98 | from sklearn.metrics import confusion_matrix, classification_report 99 | from sklearn.naive_bayes import GaussianNB 100 | from sklearn.metrics import recall_score 101 | from sklearn.naive_bayes import GaussianNB 102 | from sklearn.preprocessing import StandardScaler 103 | 104 | #run this if you want to upload the dataset from your google drive 105 | from google.colab import drive 106 | drive.mount("/content/gdrive") 107 | 108 | train= pd.concat(map(pd.read_csv, ['/content/gdrive/My Drive/metasploitable-2.csv', '/content/gdrive/My Drive/Normal_data.csv'])) 109 | train["Src IP"] = [float(str(i).replace(".", "")) for i in train["Src IP"]] 110 | train["Dst IP"] = [float(str(i).replace(".", "")) for i in train["Dst IP"]] 111 | train["Flow ID"] = [float(str(i).replace(".", "").replace("-", "")) for i in train["Flow ID"]] 112 | train["Timestamp"] = [float(str(i).replace("/", "").replace(":", "").replace(" ","").replace("PM","")) for i in train["Timestamp"]] 113 | print(train) 114 | 115 | from sklearn import preprocessing 116 | for f in train.columns: 117 | if train[f].dtype=='object': 118 | label = preprocessing.LabelEncoder() 119 | label.fit(list(train[f].values)) 120 | train[f] = label.transform(list(train[f].values)) 121 | train.fillna((-999), inplace=True) 122 | train=np.array(train) 123 | train = train.astype(float) 124 | 125 | train = pd.DataFrame(train) 126 | #print(train) 127 | 128 | X = train.drop(train.columns[[81]], axis=1) 129 | y=train[train.columns[[81]]] 130 | 131 | # split data into train and test sets 132 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7) 133 | 134 | X_train = StandardScaler().fit_transform(X_train) 135 | X_test = StandardScaler().fit_transform(X_test) 136 | 137 | NB=GaussianNB() 138 | NB.fit(X_train, y_train) 139 | 140 | y_pred = NB.predict(X_test) 141 | 142 | accuracy = accuracy_score(y_test, y_pred) 143 | print("accuracy:",accuracy) 144 | 145 | f1score=f1_score(y_test, y_pred, average='micro') 146 | print("f1-acore:",f1score) 147 | 148 | cm=confusion_matrix(y_test, y_pred) 149 | print("confusion matrix:",cm) 150 | 151 | print(classification_report(y_test, y_pred)) 152 | 153 | pr=precision_score(y_test,y_pred, average='micro') 154 | print("Precision:",pr) 155 | 156 | """KNN 157 | 158 | # KNN 159 | """ 160 | 161 | import numpy as np 162 | import pandas as pd 163 | from sklearn.model_selection import train_test_split 164 | from sklearn.neighbors import KNeighborsClassifier 165 | from sklearn.preprocessing import StandardScaler 166 | from sklearn.metrics import classification_report, confusion_matrix 167 | 168 | #run this if you want to upload the dataset from your google drive 169 | from google.colab import drive 170 | drive.mount("/content/gdrive") 171 | 172 | #import the dataset 173 | df= pd.concat(map(pd.read_csv, ['/content/gdrive/My Drive/metasploitable-2.csv', '/content/gdrive/My Drive/Normal_data.csv']), ignore_index=True) 174 | df["Src IP"] = [float(str(i).replace(".", "")) for i in df["Src IP"]] 175 | df["Dst IP"] = [float(str(i).replace(".", "")) for i in df["Dst IP"]] 176 | df["Flow ID"] = [float(str(i).replace(".", "").replace("-", "")) for i in df["Flow ID"]] 177 | df["Timestamp"] = [float(str(i).replace("/", "").replace(":", "").replace(" ","").replace("PM","")) for i in df["Timestamp"]] 178 | 179 | print(df) 180 | 181 | #here we are preprocessing 182 | 183 | X=df.drop('Label',1) 184 | y = df['Label'].values 185 | 186 | #Split the training and testing data 187 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20) 188 | 189 | #feature Scaling 190 | scaler = StandardScaler() 191 | scaler.fit(X_train) 192 | 193 | X_train = scaler.transform(X_train) 194 | X_test = scaler.transform(X_test) 195 | 196 | #here we train the KNN algorithm to make predictions with it 197 | 198 | KNN = KNeighborsClassifier(n_neighbors=5) 199 | KNN.fit(X_train, y_train) 200 | 201 | #make predictions on the test data 202 | y_pred = KNN.predict(X_test) 203 | 204 | print(confusion_matrix(y_test, y_pred)) 205 | 206 | print(classification_report(y_test, y_pred)) 207 | 208 | """RF 209 | 210 | # RF 211 | """ 212 | 213 | import numpy as np 214 | import pandas as pd 215 | from sklearn.model_selection import train_test_split 216 | from sklearn.preprocessing import StandardScaler 217 | from sklearn.ensemble import RandomForestClassifier 218 | from sklearn.metrics import confusion_matrix, classification_report 219 | 220 | #run this if you want to upload the dataset from your google drive 221 | from google.colab import drive 222 | drive.mount("/content/gdrive") 223 | 224 | #import the dataset 225 | df = pd.concat(map(pd.read_csv, ['/content/gdrive/My Drive/metasploitable-2.csv', '/content/gdrive/My Drive/Normal_data.csv']), ignore_index=True) 226 | df["Src IP"] = [float(str(i).replace(".", "")) for i in df["Src IP"]] 227 | df["Dst IP"] = [float(str(i).replace(".", "")) for i in df["Dst IP"]] 228 | df["Flow ID"] = [float(str(i).replace(".", "").replace("-", "")) for i in df["Flow ID"]] 229 | df["Timestamp"] = [float(str(i).replace("/", "").replace(":", "").replace(" ","").replace("PM","")) for i in df["Timestamp"]] 230 | 231 | print(df) 232 | 233 | #here we are preprocessing 234 | X = df.drop('Label',1) 235 | y = df['Label'].values 236 | 237 | # Splitting the dataset into the Training set and Test set 238 | X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size = 0.25, random_state = 0) 239 | 240 | # Feature Scaling 241 | X_Train = StandardScaler().fit_transform(X_Train) 242 | X_Test = StandardScaler().fit_transform(X_Test) 243 | 244 | # Fitting the classifier into the Training set 245 | RF = RandomForestClassifier(n_estimators = 200, criterion = 'entropy', random_state = 0) 246 | RF.fit(X_Train,y_Train) 247 | 248 | # Predicting the test set results 249 | y_Pred = RF.predict(X_Test) 250 | 251 | # Making the Confusion Matrix 252 | cm = confusion_matrix(y_Test, y_Pred) 253 | print(cm) 254 | 255 | print(classification_report(y_Test, y_Pred)) 256 | 257 | """DT 258 | 259 | # DT 260 | """ 261 | 262 | import numpy as np 263 | import pandas as pd 264 | from sklearn.model_selection import train_test_split 265 | from sklearn.preprocessing import StandardScaler 266 | from sklearn.metrics import confusion_matrix, classification_report 267 | from sklearn.tree import DecisionTreeClassifier 268 | 269 | #run this if you want to upload the dataset from your google drive 270 | from google.colab import drive 271 | drive.mount("/content/gdrive") 272 | 273 | #import the dataset 274 | df = pd.concat(map(pd.read_csv, ['/content/gdrive/My Drive/metasploitable-2.csv', '/content/gdrive/My Drive/Normal_data.csv']), ignore_index=True) 275 | df["Src IP"] = [float(str(i).replace(".", "")) for i in df["Src IP"]] 276 | df["Dst IP"] = [float(str(i).replace(".", "")) for i in df["Dst IP"]] 277 | df["Flow ID"] = [float(str(i).replace(".", "").replace("-", "")) for i in df["Flow ID"]] 278 | df["Timestamp"] = [float(str(i).replace("/", "").replace(":", "").replace(" ","").replace("PM","")) for i in df["Timestamp"]] 279 | 280 | print(df) 281 | 282 | #here we are preprocessing 283 | X = df.drop('Label',1) 284 | y = df['Label'].values 285 | 286 | # Splitting the dataset into the Training set and Test set 287 | X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size = 0.25, random_state = 0) 288 | 289 | # Feature Scaling 290 | X_Train = StandardScaler().fit_transform(X_Train) 291 | X_Test = StandardScaler().fit_transform(X_Test) 292 | 293 | # Fitting the classifier into the Training set 294 | DT = DecisionTreeClassifier(max_depth=6, random_state=1) 295 | DT.fit(X_Train, y_Train) 296 | 297 | # Predicting the test set results 298 | y_Pred = DT.predict(X_Test) 299 | 300 | # Making the Confusion Matrix 301 | cm = confusion_matrix(y_Test, y_Pred) 302 | print(cm) 303 | 304 | print(classification_report(y_Test, y_Pred)) 305 | 306 | """Stacked Model 307 | 308 | # Stacked Model 309 | """ 310 | 311 | import numpy as np 312 | import pandas as pd 313 | from sklearn.ensemble import StackingClassifier 314 | from sklearn.linear_model import LogisticRegression 315 | from sklearn.metrics import accuracy_score 316 | from sklearn.metrics import f1_score 317 | from sklearn.metrics import confusion_matrix, classification_report 318 | 319 | #run this if you want to upload the dataset from your google drive 320 | from google.colab import drive 321 | drive.mount("/content/gdrive") 322 | 323 | #import the dataset 324 | df = pd.concat(map(pd.read_csv, ['/content/gdrive/My Drive/metasploitable-2.csv', '/content/gdrive/My Drive/Normal_data.csv']), ignore_index=True) 325 | df["Src IP"] = [float(str(i).replace(".", "")) for i in df["Src IP"]] 326 | df["Dst IP"] = [float(str(i).replace(".", "")) for i in df["Dst IP"]] 327 | df["Flow ID"] = [float(str(i).replace(".", "").replace("-", "")) for i in df["Flow ID"]] 328 | df["Timestamp"] = [float(str(i).replace("/", "").replace(":", "").replace(" ","").replace("PM","")) for i in df["Timestamp"]] 329 | 330 | #here we are preprocessing 331 | X = df.drop('Label',1) 332 | y = df['Label'].values 333 | 334 | # Splitting the dataset into the Training set and Test set 335 | X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size = 0.25, random_state = 0) 336 | 337 | # Feature Scaling 338 | X_Train = StandardScaler().fit_transform(X_Train) 339 | X_Test = StandardScaler().fit_transform(X_Test) 340 | 341 | estimator_list = [ 342 | ('SVM',SVM), 343 | ('NB',NB), 344 | ('KNN',KNN), 345 | ('RF',RF), 346 | ('DT',DT) 347 | ] 348 | 349 | # Build and fit stack model 350 | stack_model = StackingClassifier( 351 | estimators=estimator_list, final_estimator=LogisticRegression()) 352 | stack_model.fit(X_train, y_train) 353 | 354 | # Make predictions 355 | y_train_pred = stack_model.predict(X_train) 356 | y_test_pred = stack_model.predict(X_test) 357 | 358 | # Training set model performance 359 | train_acc = accuracy_score(y_train, y_train_pred) 360 | train_f1 = f1_score(y_train, y_train_pred, average='weighted') 361 | train_pr=precision_score(y_train,y_train_pred, average='micro') 362 | 363 | 364 | print('Model performance for Training set') 365 | print('Accuracy: %s' % train_acc) 366 | print('F1 score: %s' % train_f1) 367 | print("Precision:%s",train_pr) 368 | 369 | # Test set model performance 370 | test_acc = accuracy_score(y_test, y_test_pred) 371 | test_f1 = f1_score(y_test, y_test_pred, average='weighted') 372 | test_pr=precision_score(y_test,y_test_pred, average='micro') 373 | 374 | print('Model performance for Test set') 375 | print('Accuracy: %s' % test_acc) 376 | print('F1 score: %s' % test_f1) 377 | print("Precision:%s",test_pr) 378 | 379 | cm=confusion_matrix(y_test, y_test_pred) 380 | print(cm) 381 | 382 | print(classification_report(y_test, y_test_pred)) 383 | 384 | """References 385 | 386 | [1] icesonata, "DDoSDN," GitHub, January 6, 2021. 387 | [2] devendra416, "ML-DDoS-Detection-SGB," GitHub, April 30, 2019. 388 | [3] mahesh147, "Random-Forest-Classifier," GitHub, January 22, 2018. 389 | [4] dataprofessor, "Stacking_Classifier," GitHub, April 11, 2021. 390 | """ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DDoS-attacks-detection-on-SDNs-using-ML-models 2 | 3 | As a part of EECE 490 - Intro to Machine Learning course, we stacked five machine learning models into one model to increase accuracy and performance rates of DDoS attacks detection and mitigation on SDNs: 4 | - Dataset used: InSDN dataset containing normal generated traffic and different types of attacks such as DDoS, DoS, U2R, BFA, etc. The dataset consists of 57 attributes and 136743 training points. 5 | - Source of dataset: N.-A. L.-K. ,. A. D. J. MAHMOUD SAID ELSAYED, ”InSDN: A Novel SDN Intrusion Dataset,” IEEE Explore, 8 September 2020 . [Online].Available: https://ieeexplore.ieee.org/document/9187858 6 | - Machine learning models used: support vector machine, decision tree, random forest, naive bayes, and k-nearset neighbor. 7 | - All models were trained and tested separately on the InSDN dataset. A confusion matrix and classification report were created to evaluate the outcomes. 8 | - All models were stacked into one "smart detection stacking model" using a stacking classifier with accuracy and f1-score higher than the aforementioned models. 9 | 10 | ## We used: 11 | 12 | Python, scikit-learn, pandas, numpy, matplotlib 13 | --------------------------------------------------------------------------------