├── DDoS_ML.py
└── README.md


/DDoS_ML.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """EECE 490-M3.ipynb
  3 | 
  4 | Automatically generated by Colaboratory.
  5 | 
  6 | Original file is located at
  7 |     https://colab.research.google.com/drive/1amfBeMoL-SOQ6pTMd8Sdc2hETtFsJkHE
  8 | 
  9 | SVM
 10 | 
 11 | # SVM
 12 | """
 13 | 
 14 | import numpy as np
 15 | import pandas as pd
 16 | import pickle
 17 | import joblib
 18 | import matplotlib.pyplot as plt
 19 | from numpy import loadtxt
 20 | from sklearn.model_selection import train_test_split
 21 | from sklearn.preprocessing import StandardScaler
 22 | from sklearn.svm import SVC
 23 | from sklearn.metrics import confusion_matrix, classification_report
 24 | from sklearn.metrics import accuracy_score
 25 | from sklearn.metrics import f1_score
 26 | from sklearn.metrics import precision_score
 27 | from tqdm import tqdm
 28 | from xgboost import plot_importance
 29 | from sklearn.metrics import confusion_matrix
 30 | from sklearn.metrics import recall_score
 31 | 
 32 | #run this if you want to upload the dataset from your google drive 
 33 | from google.colab import drive
 34 | drive.mount("/content/gdrive")
 35 | 
 36 | df= pd.concat(map(pd.read_csv, ['/content/gdrive/My Drive/metasploitable-2.csv', '/content/gdrive/My Drive/Normal_data.csv']), ignore_index=True)
 37 | df["Src IP"] = [float(str(i).replace(".", "")) for i in df["Src IP"]]
 38 | df["Dst IP"] = [float(str(i).replace(".", "")) for i in df["Dst IP"]]
 39 | df["Flow ID"] = [float(str(i).replace(".", "").replace("-", "")) for i in df["Flow ID"]]
 40 | df["Timestamp"] = [float(str(i).replace("/", "").replace(":", "").replace(" ","").replace("PM","")) for i in df["Timestamp"]]
 41 | print(df)
 42 | 
 43 | #run this if you want to upload dataset from local device
 44 | from google.colab import files
 45 | uploaded=files.upload()
 46 | 
 47 | # Import the dataset
 48 | df = pd.concact(map(pd.read_csv, ['metasploitable-2.csv','Normal_data.csv']), ignore_index=True)
 49 | df["Src IP"] = [float(str(i).replace(".", "")) for i in df["Src IP"]]
 50 | df["Dst IP"] = [float(str(i).replace(".", "")) for i in df["Dst IP"]]
 51 | df["Flow ID"] = [float(str(i).replace(".", "").replace("-", "")) for i in df["Flow ID"]]
 52 | df["Timestamp"] = [float(str(i).replace("/", "").replace(":", "").replace(" ","").replace("PM","")) for i in df["Timestamp"]]
 53 | print(df)
 54 | 
 55 | # Splitting dataset into features and label
 56 | X = df.drop('Label', axis=1)
 57 | y = df['Label']
 58 | 
 59 | # Splitting the dataset into the training set and the test set
 60 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
 61 | 
 62 | # Feature scaling 
 63 | scaler = StandardScaler()
 64 | X_train = scaler.fit_transform(X_train)
 65 | X_test = scaler.transform(X_test)
 66 | 
 67 | # Fitting SVM with the training set
 68 | SVM = SVC(kernel='linear', random_state=0)
 69 | SVM.fit(X_train, y_train)
 70 | 
 71 | # Testing the model by classifying the test set
 72 | y_pred = SVM.predict(X_test)
 73 | 
 74 | # Creating confusion matrix for evaluation
 75 | cm = confusion_matrix(y_test, y_pred)
 76 | cr = classification_report(y_test, y_pred)
 77 | 
 78 | # Print out confusion matrix and report
 79 | print(cm)
 80 | print(cr)
 81 | 
 82 | """NB
 83 | 
 84 | # NB
 85 | """
 86 | 
 87 | import numpy as np
 88 | import pandas as pd
 89 | import matplotlib.pyplot as plt
 90 | from xgboost import plot_importance
 91 | from numpy import loadtxt
 92 | from sklearn.model_selection import train_test_split
 93 | from sklearn.metrics import accuracy_score
 94 | from sklearn.metrics import f1_score
 95 | from sklearn.metrics import precision_score
 96 | import pickle
 97 | from tqdm import tqdm
 98 | from sklearn.metrics import confusion_matrix, classification_report
 99 | from sklearn.naive_bayes import GaussianNB
100 | from sklearn.metrics import recall_score
101 | from sklearn.naive_bayes import GaussianNB
102 | from sklearn.preprocessing import StandardScaler
103 | 
104 | #run this if you want to upload the dataset from your google drive 
105 | from google.colab import drive
106 | drive.mount("/content/gdrive")
107 | 
108 | train= pd.concat(map(pd.read_csv, ['/content/gdrive/My Drive/metasploitable-2.csv', '/content/gdrive/My Drive/Normal_data.csv']))
109 | train["Src IP"] = [float(str(i).replace(".", "")) for i in train["Src IP"]]
110 | train["Dst IP"] = [float(str(i).replace(".", "")) for i in train["Dst IP"]]
111 | train["Flow ID"] = [float(str(i).replace(".", "").replace("-", "")) for i in train["Flow ID"]]
112 | train["Timestamp"] = [float(str(i).replace("/", "").replace(":", "").replace(" ","").replace("PM","")) for i in train["Timestamp"]]
113 | print(train)
114 | 
115 | from sklearn import preprocessing 
116 | for f in train.columns: 
117 |   if train[f].dtype=='object': 
118 |     label = preprocessing.LabelEncoder() 
119 |     label.fit(list(train[f].values)) 
120 |     train[f] = label.transform(list(train[f].values))
121 | train.fillna((-999), inplace=True) 
122 | train=np.array(train) 
123 | train = train.astype(float)
124 | 
125 | train = pd.DataFrame(train)
126 | #print(train)
127 | 
128 | X = train.drop(train.columns[[81]], axis=1)
129 | y=train[train.columns[[81]]]
130 | 
131 | # split data into train and test sets
132 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)
133 | 
134 | X_train = StandardScaler().fit_transform(X_train)
135 | X_test = StandardScaler().fit_transform(X_test)
136 | 
137 | NB=GaussianNB()
138 | NB.fit(X_train, y_train)
139 | 
140 | y_pred = NB.predict(X_test)
141 | 
142 | accuracy = accuracy_score(y_test, y_pred)
143 | print("accuracy:",accuracy)
144 | 
145 | f1score=f1_score(y_test, y_pred, average='micro')
146 | print("f1-acore:",f1score)
147 | 
148 | cm=confusion_matrix(y_test, y_pred)
149 | print("confusion matrix:",cm)
150 | 
151 | print(classification_report(y_test, y_pred))
152 | 
153 | pr=precision_score(y_test,y_pred, average='micro')
154 | print("Precision:",pr)
155 | 
156 | """KNN
157 | 
158 | # KNN
159 | """
160 | 
161 | import numpy as np
162 | import pandas as pd
163 | from sklearn.model_selection import train_test_split
164 | from sklearn.neighbors import KNeighborsClassifier
165 | from sklearn.preprocessing import StandardScaler
166 | from sklearn.metrics import classification_report, confusion_matrix
167 | 
168 | #run this if you want to upload the dataset from your google drive 
169 | from google.colab import drive
170 | drive.mount("/content/gdrive")
171 | 
172 | #import the dataset 
173 | df= pd.concat(map(pd.read_csv, ['/content/gdrive/My Drive/metasploitable-2.csv', '/content/gdrive/My Drive/Normal_data.csv']), ignore_index=True)
174 | df["Src IP"] = [float(str(i).replace(".", "")) for i in df["Src IP"]]
175 | df["Dst IP"] = [float(str(i).replace(".", "")) for i in df["Dst IP"]]
176 | df["Flow ID"] = [float(str(i).replace(".", "").replace("-", "")) for i in df["Flow ID"]]
177 | df["Timestamp"] = [float(str(i).replace("/", "").replace(":", "").replace(" ","").replace("PM","")) for i in df["Timestamp"]]
178 | 
179 | print(df)
180 | 
181 | #here we are preprocessing 
182 | 
183 | X=df.drop('Label',1)
184 | y = df['Label'].values
185 | 
186 | #Split the training and testing data
187 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
188 | 
189 | #feature Scaling
190 | scaler = StandardScaler()
191 | scaler.fit(X_train)
192 | 
193 | X_train = scaler.transform(X_train)
194 | X_test = scaler.transform(X_test)
195 | 
196 | #here we train the KNN algorithm to make predictions with it
197 | 
198 | KNN = KNeighborsClassifier(n_neighbors=5)
199 | KNN.fit(X_train, y_train)
200 | 
201 | #make predictions on the test data 
202 | y_pred = KNN.predict(X_test)
203 | 
204 | print(confusion_matrix(y_test, y_pred))
205 | 
206 | print(classification_report(y_test, y_pred))
207 | 
208 | """RF
209 | 
210 | # RF
211 | """
212 | 
213 | import numpy as np
214 | import pandas as pd
215 | from sklearn.model_selection import train_test_split
216 | from sklearn.preprocessing import StandardScaler
217 | from sklearn.ensemble import RandomForestClassifier
218 | from sklearn.metrics import confusion_matrix, classification_report
219 | 
220 | #run this if you want to upload the dataset from your google drive 
221 | from google.colab import drive
222 | drive.mount("/content/gdrive")
223 | 
224 | #import the dataset 
225 | df = pd.concat(map(pd.read_csv, ['/content/gdrive/My Drive/metasploitable-2.csv', '/content/gdrive/My Drive/Normal_data.csv']), ignore_index=True)
226 | df["Src IP"] = [float(str(i).replace(".", "")) for i in df["Src IP"]]
227 | df["Dst IP"] = [float(str(i).replace(".", "")) for i in df["Dst IP"]]
228 | df["Flow ID"] = [float(str(i).replace(".", "").replace("-", "")) for i in df["Flow ID"]]
229 | df["Timestamp"] = [float(str(i).replace("/", "").replace(":", "").replace(" ","").replace("PM","")) for i in df["Timestamp"]]
230 | 
231 | print(df)
232 | 
233 | #here we are preprocessing 
234 | X = df.drop('Label',1)
235 | y = df['Label'].values
236 | 
237 | # Splitting the dataset into the Training set and Test set
238 | X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size = 0.25, random_state = 0)
239 | 
240 | # Feature Scaling
241 | X_Train = StandardScaler().fit_transform(X_Train)
242 | X_Test = StandardScaler().fit_transform(X_Test)
243 | 
244 | # Fitting the classifier into the Training set
245 | RF = RandomForestClassifier(n_estimators = 200, criterion = 'entropy', random_state = 0)
246 | RF.fit(X_Train,y_Train)
247 | 
248 | # Predicting the test set results
249 | y_Pred = RF.predict(X_Test)
250 | 
251 | # Making the Confusion Matrix 
252 | cm = confusion_matrix(y_Test, y_Pred)
253 | print(cm)
254 | 
255 | print(classification_report(y_Test, y_Pred))
256 | 
257 | """DT
258 | 
259 | # DT
260 | """
261 | 
262 | import numpy as np
263 | import pandas as pd
264 | from sklearn.model_selection import train_test_split
265 | from sklearn.preprocessing import StandardScaler
266 | from sklearn.metrics import confusion_matrix, classification_report
267 | from sklearn.tree import DecisionTreeClassifier
268 | 
269 | #run this if you want to upload the dataset from your google drive 
270 | from google.colab import drive
271 | drive.mount("/content/gdrive")
272 | 
273 | #import the dataset 
274 | df = pd.concat(map(pd.read_csv, ['/content/gdrive/My Drive/metasploitable-2.csv', '/content/gdrive/My Drive/Normal_data.csv']), ignore_index=True)
275 | df["Src IP"] = [float(str(i).replace(".", "")) for i in df["Src IP"]]
276 | df["Dst IP"] = [float(str(i).replace(".", "")) for i in df["Dst IP"]]
277 | df["Flow ID"] = [float(str(i).replace(".", "").replace("-", "")) for i in df["Flow ID"]]
278 | df["Timestamp"] = [float(str(i).replace("/", "").replace(":", "").replace(" ","").replace("PM","")) for i in df["Timestamp"]]
279 | 
280 | print(df)
281 | 
282 | #here we are preprocessing 
283 | X = df.drop('Label',1)
284 | y = df['Label'].values
285 | 
286 | # Splitting the dataset into the Training set and Test set
287 | X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size = 0.25, random_state = 0)
288 | 
289 | # Feature Scaling
290 | X_Train = StandardScaler().fit_transform(X_Train)
291 | X_Test = StandardScaler().fit_transform(X_Test)
292 | 
293 | # Fitting the classifier into the Training set
294 | DT = DecisionTreeClassifier(max_depth=6, random_state=1)
295 | DT.fit(X_Train, y_Train)
296 | 
297 | # Predicting the test set results
298 | y_Pred = DT.predict(X_Test)
299 | 
300 | # Making the Confusion Matrix 
301 | cm = confusion_matrix(y_Test, y_Pred)
302 | print(cm)
303 | 
304 | print(classification_report(y_Test, y_Pred))
305 | 
306 | """Stacked Model
307 | 
308 | # Stacked Model
309 | """
310 | 
311 | import numpy as np
312 | import pandas as pd
313 | from sklearn.ensemble import StackingClassifier
314 | from sklearn.linear_model import LogisticRegression
315 | from sklearn.metrics import accuracy_score
316 | from sklearn.metrics import f1_score
317 | from sklearn.metrics import confusion_matrix, classification_report
318 | 
319 | #run this if you want to upload the dataset from your google drive 
320 | from google.colab import drive
321 | drive.mount("/content/gdrive")
322 | 
323 | #import the dataset 
324 | df = pd.concat(map(pd.read_csv, ['/content/gdrive/My Drive/metasploitable-2.csv', '/content/gdrive/My Drive/Normal_data.csv']), ignore_index=True)
325 | df["Src IP"] = [float(str(i).replace(".", "")) for i in df["Src IP"]]
326 | df["Dst IP"] = [float(str(i).replace(".", "")) for i in df["Dst IP"]]
327 | df["Flow ID"] = [float(str(i).replace(".", "").replace("-", "")) for i in df["Flow ID"]]
328 | df["Timestamp"] = [float(str(i).replace("/", "").replace(":", "").replace(" ","").replace("PM","")) for i in df["Timestamp"]]
329 | 
330 | #here we are preprocessing 
331 | X = df.drop('Label',1)
332 | y = df['Label'].values
333 | 
334 | # Splitting the dataset into the Training set and Test set
335 | X_Train, X_Test, y_Train, y_Test = train_test_split(X, y, test_size = 0.25, random_state = 0)
336 | 
337 | # Feature Scaling
338 | X_Train = StandardScaler().fit_transform(X_Train)
339 | X_Test = StandardScaler().fit_transform(X_Test)
340 | 
341 | estimator_list = [
342 |     ('SVM',SVM),
343 |     ('NB',NB),
344 |     ('KNN',KNN),
345 |     ('RF',RF),
346 |     ('DT',DT)
347 |     ]
348 | 
349 | # Build and fit stack model
350 | stack_model = StackingClassifier(
351 |     estimators=estimator_list, final_estimator=LogisticRegression())
352 | stack_model.fit(X_train, y_train)
353 | 
354 | # Make predictions
355 | y_train_pred = stack_model.predict(X_train)
356 | y_test_pred = stack_model.predict(X_test)
357 | 
358 | # Training set model performance
359 | train_acc = accuracy_score(y_train, y_train_pred)  
360 | train_f1 = f1_score(y_train, y_train_pred, average='weighted') 
361 | train_pr=precision_score(y_train,y_train_pred, average='micro')
362 | 
363 | 
364 | print('Model performance for Training set')
365 | print('Accuracy: %s' % train_acc)
366 | print('F1 score: %s' % train_f1)
367 | print("Precision:%s",train_pr)
368 | 
369 | # Test set model performance
370 | test_acc = accuracy_score(y_test, y_test_pred)  
371 | test_f1 = f1_score(y_test, y_test_pred, average='weighted') 
372 | test_pr=precision_score(y_test,y_test_pred, average='micro')
373 | 
374 | print('Model performance for Test set')
375 | print('Accuracy: %s' % test_acc)
376 | print('F1 score: %s' % test_f1)
377 | print("Precision:%s",test_pr)
378 | 
379 | cm=confusion_matrix(y_test, y_test_pred)
380 | print(cm)
381 | 
382 | print(classification_report(y_test, y_test_pred))
383 | 
384 | """References
385 | 
386 | [1] 	icesonata, "DDoSDN," GitHub, January 6, 2021.
387 | [2] 	devendra416, "ML-DDoS-Detection-SGB," GitHub, April 30, 2019.
388 | [3] 	mahesh147, "Random-Forest-Classifier," GitHub, January 22, 2018.
389 | [4] 	dataprofessor, "Stacking_Classifier," GitHub, April 11, 2021.
390 | """


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # DDoS-attacks-detection-on-SDNs-using-ML-models
 2 | 
 3 | As a part of EECE 490 - Intro to Machine Learning course, we stacked five machine learning models into one model to increase accuracy and performance rates of DDoS attacks detection and mitigation on SDNs:
 4 | - Dataset used: InSDN dataset containing normal generated traffic and different types of attacks such as DDoS, DoS, U2R, BFA, etc. The dataset consists of 57 attributes and 136743 training points.
 5 | - Source of dataset: N.-A. L.-K. ,. A. D. J. MAHMOUD SAID ELSAYED, ”InSDN: A Novel SDN Intrusion Dataset,” IEEE Explore, 8 September 2020 . [Online].Available: https://ieeexplore.ieee.org/document/9187858
 6 | - Machine learning models used: support vector machine, decision tree, random forest, naive bayes, and k-nearset neighbor.
 7 | - All models were trained and tested separately on the InSDN dataset. A confusion matrix and classification report were created to evaluate the outcomes.
 8 | - All models were stacked into one "smart detection stacking model" using a stacking classifier with accuracy and f1-score higher than the aforementioned models.
 9 | 
10 | ## We used: 
11 | 
12 | Python, scikit-learn, pandas, numpy, matplotlib
13 | 


--------------------------------------------------------------------------------