├── Decision tree binary classifier
│   └── logreg_attack.py
├── KNN Classifier
│   └── knn.py
├── Logistic regression for binary classification
│   └── logreg_attack.py
└── README.md

/Decision tree binary classifier/logreg_attack.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Wed July 18 11:04:32 2018
5 | 
6 | @author: Ke-Hsin,Lo
7 | """
8 | 
9 | from __future__ import absolute_import
10 | from __future__ import division
11 | from __future__ import print_function
12 | # import necessary packages
13 | import math
14 | import sys
15 | import numpy
16 | from numpy import Inf
17 | import pandas as pd
18 | import sklearn.metrics
19 | import sklearn.model_selection
20 | import sklearn.linear_model
21 | import sklearn.preprocessing
22 | import random
23 | import matplotlib.pyplot as plt
24 | 
25 | def load_train_data(train_ratio=1):
26 |     data = pd.read_csv('./UNSW_NB15_training-set_selected.csv', header=None,
27 |                        names=['x%i' % (i) for i in range(37)] + ['y'])
28 |     Xt = numpy.asarray(data[['x%i' % (i) for i in range(37)]])
29 |     yt = numpy.asarray(data['y'])
30 |     return sklearn.model_selection.train_test_split(Xt, yt, test_size=1 - train_ratio, random_state=0)
31 | 
32 | 
33 | def load_test_data(train_ratio=0):
34 |     data = pd.read_csv('./UNSW_NB15_testing-set_selected.csv', header=None,
35 |                        names=['x%i' % (i) for i in range(37)] + ['y'])
36 |     Xtt = numpy.asarray(data[['x%i' % (i) for i in range(37)]])
37 |     ytt = numpy.asarray(data['y'])
38 |     return sklearn.model_selection.train_test_split(Xtt, ytt, test_size=1 - train_ratio, random_state=0)
39 | 
40 | 
41 | def scale_features(X_train, X_test, low=0, upp=1):
42 |     minmax_scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(low, upp)).fit(numpy.vstack((X_train)))  # scale each feature to the given range (0-1 by default); the scaler is fit on the training set and reused on the test set
43 | X_train_scale = minmax_scaler.transform(X_train) 44 | X_test_scale = minmax_scaler.transform(X_test) 45 | return X_train_scale, X_test_scale 46 | 47 | 48 | def logistic(x): 49 | return 1.0 / (1 + math.exp(-x)) 50 | 51 | 52 | def logistic_derivative(x): 53 | return logistic(x) * (1 - logistic(x)) 54 | 55 | 56 | def logistic_log_likelihood_i(x_i, y_i, theta): # 0/1 : logL= y * logf + (1-y) * log(1-f) 57 | if y_i == 1.0: 58 | return math.log(logistic(numpy.dot(x_i, theta))) 59 | else: 60 | return math.log(1 - logistic(numpy.dot(x_i, theta))) 61 | 62 | 63 | def logistic_log_likelihood(x, y, beta): 64 | return sum(logistic_log_likelihood_i(x_i, y_i, beta) 65 | for x_i, y_i in zip(x, y)) 66 | 67 | 68 | """i is the index of the data point; 69 | j the index of the derivative""" 70 | 71 | 72 | def logistic_log_partial_ij(x_i, yi, theta, j): #calculate gives the gradient 73 | 74 | return (yi - logistic(numpy.dot(x_i, theta))) * x_i[j] 75 | 76 | """the gradient of the log likelihood 77 | corresponding to the i-th data point""" 78 | 79 | 80 | def logistic_log_gradient_i(xi, yi, theta): #calcaulate its it partial derivative by treating it as a function of just its ith variable, holding the o ther variable fixed 81 | return [logistic_log_partial_ij(xi, yi, theta, j) 82 | for j, _ in enumerate(theta)] 83 | 84 | 85 | def logistic_log_gradient(x, y, beta): 86 | return reduce(vector_add, 87 | [logistic_log_gradient_i(x_i, y_i, beta) 88 | for x_i, y_i in zip(x, y)]) 89 | 90 | 91 | """adds two vectors""" 92 | 93 | 94 | def vector_add(v, w): 95 | return [v_i + w_i for v_i, w_i in zip(v, w)] 96 | 97 | 98 | """scalar number multiplies vector ver 2; same as ver 1""" 99 | 100 | 101 | def scalar_multiply_2(c, v): 102 | row = [] 103 | 104 | row = numpy.asarray(c) * v 105 | 106 | return row 107 | 108 | 109 | def error(xi, yi, theta): 110 | return yi - predict_prob(xi, theta) 111 | 112 | 113 | """evaluated error **2""" 114 | 115 | 116 | def squared_error(xi, yi, theta): 117 | return error(xi, yi, theta) ** 2 118 | 119 | 120 | """the gradient corresponding to the ith squared error term""" 121 | 122 | 123 | def squared_error_gradient(xi, yi, theta): 124 | return [-2 * x_ij * error(xi, yi, theta) 125 | for x_ij in xi] 126 | 127 | 128 | """ calculate ridge penalty""" 129 | 130 | 131 | def ridge_penalty(lamda, theta): 132 | return lamda * numpy.dot(theta[1:], theta[1:]) / 2 133 | 134 | 135 | """calculate ridge gradient simply""" 136 | 137 | 138 | def ridge_penalty_gradient(lamda, theta): 139 | return [0] + [lamda * thetai for thetai in theta[1:]] 140 | 141 | 142 | def logreg_sgd(X, y, alpha=.001, iters=100000, eps=1e-2, lamda=0.001): 143 | n, d = X.shape 144 | # print(n, d) 145 | theta = numpy.zeros((d, 1)) 146 | 147 | random.seed(0) 148 | theta = [random.random() for xi in X[0]] 149 | 150 | gradient_fn = logistic_log_gradient_i 151 | target_fn = logistic_log_likelihood_i # target is to maximize likelihood value (approaching to zero) 152 | 153 | data = zip(X, y) 154 | 155 | alpha_0 = alpha # a step length 156 | max_theta, max_value = -Inf, -500000 157 | counter_of_no_improve = 0 # counter 158 | while counter_of_no_improve < iters: 159 | 160 | log_likelihood_value = sum((target_fn(x_i, y_i, theta) + ridge_penalty(lamda, theta)) for x_i, y_i in 161 | data) / n # According to theory of logistic likelihood; add ridge_penalty to prevent from overfitting. 
162 | print(log_likelihood_value, max_value, max_theta, theta) # print for processing verbosely 163 | if log_likelihood_value > max_value: # if value bigger, it was improved. 164 | print("Likelihood Improved.") 165 | if abs(log_likelihood_value - max_value) < eps: # once training finished, response the maximum theta. 166 | print("Target Minimum eps Achieved( < 1e-2 ): ", abs(log_likelihood_value - max_value)) 167 | max_theta, max_value = theta, log_likelihood_value 168 | return max_theta 169 | else: 170 | print("eps: ", abs(log_likelihood_value - max_value)) # if not smaller than eps, continue training. 171 | 172 | """if find a new maximum, renew the value, and initialize the alpha, which is the walking length.""" 173 | max_theta, max_value = theta, log_likelihood_value # save the newest theta as max_theta for return the output and further training 174 | counter_of_no_improve = 0 175 | alpha = alpha_0 176 | else: 177 | """if it was not improved, narrow the walking length and try to walk next step(shrink the step size).""" 178 | counter_of_no_improve += 1 179 | print("Not improved. iter of Narrow the Step Length: ", counter_of_no_improve) 180 | alpha *= 0.9 181 | 182 | 183 | for xi, yi in data: 184 | gradient_i = gradient_fn(xi, yi, theta) + ridge_penalty_gradient(lamda, theta) # calculate gradient 185 | 186 | theta = vector_add(theta, scalar_multiply_2(alpha, gradient_i)) # take a step 187 | 188 | # if training so many time and over the iterator number, finish training. 189 | theta = max_theta 190 | 191 | return theta 192 | 193 | 194 | def predict_prob(X, theta): # According to theory of logistic likelihood: probability 195 | return 1. / (1 + numpy.exp(-numpy.dot(X, theta))) 196 | 197 | 198 | def evaluate(y_test, y_prob): # Evaluation, in accordance with theory of statics. 199 | tpr = [] 200 | fpr = [] 201 | tp, fp, fn, tn = 0, 0, 0, 0 # true positive, false positive, false negative, true negative. 
202 | for index, i in enumerate(y_test): 203 | j = index 204 | 205 | # print("y_prob:",y_prob[j]) 206 | round_prob=round(y_prob[j]) 207 | if (i == 1 and round_prob == 1): 208 | tp = tp + 1 209 | elif (i == 0 and round_prob == 1): 210 | fp = fp + 1 211 | elif (i == 1 and round_prob == 0): 212 | fn = fn + 1 213 | elif (i == 0 and round_prob == 0): 214 | tn = tn + 1 215 | 216 | # accuracy 217 | correct = tp + tn 218 | total = tp + fp + fn + tn 219 | accuracy = correct / total 220 | 221 | # precision 222 | precision = tp / (tp + fp) 223 | 224 | # recall 225 | recall = tp / (tp + fn) 226 | 227 | # f1_score 228 | p = precision 229 | r = recall 230 | 231 | f1score = 2 * p * r / (p + r) 232 | 233 | print("Accuracy: {0}".format(accuracy)) 234 | print("Precision: {0}".format(precision)) 235 | print("Recall: {0}".format(recall)) 236 | print("F1 Score: {0}".format(f1score)) 237 | 238 | 239 | 240 | def plot_roc_curve(y_test, y_prob): 241 | # compute tpr and fpr of different thresholds 242 | tpr = [] 243 | fpr = [] 244 | plt.plot(fpr, tpr) 245 | plt.xlabel("False Positive Rate") 246 | plt.ylabel("True Positive Rate") 247 | plt.title('ROC ') 248 | plt.xlim(0,1) 249 | plt.ylim(0,1) 250 | plt.gca().set_aspect('equal', adjustable='box') 251 | fpr, tpr, thh = sklearn.metrics.roc_curve(y_test, y_prob, 1) 252 | plt.plot(fpr, tpr, color='green', marker='o', linestyle='solid') 253 | plt.savefig("roc_curve.png") 254 | plt.show() 255 | 256 | def main(argv): 257 | """data preprocessing""" 258 | 259 | """preprocessing x and y of training data""" 260 | x_train2, t1, y_train, t2 = load_train_data(train_ratio=0.99) 261 | """preprocessing x and y of testing data""" 262 | t3, X_test, t4, y_test = load_test_data(train_ratio=0.01) 263 | """scale X dataset""" 264 | X_train_scale, X_test_scale = scale_features(x_train2, X_test, 0, 1) 265 | 266 | """training and get model""" 267 | theta = logreg_sgd(X_train_scale, y_train) 268 | 269 | """result output""" 270 | y_prob = predict_prob(X_train_scale, theta) 271 | print("Logreg train accuracy: %f" % (sklearn.metrics.accuracy_score(y_train, y_prob > .5))) 272 | y_prob = predict_prob(X_test_scale, theta) 273 | print("Logreg test accuracy: %f" % (sklearn.metrics.accuracy_score(y_test, y_prob > .5))) 274 | 275 | evaluate(y_test.flatten(), y_prob.flatten()) 276 | plot_roc_curve(y_test.flatten(), y_prob.flatten()) 277 | 278 | if __name__ == "__main__": 279 | main(sys.argv) 280 | -------------------------------------------------------------------------------- /KNN Classifier/knn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Aug 01 18:13:26 2018 5 | 6 | @author: Ke-Hsin, Lo 7 | """ 8 | 9 | import unicodecsv 10 | import random 11 | import operator 12 | import math 13 | import numpy 14 | 15 | import pandas as pd 16 | import sklearn.metrics 17 | import sklearn.model_selection 18 | import sklearn.linear_model 19 | import sklearn.preprocessing 20 | import matplotlib.pyplot as plt 21 | 22 | # getdata() function definition 23 | def getdata(filename): 24 | with open(filename, 'rb') as f: 25 | reader = unicodecsv.reader(f) 26 | return list(reader) 27 | 28 | 29 | 30 | 31 | 32 | def cosine_similarity(v1, v2): 33 | 34 | sum_xx, sum_xy, sum_yy = 0.0, 0.0, 0.0 35 | # print "len: %d" %(len(v1)) 36 | for i in range(0, len(v1)-1): 37 | # print (v1[i]) 38 | sum_xx += math.pow(float(v1[i]), 2) 39 | sum_xy += float(v1[i]) * float(v2[i]) 40 | sum_yy += math.pow(float(v2[i]), 2) 41 | 42 | 
    return sum_xy / math.sqrt(sum_xx * sum_yy)
43 | 
44 | def cosine_distance(v1, v2):
45 |     return 1 - cosine_similarity(v1, v2)  # cosine distance is the complement of cosine similarity
46 | 
47 | # KNN prediction and model training
48 | def knn_predict(test_data, train_data, k_value, category):
49 |     totalcount = 0
50 |     for i in test_data:  # classify each test row in turn
51 |         cos_similarity_list = []  # (label, similarity) pairs for every training row
52 | 
53 |         classNum = dict()  # vote counter for each neighbour class
54 |         classNum['Normal'] = 0
55 |         classNum['Reconnaissance'] = 0
56 |         classNum['Exploits'] = 0
57 |         classNum['Fuzzers'] = 0
58 |         classNum['DoS'] = 0
59 |         classNum['Generic'] = 0
60 |         classNum['Shellcode'] = 0
61 |         classNum['Analysis'] = 0
62 |         classNum['Worms'] = 0
63 |         classNum['Backdoors'] = 0
64 | 
65 |         jcount = 0
66 | 
67 |         for j in train_data:  # scan the training data to find the nearest points
68 |             # print "i: %s" %(i)
69 |             cos_sm = cosine_similarity(i, j)  # similarity between one test row and one training row
70 |             cos_similarity_list.append((category[jcount], cos_sm))  # keep the class label together with its similarity
71 |             # print cos_similarity_list  # for debugging and observation only
72 |             print "count: %s" %(jcount)
73 |             cos_similarity_list.sort(key=operator.itemgetter(1), reverse=True)  # order by cosine similarity, highest first
74 |             ''' the similarity list is ordered, so the first k entries are the k nearest neighbours '''
75 |             jcount += 1
76 |         totalcount += 1
77 |         print "Processing: %s" % (totalcount)
78 | 
79 |         knn = cos_similarity_list[:k_value]  # select the first k neighbours
80 | 
81 |         print knn
82 |         for k in knn:  # k[0] is the class label of the neighbour
83 |             if k[0] == 'Normal':
84 |                 classNum['Normal'] += 1
85 |             elif k[0] == 'Reconnaissance':
86 |                 classNum['Reconnaissance'] += 1
87 |             elif k[0] == 'Exploits':
88 |                 classNum['Exploits'] += 1
89 |             elif k[0] == 'Fuzzers':
90 |                 classNum['Fuzzers'] += 1
91 |             elif k[0] == 'DoS':
92 |                 classNum['DoS'] += 1
93 |             elif k[0] == 'Generic':
94 |                 classNum['Generic'] += 1
95 |             elif k[0] == 'Shellcode':
96 |                 classNum['Shellcode'] += 1
97 |             elif k[0] == 'Analysis':
98 |                 classNum['Analysis'] += 1
99 |             elif k[0] == 'Worms':
100 |                 classNum['Worms'] += 1
101 |             elif k[0] == 'Backdoors':
102 |                 classNum['Backdoors'] += 1
103 | 
104 | 
105 |         # print "result: %d %d %d %d %d" %(classNum['Normal'], classNum['Reconnaissance'], classNum['Exploits'], classNum['Fuzzers'], classNum['DoS'])
106 |         max_value = max(classNum, key=classNum.get)  # the class with the most votes
107 |         print "max_value %s" %(max_value)
108 | 
109 |         # recover_key(classNum, max_value)
110 | 
111 |         # max_index = recover_key(classNum, max_value)
112 |         # print "max_index %s" %(max_index)
113 |         i.append(max_value)  # append the predicted class to the row
114 | 
115 | def recover_key(dictionary, value):
116 |     for a_key in dictionary.keys():
117 |         if (dictionary[a_key] == value):
118 |             return a_key
119 | 
120 | # Accuracy calculation function
121 | def accuracy(test_data, true_result):
122 |     correct = 0
123 |     jcount = 0  # index into true_result, advanced once per test row
124 |     for i in test_data:
125 |         # print len(i)
126 |         # print i[len(i)-1]
127 |         if true_result[jcount] == i[len(i)-1]:  # the prediction was appended as the last element of the row
128 |             correct += 1
129 |         jcount += 1
130 | 
131 | 
132 |     accuracy = float(correct) / len(test_data) * 100  # accuracy as a percentage
133 |     return accuracy
134 | 
135 | 
136 | def KNN(K, train_x, train_y, test_x, test_y):
137 |     # dataset = getdata('UNSW_NB15_training-set_selected.csv')  # getdata function call with a csv file as parameter
138 |     # print len(dataset)
139 |     # train_dataset, test_dataset = shuffle(dataset)  # train/test split
140 |     # K = 3  # assumed K value
141 | 
142 |     train_dataset = train_x.tolist()
143 |     print "Number of training X: %s" %len(train_dataset)
144 |     print
"Number of training Y: %s" %len(train_y) 145 | test_dataset = test_x.tolist() 146 | print "Number of testing X: %s" %len(test_dataset) 147 | 148 | print "Training Set KNN Process:" 149 | knn_predict(train_dataset, train_dataset, K, train_y) 150 | print "Testing Set KNN Process:" 151 | knn_predict(test_dataset, train_dataset, K, train_y) 152 | atrain = round(accuracy(train_dataset, train_y),5) 153 | TrainError = float(100.00000- float(atrain)) 154 | atest = round(accuracy(test_dataset, test_y),5) 155 | TestError = 100.00000- atest 156 | # print test_dataset 157 | print "Accuracy of train_dataset : ", atrain 158 | print "Train error : ", TrainError 159 | print "Accuracy of test_dataset: ", atest 160 | print "Test error: ", TestError 161 | return TrainError, TestError, atrain, atest 162 | 163 | def load_train_data(train_ratio=0.12): 164 | data = pd.read_csv('./UNSW_NB15_training-set_selected.csv', header=None, 165 | names=['x%i' % (i) for i in range(33)] + ['logic']+['y']) 166 | Xt = numpy.asarray(data[['x%i' % (i) for i in range(33)]]) 167 | yt = numpy.asarray(data['y']) 168 | 169 | return sklearn.model_selection.train_test_split(Xt, yt, test_size=1 - train_ratio, random_state=0) 170 | 171 | 172 | def load_test_data(train_ratio=0.88): 173 | data = pd.read_csv('./UNSW_NB15_testing-set_selected.csv', header=None, 174 | names=['x%i' % (i) for i in range(33)] + ['logic']+['y']) 175 | Xtt = numpy.asarray(data[['x%i' % (i) for i in range(33)]]) 176 | ytt = numpy.asarray(data['y']) 177 | return sklearn.model_selection.train_test_split(Xtt, ytt, test_size=1 - train_ratio, random_state=0) 178 | 179 | def scale_features(X_train, X_test, low=0, upp=1): 180 | minmax_scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(low, upp)).fit(numpy.vstack((X_train 181 | ))) # Transforms features by scaling each feature to a given range(0~1) in order to reinforce dataset and fit training set. 182 | X_train_scale = minmax_scaler.transform(X_train) 183 | X_test_scale = minmax_scaler.transform(X_test) 184 | return X_train_scale, X_test_scale 185 | 186 | """preprocessing x and y of training data""" 187 | x_train2, t1, y_train, t2 = load_train_data(train_ratio=0.003) #1 188 | """preprocessing x and y of testing data""" 189 | t3, X_test, t4, y_test = load_test_data(train_ratio=(1-0.003)) #2 190 | 191 | """scale X dataset""" 192 | X_train_scale, X_test_scale = scale_features(x_train2, X_test, 0, 1) 193 | TrainError = [] 194 | TestError = [] 195 | TrainAccuracy = [] 196 | TestAccuracy = [] 197 | plt.figure(2) 198 | ax1 = plt.subplot(211) 199 | ax2 = plt.subplot(212) 200 | plt.figure(3) 201 | bx1 = plt.subplot(311) 202 | bx2 = plt.subplot(312) 203 | x = [] 204 | 205 | '''knn start: for small sample, start from 1; this from 9 is for this big sample set. 
Because there are same similarity in diffrent kind.''' 206 | for k in range(13,3,-1): #3 207 | print "K: %d" %(k) 208 | TrainErrorTemp, TestErrorTemp, AoTrain, AoTest = KNN(k, x_train2, y_train, X_test, y_test) 209 | TrainError.append(TrainErrorTemp) 210 | TestError.append(TestErrorTemp) 211 | TrainAccuracy.append(AoTrain) 212 | TestAccuracy.append( AoTest) 213 | print " " 214 | x.append(k) 215 | 216 | 217 | plt.sca(ax1) 218 | plt.plot(x, TrainError) 219 | 220 | 221 | plt.sca(ax2) 222 | plt.plot(x, TestError) 223 | 224 | 225 | plt.sca(bx1) 226 | plt.plot(x, TrainAccuracy) 227 | 228 | 229 | plt.sca(bx2) 230 | plt.plot(x, TestAccuracy) 231 | 232 | 233 | plt.xlabel('x axis') # make axis labels 234 | plt.ylabel('y axis') 235 | plt.show() 236 | 237 | -------------------------------------------------------------------------------- /Logistic regression for binary classification/logreg_attack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed July 18 11:04:32 2018 5 | 6 | @author: Ke-Hsin,Lo 7 | """ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | # import nesssary package 13 | import math 14 | import sys 15 | import numpy 16 | from numpy import Inf 17 | import pandas as pd 18 | import sklearn.metrics 19 | import sklearn.model_selection 20 | import sklearn.linear_model 21 | import sklearn.preprocessing 22 | import random 23 | import matplotlib.pyplot as plt 24 | 25 | def load_train_data(train_ratio=1): 26 | data = pd.read_csv('./UNSW_NB15_training-set_selected.csv', header=None, 27 | names=['x%i' % (i) for i in range(37)] + ['y']) 28 | Xt = numpy.asarray(data[['x%i' % (i) for i in range(37)]]) 29 | yt = numpy.asarray(data['y']) 30 | return sklearn.model_selection.train_test_split(Xt, yt, test_size=1 - train_ratio, random_state=0) 31 | 32 | 33 | def load_test_data(train_ratio=0): 34 | data = pd.read_csv('./UNSW_NB15_testing-set_selected.csv', header=None, 35 | names=['x%i' % (i) for i in range(37)] + ['y']) 36 | Xtt = numpy.asarray(data[['x%i' % (i) for i in range(37)]]) 37 | ytt = numpy.asarray(data['y']) 38 | return sklearn.model_selection.train_test_split(Xtt, ytt, test_size=1 - train_ratio, random_state=0) 39 | 40 | 41 | def scale_features(X_train, X_test, low=0, upp=1): 42 | minmax_scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(low, upp)).fit(numpy.vstack((X_train))) # Transforms features by scaling each feature to a given range(0~1) in order to reinforce dataset and fit training set. 
43 | X_train_scale = minmax_scaler.transform(X_train) 44 | X_test_scale = minmax_scaler.transform(X_test) 45 | return X_train_scale, X_test_scale 46 | 47 | 48 | def logistic(x): 49 | return 1.0 / (1 + math.exp(-x)) 50 | 51 | 52 | def logistic_derivative(x): 53 | return logistic(x) * (1 - logistic(x)) 54 | 55 | 56 | def logistic_log_likelihood_i(x_i, y_i, theta): # 0/1 : logL= y * logf + (1-y) * log(1-f) 57 | if y_i == 1.0: 58 | return math.log(logistic(numpy.dot(x_i, theta))) 59 | else: 60 | return math.log(1 - logistic(numpy.dot(x_i, theta))) 61 | 62 | 63 | def logistic_log_likelihood(x, y, beta): 64 | return sum(logistic_log_likelihood_i(x_i, y_i, beta) 65 | for x_i, y_i in zip(x, y)) 66 | 67 | 68 | """i is the index of the data point; 69 | j the index of the derivative""" 70 | 71 | 72 | def logistic_log_partial_ij(x_i, yi, theta, j): #calculate gives the gradient 73 | 74 | return (yi - logistic(numpy.dot(x_i, theta))) * x_i[j] 75 | 76 | """the gradient of the log likelihood 77 | corresponding to the i-th data point""" 78 | 79 | 80 | def logistic_log_gradient_i(xi, yi, theta): #calcaulate its it partial derivative by treating it as a function of just its ith variable, holding the o ther variable fixed 81 | return [logistic_log_partial_ij(xi, yi, theta, j) 82 | for j, _ in enumerate(theta)] 83 | 84 | 85 | def logistic_log_gradient(x, y, beta): 86 | return reduce(vector_add, 87 | [logistic_log_gradient_i(x_i, y_i, beta) 88 | for x_i, y_i in zip(x, y)]) 89 | 90 | 91 | """adds two vectors""" 92 | 93 | 94 | def vector_add(v, w): 95 | return [v_i + w_i for v_i, w_i in zip(v, w)] 96 | 97 | 98 | """scalar number multiplies vector ver 2; same as ver 1""" 99 | 100 | 101 | def scalar_multiply_2(c, v): 102 | row = [] 103 | 104 | row = numpy.asarray(c) * v 105 | 106 | return row 107 | 108 | 109 | def error(xi, yi, theta): 110 | return yi - predict_prob(xi, theta) 111 | 112 | 113 | """evaluated error **2""" 114 | 115 | 116 | def squared_error(xi, yi, theta): 117 | return error(xi, yi, theta) ** 2 118 | 119 | 120 | """the gradient corresponding to the ith squared error term""" 121 | 122 | 123 | def squared_error_gradient(xi, yi, theta): 124 | return [-2 * x_ij * error(xi, yi, theta) 125 | for x_ij in xi] 126 | 127 | 128 | """ calculate ridge penalty""" 129 | 130 | 131 | def ridge_penalty(lamda, theta): 132 | return lamda * numpy.dot(theta[1:], theta[1:]) / 2 133 | 134 | 135 | """calculate ridge gradient simply""" 136 | 137 | 138 | def ridge_penalty_gradient(lamda, theta): 139 | return [0] + [lamda * thetai for thetai in theta[1:]] 140 | 141 | 142 | def logreg_sgd(X, y, alpha=.001, iters=100000, eps=1e-2, lamda=0.001): 143 | n, d = X.shape 144 | # print(n, d) 145 | theta = numpy.zeros((d, 1)) 146 | 147 | random.seed(0) 148 | theta = [random.random() for xi in X[0]] 149 | 150 | gradient_fn = logistic_log_gradient_i 151 | target_fn = logistic_log_likelihood_i # target is to maximize likelihood value (approaching to zero) 152 | 153 | data = zip(X, y) 154 | 155 | alpha_0 = alpha # a step length 156 | max_theta, max_value = -Inf, -500000 157 | counter_of_no_improve = 0 # counter 158 | while counter_of_no_improve < iters: 159 | 160 | log_likelihood_value = sum((target_fn(x_i, y_i, theta) + ridge_penalty(lamda, theta)) for x_i, y_i in 161 | data) / n # According to theory of logistic likelihood; add ridge_penalty to prevent from overfitting. 
162 | print(log_likelihood_value, max_value, max_theta, theta) # print for processing verbosely 163 | if log_likelihood_value > max_value: # if value bigger, it was improved. 164 | print("Likelihood Improved.") 165 | if abs(log_likelihood_value - max_value) < eps: # once training finished, response the maximum theta. 166 | print("Target Minimum eps Achieved( < 1e-2 ): ", abs(log_likelihood_value - max_value)) 167 | max_theta, max_value = theta, log_likelihood_value 168 | return max_theta 169 | else: 170 | print("eps: ", abs(log_likelihood_value - max_value)) # if not smaller than eps, continue training. 171 | 172 | """if find a new maximum, renew the value, and initialize the alpha, which is the walking length.""" 173 | max_theta, max_value = theta, log_likelihood_value # save the newest theta as max_theta for return the output and further training 174 | counter_of_no_improve = 0 175 | alpha = alpha_0 176 | else: 177 | """if it was not improved, narrow the walking length and try to walk next step(shrink the step size).""" 178 | counter_of_no_improve += 1 179 | print("Not improved. iter of Narrow the Step Length: ", counter_of_no_improve) 180 | alpha *= 0.9 181 | 182 | 183 | for xi, yi in data: 184 | gradient_i = gradient_fn(xi, yi, theta) + ridge_penalty_gradient(lamda, theta) # calculate gradient 185 | 186 | theta = vector_add(theta, scalar_multiply_2(alpha, gradient_i)) # take a step 187 | 188 | # if training so many time and over the iterator number, finish training. 189 | theta = max_theta 190 | 191 | return theta 192 | 193 | 194 | def predict_prob(X, theta): # According to theory of logistic likelihood: probability 195 | return 1. / (1 + numpy.exp(-numpy.dot(X, theta))) 196 | 197 | 198 | def evaluate(y_test, y_prob): # Evaluation, in accordance with theory of statics. 199 | tpr = [] 200 | fpr = [] 201 | tp, fp, fn, tn = 0, 0, 0, 0 # true positive, false positive, false negative, true negative. 
202 |     for index, i in enumerate(y_test):
203 |         j = index
204 | 
205 |         # print("y_prob:", y_prob[j])
206 |         round_prob = round(y_prob[j])
207 |         if (i == 1 and round_prob == 1):
208 |             tp = tp + 1
209 |         elif (i == 0 and round_prob == 1):
210 |             fp = fp + 1
211 |         elif (i == 1 and round_prob == 0):
212 |             fn = fn + 1
213 |         elif (i == 0 and round_prob == 0):
214 |             tn = tn + 1
215 | 
216 |     # accuracy
217 |     correct = tp + tn
218 |     total = tp + fp + fn + tn
219 |     accuracy = correct / total
220 | 
221 |     # precision
222 |     precision = tp / (tp + fp)
223 | 
224 |     # recall
225 |     recall = tp / (tp + fn)
226 | 
227 |     # f1_score
228 |     p = precision
229 |     r = recall
230 | 
231 |     f1score = 2 * p * r / (p + r)
232 | 
233 |     print("Accuracy: {0}".format(accuracy))
234 |     print("Precision: {0}".format(precision))
235 |     print("Recall: {0}".format(recall))
236 |     print("F1 Score: {0}".format(f1score))
237 | 
238 | 
239 | 
240 | def plot_roc_curve(y_test, y_prob):
241 |     # compute tpr and fpr at different thresholds
242 |     tpr = []
243 |     fpr = []
244 |     plt.plot(fpr, tpr)
245 |     plt.xlabel("False Positive Rate")
246 |     plt.ylabel("True Positive Rate")
247 |     plt.title('ROC ')
248 |     plt.xlim(0, 1)
249 |     plt.ylim(0, 1)
250 |     plt.gca().set_aspect('equal', adjustable='box')
251 |     fpr, tpr, thh = sklearn.metrics.roc_curve(y_test, y_prob, 1)
252 |     plt.plot(fpr, tpr, color='green', marker='o', linestyle='solid')
253 |     plt.savefig("roc_curve.png")
254 |     plt.show()
255 | 
256 | def main(argv):
257 |     """data preprocessing"""
258 | 
259 |     """preprocessing x and y of training data"""
260 |     x_train2, t1, y_train, t2 = load_train_data(train_ratio=0.99)
261 |     """preprocessing x and y of testing data"""
262 |     t3, X_test, t4, y_test = load_test_data(train_ratio=0.01)
263 |     """scale X dataset"""
264 |     X_train_scale, X_test_scale = scale_features(x_train2, X_test, 0, 1)
265 | 
266 |     """training and get model"""
267 |     theta = logreg_sgd(X_train_scale, y_train)
268 | 
269 |     """result output"""
270 |     y_prob = predict_prob(X_train_scale, theta)
271 |     print("Logreg train accuracy: %f" % (sklearn.metrics.accuracy_score(y_train, y_prob > .5)))
272 |     y_prob = predict_prob(X_test_scale, theta)
273 |     print("Logreg test accuracy: %f" % (sklearn.metrics.accuracy_score(y_test, y_prob > .5)))
274 | 
275 |     evaluate(y_test.flatten(), y_prob.flatten())
276 |     plot_roc_curve(y_test.flatten(), y_prob.flatten())
277 | 
278 | if __name__ == "__main__":
279 |     main(sys.argv)
280 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Machine-learning-algorithms-for-detecting-network-attacks-with-UNSW-NB15-data-set
2 | With the rapid development of network technology, a wide variety of cyber-attacks now poses serious threats to many fields around the world, and a great deal of cyber-security research aims to build a safer network environment. The goal of this work is to build detection models that classify attack data. We use the UNSW-NB15 network data set, which combines normal traffic with modern low-level attacks, so that the experimental scenario stays close to the real world. Two classifiers, logistic regression and a decision tree, are trained for binary classification; the decision tree achieved the best result with 99.99% testing accuracy, compared to 78.15% for the logistic regression classifier.
For the multi-class task, a KNN model is used, and its average testing accuracy over the ten categories is around 23%.
3 | 
4 | The details of the UNSW-NB15 data set are published in the following papers:
5 | 
6 | Moustafa, Nour, and Jill Slay. "UNSW-NB15: a comprehensive data set for network intrusion detection systems (UNSW-NB15 network data set)." Military Communications and Information Systems Conference (MilCIS), 2015. IEEE, 2015.
7 | Moustafa, Nour, and Jill Slay. "The evaluation of Network Anomaly Detection Systems: Statistical analysis of the UNSW-NB15 data set and the comparison with the KDD99 data set." Information Security Journal: A Global Perspective (2016): 1-14.
8 | 
9 | Find the dataset here: https://www.unsw.adfa.edu.au/australian-centre-for-cyber-security/cybersecurity/ADFA-NB15-Datasets/
10 | 
--------------------------------------------------------------------------------
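The binary-classification comparison described in the README can be sanity-checked with scikit-learn's built-in estimators. The sketch below is illustrative only and is not the repository's training code: it assumes the same selected CSV files that `logreg_attack.py` reads (no header row, 37 feature columns followed by a 0/1 label), and the resulting accuracies will depend on that preprocessing.

```python
# Illustrative baseline only -- assumes the selected CSVs with 37 feature
# columns followed by a 0/1 label, as loaded in logreg_attack.py.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

cols = ['x%i' % i for i in range(37)] + ['y']
train = pd.read_csv('UNSW_NB15_training-set_selected.csv', header=None, names=cols)
test = pd.read_csv('UNSW_NB15_testing-set_selected.csv', header=None, names=cols)

X_train, y_train = train[cols[:-1]].values, train['y'].values
X_test, y_test = test[cols[:-1]].values, test['y'].values

# Min-max scaling fitted on the training set, mirroring scale_features().
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

for name, clf in [('logistic regression', LogisticRegression(max_iter=1000)),
                  ('decision tree', DecisionTreeClassifier(random_state=0))]:
    clf.fit(X_train, y_train)
    print(name, 'test accuracy:', accuracy_score(y_test, clf.predict(X_test)))
```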
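Likewise, the cosine-similarity KNN experiment in `knn.py` can be approximated with `sklearn.neighbors.KNeighborsClassifier` using `metric='cosine'`. This is an assumed outline rather than the project's code: it follows the column layout that `knn.py` expects (33 feature columns, a `logic` column, then the attack category) and sweeps the same k values of 4 to 13, so it will not reproduce the reported ~23% figure exactly.

```python
# Assumed outline of the multi-class KNN baseline; mirrors the idea in knn.py
# (cosine similarity, majority vote over the k nearest neighbours).
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

names = ['x%i' % i for i in range(33)] + ['logic', 'y']
train = pd.read_csv('UNSW_NB15_training-set_selected.csv', header=None, names=names)
test = pd.read_csv('UNSW_NB15_testing-set_selected.csv', header=None, names=names)

X_train, y_train = train[names[:33]].values, train['y'].values
X_test, y_test = test[names[:33]].values, test['y'].values

scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# metric='cosine' ranks neighbours by cosine distance, i.e. 1 - cosine similarity.
for k in range(4, 14):
    knn = KNeighborsClassifier(n_neighbors=k, metric='cosine')
    knn.fit(X_train, y_train)
    print('k=%d test accuracy: %.3f' % (k, accuracy_score(y_test, knn.predict(X_test))))
```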
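One detail worth noting in `logreg_attack.py`: `plot_roc_curve` draws the empty `fpr`/`tpr` lists before computing them and passes `pos_label` positionally, which newer scikit-learn releases treat as keyword-only. A minimal rewrite of that helper, offered as a sketch rather than a drop-in patch, could look like this.

```python
# Assumed, minimal rewrite of plot_roc_curve(): compute the curve first,
# then draw it; pos_label is passed by keyword for current scikit-learn.
import matplotlib.pyplot as plt
import sklearn.metrics

def plot_roc_curve(y_test, y_prob, path="roc_curve.png"):
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_test, y_prob, pos_label=1)
    plt.plot(fpr, tpr, color='green', marker='o', linestyle='solid')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC")
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.savefig(path)
    plt.show()
```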