├── Decision tree binary classifier
│   └── logreg_attack.py
├── KNN Classifier
│   └── knn.py
├── Logistic regression for binary classification
│   └── logreg_attack.py
└── README.md

/Decision tree binary classifier/logreg_attack.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Wed July 18 11:04:32 2018
5 | 
6 | @author: Ke-Hsin,Lo
7 | """
8 | 
9 | from __future__ import absolute_import
10 | from __future__ import division
11 | from __future__ import print_function
12 | # import necessary packages
13 | import math
14 | import sys
15 | import numpy
16 | from numpy import Inf
17 | import pandas as pd
18 | import sklearn.metrics
19 | import sklearn.model_selection
20 | import sklearn.linear_model
21 | import sklearn.preprocessing
22 | import random
23 | import matplotlib.pyplot as plt
24 | 
25 | def load_train_data(train_ratio=1):
26 |     data = pd.read_csv('./UNSW_NB15_training-set_selected.csv', header=None,
27 |                        names=['x%i' % (i) for i in range(37)] + ['y'])
28 |     Xt = numpy.asarray(data[['x%i' % (i) for i in range(37)]])
29 |     yt = numpy.asarray(data['y'])
30 |     return sklearn.model_selection.train_test_split(Xt, yt, test_size=1 - train_ratio, random_state=0)
31 | 
32 | 
33 | def load_test_data(train_ratio=0):
34 |     data = pd.read_csv('./UNSW_NB15_testing-set_selected.csv', header=None,
35 |                        names=['x%i' % (i) for i in range(37)] + ['y'])
36 |     Xtt = numpy.asarray(data[['x%i' % (i) for i in range(37)]])
37 |     ytt = numpy.asarray(data['y'])
38 |     return sklearn.model_selection.train_test_split(Xtt, ytt, test_size=1 - train_ratio, random_state=0)
39 | 
40 | 
41 | def scale_features(X_train, X_test, low=0, upp=1):
42 |     minmax_scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(low, upp)).fit(numpy.vstack((X_train)))  # scale each feature to the given range (0-1 by default); the scaler is fit on the training set and reused on the test set
43 | X_train_scale = minmax_scaler.transform(X_train) 44 | X_test_scale = minmax_scaler.transform(X_test) 45 | return X_train_scale, X_test_scale 46 | 47 | 48 | def logistic(x): 49 | return 1.0 / (1 + math.exp(-x)) 50 | 51 | 52 | def logistic_derivative(x): 53 | return logistic(x) * (1 - logistic(x)) 54 | 55 | 56 | def logistic_log_likelihood_i(x_i, y_i, theta): # 0/1 : logL= y * logf + (1-y) * log(1-f) 57 | if y_i == 1.0: 58 | return math.log(logistic(numpy.dot(x_i, theta))) 59 | else: 60 | return math.log(1 - logistic(numpy.dot(x_i, theta))) 61 | 62 | 63 | def logistic_log_likelihood(x, y, beta): 64 | return sum(logistic_log_likelihood_i(x_i, y_i, beta) 65 | for x_i, y_i in zip(x, y)) 66 | 67 | 68 | """i is the index of the data point; 69 | j the index of the derivative""" 70 | 71 | 72 | def logistic_log_partial_ij(x_i, yi, theta, j): #calculate gives the gradient 73 | 74 | return (yi - logistic(numpy.dot(x_i, theta))) * x_i[j] 75 | 76 | """the gradient of the log likelihood 77 | corresponding to the i-th data point""" 78 | 79 | 80 | def logistic_log_gradient_i(xi, yi, theta): #calcaulate its it partial derivative by treating it as a function of just its ith variable, holding the o ther variable fixed 81 | return [logistic_log_partial_ij(xi, yi, theta, j) 82 | for j, _ in enumerate(theta)] 83 | 84 | 85 | def logistic_log_gradient(x, y, beta): 86 | return reduce(vector_add, 87 | [logistic_log_gradient_i(x_i, y_i, beta) 88 | for x_i, y_i in zip(x, y)]) 89 | 90 | 91 | """adds two vectors""" 92 | 93 | 94 | def vector_add(v, w): 95 | return [v_i + w_i for v_i, w_i in zip(v, w)] 96 | 97 | 98 | """scalar number multiplies vector ver 2; same as ver 1""" 99 | 100 | 101 | def scalar_multiply_2(c, v): 102 | row = [] 103 | 104 | row = numpy.asarray(c) * v 105 | 106 | return row 107 | 108 | 109 | def error(xi, yi, theta): 110 | return yi - predict_prob(xi, theta) 111 | 112 | 113 | """evaluated error **2""" 114 | 115 | 116 | def squared_error(xi, yi, theta): 117 | return error(xi, yi, theta) ** 2 118 | 119 | 120 | """the gradient corresponding to the ith squared error term""" 121 | 122 | 123 | def squared_error_gradient(xi, yi, theta): 124 | return [-2 * x_ij * error(xi, yi, theta) 125 | for x_ij in xi] 126 | 127 | 128 | """ calculate ridge penalty""" 129 | 130 | 131 | def ridge_penalty(lamda, theta): 132 | return lamda * numpy.dot(theta[1:], theta[1:]) / 2 133 | 134 | 135 | """calculate ridge gradient simply""" 136 | 137 | 138 | def ridge_penalty_gradient(lamda, theta): 139 | return [0] + [lamda * thetai for thetai in theta[1:]] 140 | 141 | 142 | def logreg_sgd(X, y, alpha=.001, iters=100000, eps=1e-2, lamda=0.001): 143 | n, d = X.shape 144 | # print(n, d) 145 | theta = numpy.zeros((d, 1)) 146 | 147 | random.seed(0) 148 | theta = [random.random() for xi in X[0]] 149 | 150 | gradient_fn = logistic_log_gradient_i 151 | target_fn = logistic_log_likelihood_i # target is to maximize likelihood value (approaching to zero) 152 | 153 | data = zip(X, y) 154 | 155 | alpha_0 = alpha # a step length 156 | max_theta, max_value = -Inf, -500000 157 | counter_of_no_improve = 0 # counter 158 | while counter_of_no_improve < iters: 159 | 160 | log_likelihood_value = sum((target_fn(x_i, y_i, theta) + ridge_penalty(lamda, theta)) for x_i, y_i in 161 | data) / n # According to theory of logistic likelihood; add ridge_penalty to prevent from overfitting. 
162 | print(log_likelihood_value, max_value, max_theta, theta) # print for processing verbosely 163 | if log_likelihood_value > max_value: # if value bigger, it was improved. 164 | print("Likelihood Improved.") 165 | if abs(log_likelihood_value - max_value) < eps: # once training finished, response the maximum theta. 166 | print("Target Minimum eps Achieved( < 1e-2 ): ", abs(log_likelihood_value - max_value)) 167 | max_theta, max_value = theta, log_likelihood_value 168 | return max_theta 169 | else: 170 | print("eps: ", abs(log_likelihood_value - max_value)) # if not smaller than eps, continue training. 171 | 172 | """if find a new maximum, renew the value, and initialize the alpha, which is the walking length.""" 173 | max_theta, max_value = theta, log_likelihood_value # save the newest theta as max_theta for return the output and further training 174 | counter_of_no_improve = 0 175 | alpha = alpha_0 176 | else: 177 | """if it was not improved, narrow the walking length and try to walk next step(shrink the step size).""" 178 | counter_of_no_improve += 1 179 | print("Not improved. iter of Narrow the Step Length: ", counter_of_no_improve) 180 | alpha *= 0.9 181 | 182 | 183 | for xi, yi in data: 184 | gradient_i = gradient_fn(xi, yi, theta) + ridge_penalty_gradient(lamda, theta) # calculate gradient 185 | 186 | theta = vector_add(theta, scalar_multiply_2(alpha, gradient_i)) # take a step 187 | 188 | # if training so many time and over the iterator number, finish training. 189 | theta = max_theta 190 | 191 | return theta 192 | 193 | 194 | def predict_prob(X, theta): # According to theory of logistic likelihood: probability 195 | return 1. / (1 + numpy.exp(-numpy.dot(X, theta))) 196 | 197 | 198 | def evaluate(y_test, y_prob): # Evaluation, in accordance with theory of statics. 199 | tpr = [] 200 | fpr = [] 201 | tp, fp, fn, tn = 0, 0, 0, 0 # true positive, false positive, false negative, true negative. 
202 | for index, i in enumerate(y_test): 203 | j = index 204 | 205 | # print("y_prob:",y_prob[j]) 206 | round_prob=round(y_prob[j]) 207 | if (i == 1 and round_prob == 1): 208 | tp = tp + 1 209 | elif (i == 0 and round_prob == 1): 210 | fp = fp + 1 211 | elif (i == 1 and round_prob == 0): 212 | fn = fn + 1 213 | elif (i == 0 and round_prob == 0): 214 | tn = tn + 1 215 | 216 | # accuracy 217 | correct = tp + tn 218 | total = tp + fp + fn + tn 219 | accuracy = correct / total 220 | 221 | # precision 222 | precision = tp / (tp + fp) 223 | 224 | # recall 225 | recall = tp / (tp + fn) 226 | 227 | # f1_score 228 | p = precision 229 | r = recall 230 | 231 | f1score = 2 * p * r / (p + r) 232 | 233 | print("Accuracy: {0}".format(accuracy)) 234 | print("Precision: {0}".format(precision)) 235 | print("Recall: {0}".format(recall)) 236 | print("F1 Score: {0}".format(f1score)) 237 | 238 | 239 | 240 | def plot_roc_curve(y_test, y_prob): 241 | # compute tpr and fpr of different thresholds 242 | tpr = [] 243 | fpr = [] 244 | plt.plot(fpr, tpr) 245 | plt.xlabel("False Positive Rate") 246 | plt.ylabel("True Positive Rate") 247 | plt.title('ROC ') 248 | plt.xlim(0,1) 249 | plt.ylim(0,1) 250 | plt.gca().set_aspect('equal', adjustable='box') 251 | fpr, tpr, thh = sklearn.metrics.roc_curve(y_test, y_prob, 1) 252 | plt.plot(fpr, tpr, color='green', marker='o', linestyle='solid') 253 | plt.savefig("roc_curve.png") 254 | plt.show() 255 | 256 | def main(argv): 257 | """data preprocessing""" 258 | 259 | """preprocessing x and y of training data""" 260 | x_train2, t1, y_train, t2 = load_train_data(train_ratio=0.99) 261 | """preprocessing x and y of testing data""" 262 | t3, X_test, t4, y_test = load_test_data(train_ratio=0.01) 263 | """scale X dataset""" 264 | X_train_scale, X_test_scale = scale_features(x_train2, X_test, 0, 1) 265 | 266 | """training and get model""" 267 | theta = logreg_sgd(X_train_scale, y_train) 268 | 269 | """result output""" 270 | y_prob = predict_prob(X_train_scale, theta) 271 | print("Logreg train accuracy: %f" % (sklearn.metrics.accuracy_score(y_train, y_prob > .5))) 272 | y_prob = predict_prob(X_test_scale, theta) 273 | print("Logreg test accuracy: %f" % (sklearn.metrics.accuracy_score(y_test, y_prob > .5))) 274 | 275 | evaluate(y_test.flatten(), y_prob.flatten()) 276 | plot_roc_curve(y_test.flatten(), y_prob.flatten()) 277 | 278 | if __name__ == "__main__": 279 | main(sys.argv) 280 | -------------------------------------------------------------------------------- /KNN Classifier/knn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Aug 01 18:13:26 2018 5 | 6 | @author: Ke-Hsin, Lo 7 | """ 8 | 9 | import unicodecsv 10 | import random 11 | import operator 12 | import math 13 | import numpy 14 | 15 | import pandas as pd 16 | import sklearn.metrics 17 | import sklearn.model_selection 18 | import sklearn.linear_model 19 | import sklearn.preprocessing 20 | import matplotlib.pyplot as plt 21 | 22 | # getdata() function definition 23 | def getdata(filename): 24 | with open(filename, 'rb') as f: 25 | reader = unicodecsv.reader(f) 26 | return list(reader) 27 | 28 | 29 | 30 | 31 | 32 | def cosine_similarity(v1, v2): 33 | 34 | sum_xx, sum_xy, sum_yy = 0.0, 0.0, 0.0 35 | # print "len: %d" %(len(v1)) 36 | for i in range(0, len(v1)-1): 37 | # print (v1[i]) 38 | sum_xx += math.pow(float(v1[i]), 2) 39 | sum_xy += float(v1[i]) * float(v2[i]) 40 | sum_yy += math.pow(float(v2[i]), 2) 41 | 42 | 
    return sum_xy / math.sqrt(sum_xx * sum_yy)
43 | 
44 | def cosine_distance(v1, v2):
45 |     return 1 - cosine_similarity(v1, v2)  # cosine distance is the complement of cosine similarity
46 | 
47 | # KNN prediction and model training
48 | def knn_predict(test_data, train_data, k_value, category):
49 |     totalcount = 0
50 |     for i in test_data:  # classify each test row in turn
51 |         cos_similarity_list = []  # (label, similarity) pairs for every training row
52 | 
53 |         classNum = dict()  # vote counter for each neighbour class
54 |         classNum['Normal'] = 0
55 |         classNum['Reconnaissance'] = 0
56 |         classNum['Exploits'] = 0
57 |         classNum['Fuzzers'] = 0
58 |         classNum['DoS'] = 0
59 |         classNum['Generic'] = 0
60 |         classNum['Shellcode'] = 0
61 |         classNum['Analysis'] = 0
62 |         classNum['Worms'] = 0
63 |         classNum['Backdoors'] = 0
64 | 
65 |         jcount = 0
66 | 
67 |         for j in train_data:  # scan the training data to find the nearest points
68 |             # print "i: %s" %(i)
69 |             cos_sm = cosine_similarity(i, j)  # similarity between one test row and one training row
70 |             cos_similarity_list.append((category[jcount], cos_sm))  # keep the class label together with its similarity
71 |             # print cos_similarity_list  # for debugging and observation only
72 |             print "count: %s" %(jcount)
73 |             cos_similarity_list.sort(key=operator.itemgetter(1), reverse=True)  # order by cosine similarity, highest first
74 |             ''' the similarity list is ordered, so the first k entries are the k nearest neighbours '''
75 |             jcount += 1
76 |         totalcount += 1
77 |         print "Processing: %s" % (totalcount)
78 | 
79 |         knn = cos_similarity_list[:k_value]  # select the first k neighbours
80 | 
81 |         print knn
82 |         for k in knn:  # k[0] is the class label of the neighbour
83 |             if k[0] == 'Normal':
84 |                 classNum['Normal'] += 1
85 |             elif k[0] == 'Reconnaissance':
86 |                 classNum['Reconnaissance'] += 1
87 |             elif k[0] == 'Exploits':
88 |                 classNum['Exploits'] += 1
89 |             elif k[0] == 'Fuzzers':
90 |                 classNum['Fuzzers'] += 1
91 |             elif k[0] == 'DoS':
92 |                 classNum['DoS'] += 1
93 |             elif k[0] == 'Generic':
94 |                 classNum['Generic'] += 1
95 |             elif k[0] == 'Shellcode':
96 |                 classNum['Shellcode'] += 1
97 |             elif k[0] == 'Analysis':
98 |                 classNum['Analysis'] += 1
99 |             elif k[0] == 'Worms':
100 |                 classNum['Worms'] += 1
101 |             elif k[0] == 'Backdoors':
102 |                 classNum['Backdoors'] += 1
103 | 
104 | 
105 |         # print "result: %d %d %d %d %d" %(classNum['Normal'], classNum['Reconnaissance'], classNum['Exploits'], classNum['Fuzzers'], classNum['DoS'])
106 |         max_value = max(classNum, key=classNum.get)  # the class with the most votes
107 |         print "max_value %s" %(max_value)
108 | 
109 |         # recover_key(classNum, max_value)
110 | 
111 |         # max_index = recover_key(classNum, max_value)
112 |         # print "max_index %s" %(max_index)
113 |         i.append(max_value)  # append the predicted class to the row
114 | 
115 | def recover_key(dictionary, value):
116 |     for a_key in dictionary.keys():
117 |         if (dictionary[a_key] == value):
118 |             return a_key
119 | 
120 | # Accuracy calculation function
121 | def accuracy(test_data, true_result):
122 |     correct = 0
123 |     jcount = 0  # index into true_result, advanced once per test row
124 |     for i in test_data:
125 |         # print len(i)
126 |         # print i[len(i)-1]
127 |         if true_result[jcount] == i[len(i)-1]:  # the prediction was appended as the last element of the row
128 |             correct += 1
129 |         jcount += 1
130 | 
131 | 
132 |     accuracy = float(correct) / len(test_data) * 100  # accuracy as a percentage
133 |     return accuracy
134 | 
135 | 
136 | def KNN(K, train_x, train_y, test_x, test_y):
137 |     # dataset = getdata('UNSW_NB15_training-set_selected.csv')  # getdata function call with a csv file as parameter
138 |     # print len(dataset)
139 |     # train_dataset, test_dataset = shuffle(dataset)  # train/test split
140 |     # K = 3  # assumed K value
141 | 
142 |     train_dataset = train_x.tolist()
143 |     print "Number of training X: %s" %len(train_dataset)
144 |     print
"Number of training Y: %s" %len(train_y) 145 | test_dataset = test_x.tolist() 146 | print "Number of testing X: %s" %len(test_dataset) 147 | 148 | print "Training Set KNN Process:" 149 | knn_predict(train_dataset, train_dataset, K, train_y) 150 | print "Testing Set KNN Process:" 151 | knn_predict(test_dataset, train_dataset, K, train_y) 152 | atrain = round(accuracy(train_dataset, train_y),5) 153 | TrainError = float(100.00000- float(atrain)) 154 | atest = round(accuracy(test_dataset, test_y),5) 155 | TestError = 100.00000- atest 156 | # print test_dataset 157 | print "Accuracy of train_dataset : ", atrain 158 | print "Train error : ", TrainError 159 | print "Accuracy of test_dataset: ", atest 160 | print "Test error: ", TestError 161 | return TrainError, TestError, atrain, atest 162 | 163 | def load_train_data(train_ratio=0.12): 164 | data = pd.read_csv('./UNSW_NB15_training-set_selected.csv', header=None, 165 | names=['x%i' % (i) for i in range(33)] + ['logic']+['y']) 166 | Xt = numpy.asarray(data[['x%i' % (i) for i in range(33)]]) 167 | yt = numpy.asarray(data['y']) 168 | 169 | return sklearn.model_selection.train_test_split(Xt, yt, test_size=1 - train_ratio, random_state=0) 170 | 171 | 172 | def load_test_data(train_ratio=0.88): 173 | data = pd.read_csv('./UNSW_NB15_testing-set_selected.csv', header=None, 174 | names=['x%i' % (i) for i in range(33)] + ['logic']+['y']) 175 | Xtt = numpy.asarray(data[['x%i' % (i) for i in range(33)]]) 176 | ytt = numpy.asarray(data['y']) 177 | return sklearn.model_selection.train_test_split(Xtt, ytt, test_size=1 - train_ratio, random_state=0) 178 | 179 | def scale_features(X_train, X_test, low=0, upp=1): 180 | minmax_scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(low, upp)).fit(numpy.vstack((X_train 181 | ))) # Transforms features by scaling each feature to a given range(0~1) in order to reinforce dataset and fit training set. 182 | X_train_scale = minmax_scaler.transform(X_train) 183 | X_test_scale = minmax_scaler.transform(X_test) 184 | return X_train_scale, X_test_scale 185 | 186 | """preprocessing x and y of training data""" 187 | x_train2, t1, y_train, t2 = load_train_data(train_ratio=0.003) #1 188 | """preprocessing x and y of testing data""" 189 | t3, X_test, t4, y_test = load_test_data(train_ratio=(1-0.003)) #2 190 | 191 | """scale X dataset""" 192 | X_train_scale, X_test_scale = scale_features(x_train2, X_test, 0, 1) 193 | TrainError = [] 194 | TestError = [] 195 | TrainAccuracy = [] 196 | TestAccuracy = [] 197 | plt.figure(2) 198 | ax1 = plt.subplot(211) 199 | ax2 = plt.subplot(212) 200 | plt.figure(3) 201 | bx1 = plt.subplot(311) 202 | bx2 = plt.subplot(312) 203 | x = [] 204 | 205 | '''knn start: for small sample, start from 1; this from 9 is for this big sample set. 
Because there are same similarity in diffrent kind.''' 206 | for k in range(13,3,-1): #3 207 | print "K: %d" %(k) 208 | TrainErrorTemp, TestErrorTemp, AoTrain, AoTest = KNN(k, x_train2, y_train, X_test, y_test) 209 | TrainError.append(TrainErrorTemp) 210 | TestError.append(TestErrorTemp) 211 | TrainAccuracy.append(AoTrain) 212 | TestAccuracy.append( AoTest) 213 | print " " 214 | x.append(k) 215 | 216 | 217 | plt.sca(ax1) 218 | plt.plot(x, TrainError) 219 | 220 | 221 | plt.sca(ax2) 222 | plt.plot(x, TestError) 223 | 224 | 225 | plt.sca(bx1) 226 | plt.plot(x, TrainAccuracy) 227 | 228 | 229 | plt.sca(bx2) 230 | plt.plot(x, TestAccuracy) 231 | 232 | 233 | plt.xlabel('x axis') # make axis labels 234 | plt.ylabel('y axis') 235 | plt.show() 236 | 237 | -------------------------------------------------------------------------------- /Logistic regression for binary classification/logreg_attack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed July 18 11:04:32 2018 5 | 6 | @author: Ke-Hsin,Lo 7 | """ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | # import nesssary package 13 | import math 14 | import sys 15 | import numpy 16 | from numpy import Inf 17 | import pandas as pd 18 | import sklearn.metrics 19 | import sklearn.model_selection 20 | import sklearn.linear_model 21 | import sklearn.preprocessing 22 | import random 23 | import matplotlib.pyplot as plt 24 | 25 | def load_train_data(train_ratio=1): 26 | data = pd.read_csv('./UNSW_NB15_training-set_selected.csv', header=None, 27 | names=['x%i' % (i) for i in range(37)] + ['y']) 28 | Xt = numpy.asarray(data[['x%i' % (i) for i in range(37)]]) 29 | yt = numpy.asarray(data['y']) 30 | return sklearn.model_selection.train_test_split(Xt, yt, test_size=1 - train_ratio, random_state=0) 31 | 32 | 33 | def load_test_data(train_ratio=0): 34 | data = pd.read_csv('./UNSW_NB15_testing-set_selected.csv', header=None, 35 | names=['x%i' % (i) for i in range(37)] + ['y']) 36 | Xtt = numpy.asarray(data[['x%i' % (i) for i in range(37)]]) 37 | ytt = numpy.asarray(data['y']) 38 | return sklearn.model_selection.train_test_split(Xtt, ytt, test_size=1 - train_ratio, random_state=0) 39 | 40 | 41 | def scale_features(X_train, X_test, low=0, upp=1): 42 | minmax_scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(low, upp)).fit(numpy.vstack((X_train))) # Transforms features by scaling each feature to a given range(0~1) in order to reinforce dataset and fit training set. 
43 | X_train_scale = minmax_scaler.transform(X_train) 44 | X_test_scale = minmax_scaler.transform(X_test) 45 | return X_train_scale, X_test_scale 46 | 47 | 48 | def logistic(x): 49 | return 1.0 / (1 + math.exp(-x)) 50 | 51 | 52 | def logistic_derivative(x): 53 | return logistic(x) * (1 - logistic(x)) 54 | 55 | 56 | def logistic_log_likelihood_i(x_i, y_i, theta): # 0/1 : logL= y * logf + (1-y) * log(1-f) 57 | if y_i == 1.0: 58 | return math.log(logistic(numpy.dot(x_i, theta))) 59 | else: 60 | return math.log(1 - logistic(numpy.dot(x_i, theta))) 61 | 62 | 63 | def logistic_log_likelihood(x, y, beta): 64 | return sum(logistic_log_likelihood_i(x_i, y_i, beta) 65 | for x_i, y_i in zip(x, y)) 66 | 67 | 68 | """i is the index of the data point; 69 | j the index of the derivative""" 70 | 71 | 72 | def logistic_log_partial_ij(x_i, yi, theta, j): #calculate gives the gradient 73 | 74 | return (yi - logistic(numpy.dot(x_i, theta))) * x_i[j] 75 | 76 | """the gradient of the log likelihood 77 | corresponding to the i-th data point""" 78 | 79 | 80 | def logistic_log_gradient_i(xi, yi, theta): #calcaulate its it partial derivative by treating it as a function of just its ith variable, holding the o ther variable fixed 81 | return [logistic_log_partial_ij(xi, yi, theta, j) 82 | for j, _ in enumerate(theta)] 83 | 84 | 85 | def logistic_log_gradient(x, y, beta): 86 | return reduce(vector_add, 87 | [logistic_log_gradient_i(x_i, y_i, beta) 88 | for x_i, y_i in zip(x, y)]) 89 | 90 | 91 | """adds two vectors""" 92 | 93 | 94 | def vector_add(v, w): 95 | return [v_i + w_i for v_i, w_i in zip(v, w)] 96 | 97 | 98 | """scalar number multiplies vector ver 2; same as ver 1""" 99 | 100 | 101 | def scalar_multiply_2(c, v): 102 | row = [] 103 | 104 | row = numpy.asarray(c) * v 105 | 106 | return row 107 | 108 | 109 | def error(xi, yi, theta): 110 | return yi - predict_prob(xi, theta) 111 | 112 | 113 | """evaluated error **2""" 114 | 115 | 116 | def squared_error(xi, yi, theta): 117 | return error(xi, yi, theta) ** 2 118 | 119 | 120 | """the gradient corresponding to the ith squared error term""" 121 | 122 | 123 | def squared_error_gradient(xi, yi, theta): 124 | return [-2 * x_ij * error(xi, yi, theta) 125 | for x_ij in xi] 126 | 127 | 128 | """ calculate ridge penalty""" 129 | 130 | 131 | def ridge_penalty(lamda, theta): 132 | return lamda * numpy.dot(theta[1:], theta[1:]) / 2 133 | 134 | 135 | """calculate ridge gradient simply""" 136 | 137 | 138 | def ridge_penalty_gradient(lamda, theta): 139 | return [0] + [lamda * thetai for thetai in theta[1:]] 140 | 141 | 142 | def logreg_sgd(X, y, alpha=.001, iters=100000, eps=1e-2, lamda=0.001): 143 | n, d = X.shape 144 | # print(n, d) 145 | theta = numpy.zeros((d, 1)) 146 | 147 | random.seed(0) 148 | theta = [random.random() for xi in X[0]] 149 | 150 | gradient_fn = logistic_log_gradient_i 151 | target_fn = logistic_log_likelihood_i # target is to maximize likelihood value (approaching to zero) 152 | 153 | data = zip(X, y) 154 | 155 | alpha_0 = alpha # a step length 156 | max_theta, max_value = -Inf, -500000 157 | counter_of_no_improve = 0 # counter 158 | while counter_of_no_improve < iters: 159 | 160 | log_likelihood_value = sum((target_fn(x_i, y_i, theta) + ridge_penalty(lamda, theta)) for x_i, y_i in 161 | data) / n # According to theory of logistic likelihood; add ridge_penalty to prevent from overfitting. 
162 | print(log_likelihood_value, max_value, max_theta, theta) # print for processing verbosely 163 | if log_likelihood_value > max_value: # if value bigger, it was improved. 164 | print("Likelihood Improved.") 165 | if abs(log_likelihood_value - max_value) < eps: # once training finished, response the maximum theta. 166 | print("Target Minimum eps Achieved( < 1e-2 ): ", abs(log_likelihood_value - max_value)) 167 | max_theta, max_value = theta, log_likelihood_value 168 | return max_theta 169 | else: 170 | print("eps: ", abs(log_likelihood_value - max_value)) # if not smaller than eps, continue training. 171 | 172 | """if find a new maximum, renew the value, and initialize the alpha, which is the walking length.""" 173 | max_theta, max_value = theta, log_likelihood_value # save the newest theta as max_theta for return the output and further training 174 | counter_of_no_improve = 0 175 | alpha = alpha_0 176 | else: 177 | """if it was not improved, narrow the walking length and try to walk next step(shrink the step size).""" 178 | counter_of_no_improve += 1 179 | print("Not improved. iter of Narrow the Step Length: ", counter_of_no_improve) 180 | alpha *= 0.9 181 | 182 | 183 | for xi, yi in data: 184 | gradient_i = gradient_fn(xi, yi, theta) + ridge_penalty_gradient(lamda, theta) # calculate gradient 185 | 186 | theta = vector_add(theta, scalar_multiply_2(alpha, gradient_i)) # take a step 187 | 188 | # if training so many time and over the iterator number, finish training. 189 | theta = max_theta 190 | 191 | return theta 192 | 193 | 194 | def predict_prob(X, theta): # According to theory of logistic likelihood: probability 195 | return 1. / (1 + numpy.exp(-numpy.dot(X, theta))) 196 | 197 | 198 | def evaluate(y_test, y_prob): # Evaluation, in accordance with theory of statics. 199 | tpr = [] 200 | fpr = [] 201 | tp, fp, fn, tn = 0, 0, 0, 0 # true positive, false positive, false negative, true negative. 
202 |     for index, i in enumerate(y_test):
203 |         j = index
204 | 
205 |         # print("y_prob:", y_prob[j])
206 |         round_prob = round(y_prob[j])
207 |         if (i == 1 and round_prob == 1):
208 |             tp = tp + 1
209 |         elif (i == 0 and round_prob == 1):
210 |             fp = fp + 1
211 |         elif (i == 1 and round_prob == 0):
212 |             fn = fn + 1
213 |         elif (i == 0 and round_prob == 0):
214 |             tn = tn + 1
215 | 
216 |     # accuracy
217 |     correct = tp + tn
218 |     total = tp + fp + fn + tn
219 |     accuracy = correct / total
220 | 
221 |     # precision
222 |     precision = tp / (tp + fp)
223 | 
224 |     # recall
225 |     recall = tp / (tp + fn)
226 | 
227 |     # f1_score
228 |     p = precision
229 |     r = recall
230 | 
231 |     f1score = 2 * p * r / (p + r)
232 | 
233 |     print("Accuracy: {0}".format(accuracy))
234 |     print("Precision: {0}".format(precision))
235 |     print("Recall: {0}".format(recall))
236 |     print("F1 Score: {0}".format(f1score))
237 | 
238 | 
239 | 
240 | def plot_roc_curve(y_test, y_prob):
241 |     # compute tpr and fpr at different thresholds
242 |     tpr = []
243 |     fpr = []
244 |     plt.plot(fpr, tpr)
245 |     plt.xlabel("False Positive Rate")
246 |     plt.ylabel("True Positive Rate")
247 |     plt.title('ROC ')
248 |     plt.xlim(0, 1)
249 |     plt.ylim(0, 1)
250 |     plt.gca().set_aspect('equal', adjustable='box')
251 |     fpr, tpr, thh = sklearn.metrics.roc_curve(y_test, y_prob, 1)
252 |     plt.plot(fpr, tpr, color='green', marker='o', linestyle='solid')
253 |     plt.savefig("roc_curve.png")
254 |     plt.show()
255 | 
256 | def main(argv):
257 |     """data preprocessing"""
258 | 
259 |     """preprocessing x and y of training data"""
260 |     x_train2, t1, y_train, t2 = load_train_data(train_ratio=0.99)
261 |     """preprocessing x and y of testing data"""
262 |     t3, X_test, t4, y_test = load_test_data(train_ratio=0.01)
263 |     """scale X dataset"""
264 |     X_train_scale, X_test_scale = scale_features(x_train2, X_test, 0, 1)
265 | 
266 |     """training and get model"""
267 |     theta = logreg_sgd(X_train_scale, y_train)
268 | 
269 |     """result output"""
270 |     y_prob = predict_prob(X_train_scale, theta)
271 |     print("Logreg train accuracy: %f" % (sklearn.metrics.accuracy_score(y_train, y_prob > .5)))
272 |     y_prob = predict_prob(X_test_scale, theta)
273 |     print("Logreg test accuracy: %f" % (sklearn.metrics.accuracy_score(y_test, y_prob > .5)))
274 | 
275 |     evaluate(y_test.flatten(), y_prob.flatten())
276 |     plot_roc_curve(y_test.flatten(), y_prob.flatten())
277 | 
278 | if __name__ == "__main__":
279 |     main(sys.argv)
280 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Machine-learning-algorithms-for-detecting-network-attacks-with-UNSW-NB15-data-set
2 | With the rapid development of network technology, a wide variety of cyber-attacks now poses serious threats to many fields around the world, and a great deal of cyber-security research aims to build a safer network environment. The goal of this work is to build detection models that classify attack data. We use the UNSW-NB15 network data set, which combines normal traffic with modern low-level attacks, so that the experimental scenario stays close to the real world. Two classifiers, logistic regression and a decision tree, are trained for binary classification; the decision tree achieved the best result with 99.99% testing accuracy, compared to 78.15% for the logistic regression classifier.
For the multi-class task, a KNN model is used, and its average testing accuracy over the ten categories is around 23%.
3 | 
4 | The details of the UNSW-NB15 data set are published in the following papers:
5 | 
6 | Moustafa, Nour, and Jill Slay. "UNSW-NB15: a comprehensive data set for network intrusion detection systems (UNSW-NB15 network data set)." Military Communications and Information Systems Conference (MilCIS), 2015. IEEE, 2015.
7 | Moustafa, Nour, and Jill Slay. "The evaluation of Network Anomaly Detection Systems: Statistical analysis of the UNSW-NB15 data set and the comparison with the KDD99 data set." Information Security Journal: A Global Perspective (2016): 1-14.
8 | 
9 | Find the dataset here: https://www.unsw.adfa.edu.au/australian-centre-for-cyber-security/cybersecurity/ADFA-NB15-Datasets/
10 | 
--------------------------------------------------------------------------------
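The binary-classification comparison described in the README can be sanity-checked with scikit-learn's built-in estimators. The sketch below is illustrative only and is not the repository's training code: it assumes the same selected CSV files that `logreg_attack.py` reads (no header row, 37 feature columns followed by a 0/1 label), and the resulting accuracies will depend on that preprocessing.

```python
# Illustrative baseline only -- assumes the selected CSVs with 37 feature
# columns followed by a 0/1 label, as loaded in logreg_attack.py.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

cols = ['x%i' % i for i in range(37)] + ['y']
train = pd.read_csv('UNSW_NB15_training-set_selected.csv', header=None, names=cols)
test = pd.read_csv('UNSW_NB15_testing-set_selected.csv', header=None, names=cols)

X_train, y_train = train[cols[:-1]].values, train['y'].values
X_test, y_test = test[cols[:-1]].values, test['y'].values

# Min-max scaling fitted on the training set, mirroring scale_features().
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

for name, clf in [('logistic regression', LogisticRegression(max_iter=1000)),
                  ('decision tree', DecisionTreeClassifier(random_state=0))]:
    clf.fit(X_train, y_train)
    print(name, 'test accuracy:', accuracy_score(y_test, clf.predict(X_test)))
```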
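Likewise, the cosine-similarity KNN experiment in `knn.py` can be approximated with `sklearn.neighbors.KNeighborsClassifier` using `metric='cosine'`. This is an assumed outline rather than the project's code: it follows the column layout that `knn.py` expects (33 feature columns, a `logic` column, then the attack category) and sweeps the same k values of 4 to 13, so it will not reproduce the reported ~23% figure exactly.

```python
# Assumed outline of the multi-class KNN baseline; mirrors the idea in knn.py
# (cosine similarity, majority vote over the k nearest neighbours).
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

names = ['x%i' % i for i in range(33)] + ['logic', 'y']
train = pd.read_csv('UNSW_NB15_training-set_selected.csv', header=None, names=names)
test = pd.read_csv('UNSW_NB15_testing-set_selected.csv', header=None, names=names)

X_train, y_train = train[names[:33]].values, train['y'].values
X_test, y_test = test[names[:33]].values, test['y'].values

scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# metric='cosine' ranks neighbours by cosine distance, i.e. 1 - cosine similarity.
for k in range(4, 14):
    knn = KNeighborsClassifier(n_neighbors=k, metric='cosine')
    knn.fit(X_train, y_train)
    print('k=%d test accuracy: %.3f' % (k, accuracy_score(y_test, knn.predict(X_test))))
```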
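One detail worth noting in `logreg_attack.py`: `plot_roc_curve` draws the empty `fpr`/`tpr` lists before computing them and passes `pos_label` positionally, which newer scikit-learn releases treat as keyword-only. A minimal rewrite of that helper, offered as a sketch rather than a drop-in patch, could look like this.

```python
# Assumed, minimal rewrite of plot_roc_curve(): compute the curve first,
# then draw it; pos_label is passed by keyword for current scikit-learn.
import matplotlib.pyplot as plt
import sklearn.metrics

def plot_roc_curve(y_test, y_prob, path="roc_curve.png"):
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_test, y_prob, pos_label=1)
    plt.plot(fpr, tpr, color='green', marker='o', linestyle='solid')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC")
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.savefig(path)
    plt.show()
```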