├── .gitignore
├── Dataset
│   ├── .gitattributes
│   └── How to get the data.txt
├── DNN
│   ├── cm.png
│   ├── model_1.h5
│   ├── dnn_model.sav
│   ├── dnn_model1.sav
│   ├── dnn.py
│   └── predict.py
├── KNN
│   ├── cm.png
│   ├── knn_3_model.sav
│   ├── knn.py
│   └── predict.py
├── DecisionTree
│   ├── cm.png
│   ├── 7_cm_dt.png
│   ├── decisiontree_model.sav
│   ├── decisiontree.py
│   └── predict.py
├── RandomForest
│   ├── cm.png
│   ├── randomforest_model.sav
│   ├── randomforest.py
│   └── predict.py
├── dictionary.py
├── README.md
└── test.txt

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
network_traffic.csv

--------------------------------------------------------------------------------
/Dataset/.gitattributes:
--------------------------------------------------------------------------------
*.csv filter=lfs diff=lfs merge=lfs -text

--------------------------------------------------------------------------------
/DNN/cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DNN/cm.png

--------------------------------------------------------------------------------
/KNN/cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/KNN/cm.png

--------------------------------------------------------------------------------
/DNN/model_1.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DNN/model_1.h5

--------------------------------------------------------------------------------
/DNN/dnn_model.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DNN/dnn_model.sav

--------------------------------------------------------------------------------
/DNN/dnn_model1.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DNN/dnn_model1.sav

--------------------------------------------------------------------------------
/DecisionTree/cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DecisionTree/cm.png

--------------------------------------------------------------------------------
/KNN/knn_3_model.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/KNN/knn_3_model.sav

--------------------------------------------------------------------------------
/RandomForest/cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/RandomForest/cm.png

--------------------------------------------------------------------------------
/DecisionTree/7_cm_dt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DecisionTree/7_cm_dt.png

--------------------------------------------------------------------------------
/DecisionTree/decisiontree_model.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DecisionTree/decisiontree_model.sav

--------------------------------------------------------------------------------
/RandomForest/randomforest_model.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/RandomForest/randomforest_model.sav

--------------------------------------------------------------------------------
/dictionary.py:
--------------------------------------------------------------------------------
def name_convert(x):
    # Map an nDPI application name to one of the traffic classes listed in
    # test.txt; unknown application names fall back to the "default" class.
    # The raw string keeps the backslash in the Windows-style relative path
    # literal, and rsplit keeps application names that contain spaces intact.
    dict_c = {}
    with open(r'..\test.txt', 'r') as file:
        for i in file.readlines():
            i = i.strip()
            if not i:
                continue
            name, cl = i.rsplit(" ", 1)
            dict_c[name] = cl
    try:
        return dict_c[x]
    except KeyError:
        return "default"
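A quick way to sanity-check `name_convert` is to look it up against a few entries straight from test.txt. The snippet below is only an illustrative sketch: it assumes the script runs from one of the model folders (so the relative path inside `name_convert` resolves) and that dictionary.py is importable, e.g. because the repo root is on sys.path.

```python
from dictionary import name_convert

# Application names taken from test.txt; expected classes shown inline.
print(name_convert("DNS"))         # critical
print(name_convert("BitTorrent"))  # p2p
print(name_convert("NoSuchApp"))   # default (unknown names fall back)
```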
--------------------------------------------------------------------------------
/Dataset/How to get the data.txt:
--------------------------------------------------------------------------------
Since .pcap files are large, we processed the data into CSV format, and even then the file is about 150 MB. Please email me at:
pritom007@live.com
or
pritom@sjtu.edu.com, and include a short note on why you need the data.

This data is only available for research purposes, not for commercial use.

Please cite my work if you use this dataset for research.

@article{mondal2021dynamic,
  title={A dynamic network traffic classifier using supervised ML for a Docker-based SDN network},
  author={Mondal, Pritom Kumar and Aguirre Sanchez, Lizeth P and Benedetto, Emmanuele and Shen, Yao and Guo, Minyi},
  journal={Connection Science},
  pages={1--26},
  year={2021},
  publisher={Taylor \& Francis}
}

Read more: https://doi.org/10.1080/09540091.2020.1870437

--------------------------------------------------------------------------------
/DecisionTree/decisiontree.py:
--------------------------------------------------------------------------------
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.tree import DecisionTreeClassifier
import pickle

file_dir = "..\\Data\\"

names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
         'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto', 'class']
print("Loading Dataset total_class.csv")
df = pd.read_csv(file_dir + 'total_class.csv', names=names)

# The CSV's header row is read as data, so [1:] skips it everywhere below.
X = np.asarray(
    df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
        1:])
y = []

# Encode each class name as the index of its first appearance, and persist the
# name-to-index mapping to decisiontree.txt so predict.py can decode predictions.
my_tags = []
classes = open("decisiontree.txt", "w+")
for i in df['class'][1:]:
    if i not in my_tags:
        my_tags.append(i)
for i in df['class'][1:]:
    classes.write(i + " " + str(my_tags.index(i)) + "\n")
    y.append(my_tags.index(i))
classes.close()
y = np.asarray(y)
print("Splitting dataset")
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)

clf = DecisionTreeClassifier(random_state=0)

print("Training Started")
clf.fit(x_train, y_train)

print("Testing the classifier")
y_pred = clf.predict(x_test)

print("Saving the model")
filename = 'decisiontree_model.sav'
pickle.dump(clf, open(filename, 'wb'))

print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=my_tags, labels=range(len(my_tags))))

--------------------------------------------------------------------------------
/RandomForest/randomforest.py:
--------------------------------------------------------------------------------
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pickle

file_dir = "D:\\SDN Project\\Data\\"

names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
         'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto', 'class']
print("Loading Dataset total_class.csv")
df = pd.read_csv(file_dir + 'total_class.csv', names=names)

X = np.asarray(
    df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
        1:])
y = []

# Same first-appearance label encoding as in decisiontree.py, persisted to
# randomforest.txt for use by predict.py.
my_tags = []
classes = open("randomforest.txt", "w+")
for i in df['class'][1:]:
    if i not in my_tags:
        my_tags.append(i)
for i in df['class'][1:]:
    classes.write(i + " " + str(my_tags.index(i)) + "\n")
    y.append(my_tags.index(i))
classes.close()
y = np.asarray(y)
print("Splitting dataset")
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.30, shuffle=True)

clf = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=1)
print("Training Started")
clf.fit(x_train, y_train)

print("Testing the classifier")
y_pred = clf.predict(x_test)

print("Saving the model")
filename = 'randomforest_model.sav'
pickle.dump(clf, open(filename, 'wb'))

print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=my_tags, labels=range(len(my_tags))))
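The first-appearance encoding loop above is duplicated across decisiontree.py, randomforest.py, knn.py, and dnn.py. As a design note, scikit-learn's LabelEncoder does the same job in two lines; the sketch below is only an illustration that assumes `df` is loaded as above. One caveat: LabelEncoder orders classes alphabetically rather than by first appearance, so the saved mapping file would have to be regenerated to match.

```python
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df['class'][1:])  # integer labels, one per flow
my_tags = list(le.classes_)            # index -> class name (alphabetical)
```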
--------------------------------------------------------------------------------
/KNN/knn.py:
--------------------------------------------------------------------------------
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import pickle

file_dir = "D:\\SDN Project\\Data\\"

names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
         'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto', 'class']
print("Loading Dataset total_class.csv")
df = pd.read_csv(file_dir + 'total_class.csv', names=names)

X = np.asarray(
    df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
        1:])
y = []

# Same first-appearance label encoding as in the other training scripts,
# persisted to knn.txt for use by predict.py.
my_tags = []
classes = open("knn.txt", "w+")
for i in df['class'][1:]:
    if i not in my_tags:
        my_tags.append(i)
for i in df['class'][1:]:
    classes.write(i + " " + str(my_tags.index(i)) + "\n")
    y.append(my_tags.index(i))
classes.close()
y = np.asarray(y)
print("Splitting dataset")
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)

# Try K = 2..14 and keep whichever model scores best on the held-out split.
best_acc = 0
for i in range(2, 15):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    print(f'K={i} accuracy {acc}')
    if acc >= best_acc:
        print("Saving the model")
        filename = 'knn_' + str(i) + '_model.sav'
        pickle.dump(knn, open(filename, 'wb'))
        best_acc = acc
        print('saved acc: ', best_acc)

# print(classification_report(y_test, y_pred, target_names=my_tags, labels=range(len(my_tags))))
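Because the loop above both selects K and saves the model using the same test split, the reported best accuracy is slightly optimistic. A more standard alternative is to pick K by cross-validation on the training data and score the winner once on the test set, as sketched below under the assumption that `x_train` and `y_train` come from the split above.

```python
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# 5-fold cross-validation over the same range of K values.
search = GridSearchCV(KNeighborsClassifier(),
                      param_grid={'n_neighbors': list(range(2, 15))},
                      cv=5, scoring='accuracy')
search.fit(x_train, y_train)
print(search.best_params_, search.best_score_)
best_knn = search.best_estimator_  # then evaluate once on x_test / y_test
```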
--------------------------------------------------------------------------------
/DNN/dnn.py:
--------------------------------------------------------------------------------
import keras
import pandas as pd
import tensorflow as tf

# Helper libraries
import numpy as np

from keras import backend as K
from matplotlib import pyplot

from sklearn.model_selection import train_test_split
import pickle


def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall


def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))


file_dir = "D:\\SDN Project\\Data\\"

# Only 12 column names are given here, so pandas uses the first CSV column
# (the flow id) as the index and aligns these names with the remaining columns.
names = ['protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
         'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto', 'class']
df = pd.read_csv(file_dir + 'total_class.csv', names=names)

X = np.asarray(
    df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
        1:])
y = []

# Same first-appearance label encoding as in the other training scripts,
# persisted to dnn.txt for use by predict.py.
my_tags = []
classes = open("dnn.txt", "w+")
for i in df['class'][1:]:
    if i not in my_tags:
        my_tags.append(i)
for i in df['class'][1:]:
    classes.write(i + " " + str(my_tags.index(i)) + "\n")
    y.append(my_tags.index(i))
classes.close()
y = np.asarray(y)
print(X.shape, y.shape)


x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)
features = len(x_train[0])

# Fully connected network: 7 input features -> 512 -> 256 -> 128 -> 10 classes,
# with L1/L2 regularization on the hidden layers.
model = keras.Sequential([
    keras.layers.Dense(features, kernel_regularizer=tf.keras.regularizers.l1(0.1)),
    keras.layers.Dense(512, activation=tf.nn.relu, kernel_regularizer=tf.keras.regularizers.l2(0.1)),
    keras.layers.Dense(256, activation=tf.nn.relu, kernel_regularizer=tf.keras.regularizers.l1(0.1)),
    keras.layers.Dense(128, activation=tf.nn.relu, kernel_regularizer=tf.keras.regularizers.l2(0.1)),
    keras.layers.Dense(10, activation=tf.nn.softmax)
])
model.compile(optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['acc', f1_m, precision_m, recall_m])

history = model.fit(x_train, y_train, batch_size=1000, epochs=400)
loss, accuracy, f1_score, precision, recall = model.evaluate(x_test, np.array(y_test), verbose=0)
# Keras's own save format is the reliable way to persist this model;
# predict.py reloads it with keras.models.load_model.
model.save("dnn_model1.sav")
y_predict = model.predict(x_test)

print("Saving the model")
filename = 'dnn_model.sav'
pickle.dump(model, open(filename, 'wb'))

for i in y_predict:
    print(np.argmax(i))
print(f'loss: {loss}, acc: {accuracy}, f1_score: {f1_score}, precision: {precision}, recall: {recall}')
print(model.summary())
print(x_test[0])
pyplot.subplot(212)
pyplot.title('Accuracy')
pyplot.plot(history.history['acc'], label='train')
pyplot.legend()
pyplot.show()

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Network-Traffic-Classification

This is a research project for classifying network traffic. We collected more than 300,000 flows from the network and used nDPI to analyze them. nDPI identified more than 100 types of applications, which we grouped into 10 classes. We then tried different ML algorithms to classify them.

Our current results:

Decision tree: 95.8% accuracy

(I have added a new file https://github.com/pritom007/Network-Traffic-Classification/blob/master/DecisionTree/DecisionTree.ipynb with clean code. You can follow this code and implement the same approach for KNN and RF.)

Random forest: 96.69% accuracy

KNN: 97.24% accuracy

PAA: 99.29% accuracy (read the paper to know more)

To get the dataset, check out the instructions in the Dataset folder.

# How Did we collect Data

We used Wireshark to collect the packets. Since we wanted lab-environment data for this project, we first redirected our lab network to one personal computer (PC) and ran Wireshark on that PC. After collecting the packets (as a .pcap file), we used nDPI to analyze them and extract flow information, and then exported that data as a CSV file. The `data.csv` contains information on all parameters. However, for our project, we only used the top 7 most important parameters as features.

GitHub has limited the download, so I am sharing a Google Drive link for downloading the raw data: https://drive.google.com/file/d/1lcQmYyZutjsW_yJoHgx3Vles8eCgwQeD/view?usp=sharing

After you download it, you have to pre-process the data; in the paper, we show in a table how we grouped the applications into 10 classes. A rough sketch of this step is shown below.
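The snippet below is only an illustrative sketch of that pre-processing, not the exact pipeline from the paper: it assumes `data.csv` uses the column names found in this repo's scripts and reuses the application-to-class mapping in `test.txt` (the grouping table in the paper is authoritative).

```python
import pandas as pd

# The seven flow features used throughout this repo.
features = ['protocol', 'src_port', 'dst_port', 'src2dst_packets',
            'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']

# test.txt lines look like "DNS.Google critical"; rsplit keeps application
# names that contain spaces intact.
with open('test.txt') as f:
    mapping = dict(line.strip().rsplit(' ', 1) for line in f if line.strip())

df = pd.read_csv('data.csv')
df['class'] = df['ndpi_proto'].map(lambda app: mapping.get(app, 'default'))
df[features + ['class']].to_csv('total_class.csv', index=False)
```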
Please read the following paper to know more: https://doi.org/10.1080/09540091.2020.1870437

## To cite the paper and code:

@article{mondal2021dynamic,<br>
title={A dynamic network traffic classifier using supervised ML for a Docker-based SDN network},<br>
author={Mondal, Pritom Kumar and Aguirre Sanchez, Lizeth P and Benedetto, Emmanuele and Shen, Yao and Guo, Minyi},<br>
journal={Connection Science},<br>
pages={1--26},<br>
year={2021},<br>
publisher={Taylor \& Francis}<br>
}
# 中文版
# 网络流量分类

这是一个对网络流量进行分类的研究项目。我们从网络中收集了超过 300000 个流,然后使用 nDPI 对流量进行分析,识别出 100 多种类型的应用程序。我们将这些应用程序分为 10 个类,之后尝试了不同的 ML 算法来对它们进行分类。

我们目前的结果:

Decision tree 95.8% 准确率

(我用干净的代码添加了一个新文件 https://github.com/pritom007/Network-Traffic-Classification/blob/master/DecisionTree/DecisionTree.ipynb。您只需按照此代码并为 KNN、RF 实现它)

Random forest 96.69% 准确率

KNN 97.24% 准确率

PAA 99.29% 准确率(阅读论文了解更多)

要获取数据集,请查看数据集文件夹中的说明。

# 如何收集数据

我们使用 Wireshark 来收集数据包。由于这个项目需要实验室环境的数据,我们首先将实验室网络重定向到一台个人计算机(PC),并在该 PC 上运行 Wireshark。收集数据包(保存为 .pcap 文件)后,我们使用 nDPI 分析数据包并提取流信息,然后将数据导出为 CSV 文件。`data.csv` 包含所有参数的信息。然而,对于我们的项目,我们只使用了前 7 个最重要的参数作为特征。

GitHub 限制了下载,所以我分享了一个用于下载原始数据的 Google Drive 链接:https://drive.google.com/file/d/1lcQmYyZutjsW_yJoHgx3Vles8eCgwQeD/view?usp=sharing

下载后,您需要对数据进行预处理;在论文中,我们用表格展示了如何将应用程序分为 10 个类。

请阅读以下论文了解更多信息:https://doi.org/10.1080/09540091.2020.1870437

## 引用论文和代码:

@article{mondal2021dynamic,<br>
title={A dynamic network traffic classifier using supervised ML for a Docker-based SDN network},<br>
author={Mondal, Pritom Kumar and Aguirre Sanchez, Lizeth P and Benedetto, Emmanuele and Shen, Yao and Guo, Minyi},<br>
journal={Connection Science},<br>
pages={1--26},<br>
year={2021},<br>
publisher={Taylor \& Francis}<br>
}

--------------------------------------------------------------------------------
/test.txt:
--------------------------------------------------------------------------------
DHCPV6 critical
DHCP critical
ApplePush p2p
Google transaction
HTTP_Proxy transaction
HTTP.GenericProtocol transaction
HTTP.Microsoft transaction
HTTP.Office365 transaction
HTTP.WindowsUpdate transaction
HTTP.UbuntuONE transaction
HTTP.MSN transaction
HTTP.QQ transaction
HTTP.Amazon transaction
Amazon transaction
Dropbox transaction
HTTP transaction
HTTP.Cloudflare transaction
HTTP.Sina(Weibo) transaction
HTTP.Google transaction
HTTP.ApplePush transaction
HTTP.GoogleServices transaction
HTTP.Facebook transaction
HTTP.MS_OneDrive transaction
HTTP.Apple transaction
HTTP.Skype transaction
HTTP.AppleStore transaction
HTTP.Github transaction
NETBIOS critical
FACEBOOK bulk
QQ bulk
TLS.WeChat bulk
Viber bulk
HTTP.WeChat bulk
TLS.WeChat bulk
TLS.Twitter bulk
Facebook.chat bulk
Wechat bulk
Skype bulk
SMTPS bulk
SMTP bulk
IMAPS.Tor bulk
IMAPS bulk
IMAP bulk
IMAPS.QQ bulk
IMAPS.Yahoo bulk
TLS transaction
TLS.MSN transaction
TLS.WindowsUpdate transaction
TLS.Dropbox transaction
TLS.Facebook transaction
TLS.TLS_No_Cert transaction
TLS.Signal transaction
TLS.Github transaction
TLS.Microsoft transaction
TLS_No_Cert transaction
TLS.GenericProtocol transaction
TLS.Google transaction
TLS.Cloudflare transaction
TLS.Office365 transaction
TLS.Amazon transaction
TLS.LinkedIn transaction
TLS.QQ transaction
TLS.MS_OneDrive transaction
TLS.Skype transaction
TLS.Sina(Weibo) transaction
TLS.GoogleServices transaction
TLS.Spotify transaction
TLS.Yahoo transaction
TLS.UbuntuONE transaction
TLS.Steam transaction
TLS.Apple transaction
TLS.AppleiCloud transaction
TLS.Wikipedia transaction
TLS.Starcraft transaction
TLS.GoogleDocs transaction
TLS.ApplePush transaction
TLS.AppleiTunes transaction
TLS.AppleStore transaction
TLS.IMAPS transaction
QUIC.Google transaction
QUIC.GoogleServices transaction
QUIC transaction
DNS.GoogleServices critical
DNS.Google critical
DNS.Office365 critical
DNS.MS_OneDrive critical
DNS.GenericProtocol critical
DNS.LinkedIn critical
DNS.Microsoft critical
DNS.WindowsUpdate critical
DNS.QQ critical
DNS.AmazonVideo critical
DNS.UbuntuONE critical
DNS.Facebook critical
DNS.Dropbox critical
DNS.PlayStore critical
DNS.MSN critical
DNS.Skype critical
DNS.Amazon critical
DNS critical
DNS.Wikipedia critical
DNS.Instagram critical
DNS.YouTube critical
DNS.Sina(Weibo) critical
DNS.Yahoo critical
DNS.Steam critical
DNS.ApplePush critical
DNS.Xbox critical
DNS.Twitter critical
DNS.GoogleMaps critical
DNS.AppleiTunes critical
DNS.Github critical
DNS.Spotify critical
DNS.GoogleDocs critical
DNS.Apple critical
DNS.GoogleDrive critical
DNS.GMail critical
DNS.GooglePlus critical
DNS.AppleiCloud critical
DNS.NetFlix critical
MDNS critical
LLMNR critical
UPnP OAM
SSDP OAM
BJNP OAM
Starcraft p2p
Playstation p2p
steam p2p
TLS.Xbox p2p
Xbox p2p
Steam p2p
BitTorrent p2p
eDonkey p2p
Pando_Media_Booster p2p
Thunder p2p
TLS.AmazonVideo video
HTTP.AmazonVideo video
TLS.YouTube video
TeamViewer transaction
RDP transaction
RTP control
BGP control
ICMP control
NTP control
ICMPv6 control
IGMP control
RX signaling
STUN signaling
H323 signaling
STUN.SkypeCall signaling
STUN.Signal signaling
STUN.WhatsAppVoice signaling
STUN.GoogleHangoutDuo signaling
Skype.SkypeCall VoIP
WhatsAppVoice VoIP
FTP_CONTROL bulk
SSH.Google OAM
SSH OAM
TELNET OAM
CHECKMK OAM
SNMP OAM
Whois-DAS bulk
MsSQL-TDS bulk
PostgreSQL bulk
IPsec OAM
Citrix default
Redis default
Targus Dataspeed default
Tor default
SOCKS default
Mining default
SOCKS default
COAP default
MQTT default
CiscoVPN default
Unknown default

--------------------------------------------------------------------------------
/KNN/predict.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import pandas as pd
# Helper libraries
import numpy as np
import pickle
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils.multiclass import unique_labels
import seaborn as sns
from dictionary import name_convert


def cm_analysis(y_true, y_pred, labels, ymap=None, title="", figsize=(10, 10)):
    """
    Generate matrix plot of confusion matrix with pretty annotations.
    The plot image is saved to disk as cm.png.
    args:
      y_true:  true label of the data, with shape (nsamples,)
      y_pred:  prediction of the data, with shape (nsamples,)
      labels:  string array, name the order of class labels in the confusion matrix.
               use `clf.classes_` if using scikit-learn models. with shape (nclass,).
      ymap:    dict: any -> string, length == nclass.
               if not None, map the labels & ys to more understandable strings.
               Caution: original y_true, y_pred and labels must align.
      figsize: the size of the figure plotted.
    """
    if ymap is not None:
        y_pred = [ymap[yi] for yi in y_pred]
        y_true = [ymap[yi] for yi in y_true]
        labels = [ymap[yi] for yi in labels]
    # The class labels are their own indices here, so this is just 0..nclass-1.
    l = list(range(len(labels)))
    cm = confusion_matrix(y_true, y_pred, labels=l)
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100
    annot = np.empty_like(cm).astype(str)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                s = cm_sum[i]
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
            elif c == 0:
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)
    cm = pd.DataFrame(cm, index=labels, columns=labels)
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    ax.set(title=title)
    sns.heatmap(cm, annot=annot, fmt='', ax=ax)
    plt.savefig("cm.png")
    plt.show()


def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=90, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax


file_dir = "D:\\SDN Project\\Data\\Processed\\"

names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
         'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto']
df = pd.read_csv(file_dir + '26sep19.csv', names=names)
X = np.asarray(
    df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
        1:])
y = df['ndpi_proto'][1:]

# Read the class-name-to-index mapping written by knn.py during training.
read_file = open("knn.txt", 'r')
dict_class = []
for i in read_file.readlines():
    name = i.split(" ")[0]
    if name not in dict_class:
        dict_class.append(name)
read_file.close()
print(dict_class)

# Reload the saved model and classify the new capture.
count = 1
loaded_model = pickle.load(open('knn_3_model.sav', 'rb'))
result = loaded_model.predict(X)
original = []
for i in result:
    print(f'{count} {i} {dict_class[i]} {name_convert(y[count])}')
    # Classes unseen at training time are lumped into index 7 (see the note below).
    if name_convert(y[count]) not in dict_class:
        original.append(7)
    else:
        original.append(dict_class.index(name_convert(y[count])))
    count += 1
print(accuracy_score(original, result))
print(confusion_matrix(original, result))
cm_analysis(original, result, dict_class, ymap=None, title="Confusion Matrix", figsize=(10, 10))
plot_confusion_matrix(original, result, classes=np.asarray(dict_class), normalize=False, title="Confusion Matrix", cmap=plt.cm.Reds)
# Save before show: show() hands the figure to the GUI, and a later savefig
# would write out a blank canvas.
plt.savefig("cm_1.png")
plt.show()
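One fragile spot in these predict scripts: flows whose class never appeared during training are hard-coded to index 7, which only means "default" if that class happened to be the eighth one encountered. The sketch below shows a safer lookup; it assumes `dict_class` was read from the mapping file as above, and `app_name` stands in for an `ndpi_proto` value from the capture.

```python
# Fall back to the "default" class's real index instead of a magic number.
fallback = dict_class.index('default') if 'default' in dict_class else 0

label = name_convert(app_name)
idx = dict_class.index(label) if label in dict_class else fallback
```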
--------------------------------------------------------------------------------
/RandomForest/predict.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import pandas as pd
# Helper libraries
import numpy as np
import pickle
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils.multiclass import unique_labels
import seaborn as sns
from dictionary import name_convert


def cm_analysis(y_true, y_pred, labels, ymap=None, title="", figsize=(10, 10)):
    """
    Generate matrix plot of confusion matrix with pretty annotations.
    The plot image is saved to disk as cm.png.
    args:
      y_true:  true label of the data, with shape (nsamples,)
      y_pred:  prediction of the data, with shape (nsamples,)
      labels:  string array, name the order of class labels in the confusion matrix.
               use `clf.classes_` if using scikit-learn models. with shape (nclass,).
      ymap:    dict: any -> string, length == nclass.
               if not None, map the labels & ys to more understandable strings.
               Caution: original y_true, y_pred and labels must align.
      figsize: the size of the figure plotted.
    """
    if ymap is not None:
        y_pred = [ymap[yi] for yi in y_pred]
        y_true = [ymap[yi] for yi in y_true]
        labels = [ymap[yi] for yi in labels]
    # The class labels are their own indices here, so this is just 0..nclass-1.
    l = list(range(len(labels)))
    cm = confusion_matrix(y_true, y_pred, labels=l)
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100
    annot = np.empty_like(cm).astype(str)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                s = cm_sum[i]
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
            elif c == 0:
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)
    cm = pd.DataFrame(cm, index=labels, columns=labels)
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    ax.set(title=title)
    sns.heatmap(cm, annot=annot, fmt='', ax=ax)
    plt.savefig("cm.png")
    plt.show()


def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=90, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax


file_dir = "D:\\SDN Project\\Data\\Processed\\"

names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
         'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto']
df = pd.read_csv(file_dir + '26sep19.csv', names=names)
X = np.asarray(
    df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
        1:])
y = df['ndpi_proto'][1:]

# Read the class-name-to-index mapping written by randomforest.py during training.
read_file = open("randomforest.txt", 'r')
dict_class = []
for i in read_file.readlines():
    name = i.split(" ")[0]
    if name not in dict_class:
        dict_class.append(name)
read_file.close()

# Reload the saved model and classify the new capture.
count = 1
loaded_model = pickle.load(open('randomforest_model.sav', 'rb'))
result = loaded_model.predict(X)
original = []
for i in result:
    print(f'{count} {i} {dict_class[i]} {name_convert(y[count])}')
    # Classes unseen at training time are lumped into index 7 (see the note
    # after KNN/predict.py above).
    if name_convert(y[count]) not in dict_class:
        original.append(7)
    else:
        original.append(dict_class.index(name_convert(y[count])))
    count += 1
print(accuracy_score(original, result))
print(confusion_matrix(original, result))
cm_analysis(original, result, dict_class, ymap=None, title="Confusion Matrix", figsize=(10, 10))
plot_confusion_matrix(original, result, classes=np.asarray(dict_class), normalize=False, title="Confusion Matrix", cmap=plt.cm.Reds)
# Save before show, otherwise the saved figure may be blank.
plt.savefig("cm_1.png")
plt.show()

--------------------------------------------------------------------------------
/DecisionTree/predict.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import pandas as pd
# Helper libraries
import numpy as np
import pickle
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils.multiclass import unique_labels
import seaborn as sns
from dictionary import name_convert


def cm_analysis(y_true, y_pred, labels, ymap=None, title="", figsize=(10, 10)):
    """
    Generate matrix plot of confusion matrix with pretty annotations.
    The plot image is saved to disk as cm.png.
    args:
      y_true:  true label of the data, with shape (nsamples,)
      y_pred:  prediction of the data, with shape (nsamples,)
      labels:  string array, name the order of class labels in the confusion matrix.
               use `clf.classes_` if using scikit-learn models. with shape (nclass,).
      ymap:    dict: any -> string, length == nclass.
               if not None, map the labels & ys to more understandable strings.
               Caution: original y_true, y_pred and labels must align.
      figsize: the size of the figure plotted.
27 | """ 28 | if ymap is not None: 29 | y_pred = [ymap[yi] for yi in y_pred] 30 | y_true = [ymap[yi] for yi in y_true] 31 | labels = [ymap[yi] for yi in labels] 32 | l = [] 33 | for i in labels: 34 | l.append(labels.index(i)) 35 | cm = confusion_matrix(y_true, y_pred, labels=l) 36 | cm_sum = np.sum(cm, axis=1, keepdims=True) 37 | cm_perc = cm / cm_sum.astype(float) * 100 38 | annot = np.empty_like(cm).astype(str) 39 | nrows, ncols = cm.shape 40 | for i in range(nrows): 41 | for j in range(ncols): 42 | c = cm[i, j] 43 | p = cm_perc[i, j] 44 | if i == j: 45 | s = cm_sum[i] 46 | annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s) 47 | elif c == 0: 48 | annot[i, j] = '' 49 | else: 50 | annot[i, j] = '%.1f%%\n%d' % (p, c) 51 | cm = pd.DataFrame(cm, index=labels, columns=labels) 52 | cm.index.name = 'Actual' 53 | cm.columns.name = 'Predicted' 54 | fig, ax = plt.subplots(figsize=figsize) 55 | ax.set(title=title) 56 | sns.heatmap(cm, annot=annot, fmt='', ax=ax) 57 | plt.savefig("cm.png") 58 | plt.show() 59 | 60 | 61 | def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, cmap=plt.cm.Blues): 62 | """ 63 | This function prints and plots the confusion matrix. 64 | Normalization can be applied by setting `normalize=True`. 65 | """ 66 | if not title: 67 | if normalize: 68 | title = 'Normalized confusion matrix' 69 | else: 70 | title = 'Confusion matrix, without normalization' 71 | 72 | # Compute confusion matrix 73 | cm = confusion_matrix(y_true, y_pred) 74 | # Only use the labels that appear in the data 75 | classes = classes[unique_labels(y_true, y_pred)] 76 | if normalize: 77 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 78 | print("Normalized confusion matrix") 79 | else: 80 | print('Confusion matrix, without normalization') 81 | 82 | print(cm) 83 | 84 | fig, ax = plt.subplots() 85 | im = ax.imshow(cm, interpolation='nearest', cmap=cmap) 86 | ax.figure.colorbar(im, ax=ax) 87 | # We want to show all ticks... 88 | ax.set(xticks=np.arange(cm.shape[1]), 89 | yticks=np.arange(cm.shape[0]), 90 | # ... and label them with the respective list entries 91 | xticklabels=classes, yticklabels=classes, 92 | title=title, 93 | ylabel='True label', 94 | xlabel='Predicted label') 95 | 96 | # Rotate the tick labels and set their alignment. 97 | plt.setp(ax.get_xticklabels(), rotation=90, ha="right", 98 | rotation_mode="anchor") 99 | 100 | # Loop over data dimensions and create text annotations. 101 | fmt = '.2f' if normalize else 'd' 102 | thresh = cm.max() / 2. 
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax


file_dir = "D:\\SDN Project\\Data\\Processed\\"

names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
         'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto']
df = pd.read_csv(file_dir + '26sep19.csv', names=names)
X = np.asarray(
    df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
        1:])
y = df['ndpi_proto'][1:]

# Read the class-name-to-index mapping written by decisiontree.py during training.
read_file = open("decisiontree.txt", 'r')
dict_class = []
for i in read_file.readlines():
    name = i.split(" ")[0]
    if name not in dict_class:
        dict_class.append(name)
read_file.close()
print(dict_class)

# Reload the saved model and classify the new capture.
count = 1
loaded_model = pickle.load(open('decisiontree_model.sav', 'rb'))
result = loaded_model.predict(X)
original = []
for i in result:
    print(f'{count} {i} {dict_class[i]} {name_convert(y[count])}')
    # Classes unseen at training time are lumped into index 7 (see the note
    # after KNN/predict.py above).
    if name_convert(y[count]) not in dict_class:
        original.append(7)
    else:
        original.append(dict_class.index(name_convert(y[count])))
    count += 1
print(accuracy_score(original, result))
print(confusion_matrix(original, result))
cm_analysis(original, result, dict_class, ymap=None, title="Confusion Matrix", figsize=(10, 10))
plot_confusion_matrix(original, result, classes=np.asarray(dict_class), normalize=False, title="Confusion Matrix", cmap=plt.cm.Reds)
# Save before show, otherwise the saved figure may be blank.
plt.savefig("cm_1.png")
plt.show()

--------------------------------------------------------------------------------
/DNN/predict.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import pandas as pd
# Helper libraries
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import seaborn as sns
from dictionary import name_convert
from keras.models import load_model


def accu(y_true, y_pred):
    # Fraction of samples whose true class index matches the argmax of the
    # predicted probability vector.
    good = 0
    for i in range(0, len(y_true)):
        if y_true[i] == np.argmax(y_pred[i]):
            good += 1
    return good / len(y_true)


def cm_analysis(y_true, y_pred, labels, ymap=None, title="", figsize=(10, 10)):
    """
    Generate matrix plot of confusion matrix with pretty annotations.
    The plot image is saved to disk as cm.png.
    args:
      y_true:  true label of the data, with shape (nsamples,)
      y_pred:  prediction of the data, with shape (nsamples,)
      labels:  string array, name the order of class labels in the confusion matrix.
               use `clf.classes_` if using scikit-learn models. with shape (nclass,).
      ymap:    dict: any -> string, length == nclass.
               if not None, map the labels & ys to more understandable strings.
               Caution: original y_true, y_pred and labels must align.
      figsize: the size of the figure plotted.
36 | """ 37 | if ymap is not None: 38 | y_pred = [ymap[yi] for yi in y_pred] 39 | y_true = [ymap[yi] for yi in y_true] 40 | labels = [ymap[yi] for yi in labels] 41 | l = [] 42 | for i in labels: 43 | l.append(labels.index(i)) 44 | cm = confusion_matrix(y_true, y_pred, labels=l) 45 | cm_sum = np.sum(cm, axis=1, keepdims=True) 46 | cm_perc = cm / cm_sum.astype(float) * 100 47 | annot = np.empty_like(cm).astype(str) 48 | nrows, ncols = cm.shape 49 | for i in range(nrows): 50 | for j in range(ncols): 51 | c = cm[i, j] 52 | p = cm_perc[i, j] 53 | if i == j: 54 | s = cm_sum[i] 55 | annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s) 56 | elif c == 0: 57 | annot[i, j] = '' 58 | else: 59 | annot[i, j] = '%.1f%%\n%d' % (p, c) 60 | cm = pd.DataFrame(cm, index=labels, columns=labels) 61 | cm.index.name = 'Actual' 62 | cm.columns.name = 'Predicted' 63 | fig, ax = plt.subplots(figsize=figsize) 64 | ax.set(title=title) 65 | sns.heatmap(cm, annot=annot, fmt='', ax=ax) 66 | plt.savefig("cm.png") 67 | plt.show() 68 | 69 | 70 | def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, cmap=plt.cm.Blues): 71 | """ 72 | This function prints and plots the confusion matrix. 73 | Normalization can be applied by setting `normalize=True`. 74 | """ 75 | if not title: 76 | if normalize: 77 | title = 'Normalized confusion matrix' 78 | else: 79 | title = 'Confusion matrix, without normalization' 80 | 81 | # Compute confusion matrix 82 | cm = confusion_matrix(y_true, y_pred) 83 | # Only use the labels that appear in the data 84 | classes = classes[unique_labels(y_true, y_pred)] 85 | if normalize: 86 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 87 | print("Normalized confusion matrix") 88 | else: 89 | print('Confusion matrix, without normalization') 90 | 91 | print(cm) 92 | 93 | fig, ax = plt.subplots() 94 | im = ax.imshow(cm, interpolation='nearest', cmap=cmap) 95 | ax.figure.colorbar(im, ax=ax) 96 | # We want to show all ticks... 97 | ax.set(xticks=np.arange(cm.shape[1]), 98 | yticks=np.arange(cm.shape[0]), 99 | # ... and label them with the respective list entries 100 | xticklabels=classes, yticklabels=classes, 101 | title=title, 102 | ylabel='True label', 103 | xlabel='Predicted label') 104 | 105 | # Rotate the tick labels and set their alignment. 106 | plt.setp(ax.get_xticklabels(), rotation=90, ha="right", 107 | rotation_mode="anchor") 108 | 109 | # Loop over data dimensions and create text annotations. 110 | fmt = '.2f' if normalize else 'd' 111 | thresh = cm.max() / 2. 
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax


file_dir = "D:\\SDN Project\\Data\\Processed\\"

names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
         'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto']
df = pd.read_csv(file_dir + '26sep19.csv', names=names)
X = np.asarray(
    df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
        1:])
y = df['ndpi_proto'][1:]

# Read the class-name-to-index mapping written by dnn.py during training.
read_file = open("dnn.txt", 'r')
dict_class = []
for i in read_file.readlines():
    name = i.split(" ")[0]
    if name not in dict_class:
        dict_class.append(name)
read_file.close()
print(dict_class)

# Recreate the exact same model saved by dnn.py.
count = 1
loaded_model = load_model('dnn_model1.sav')
result = loaded_model.predict(X)

# The network outputs one probability vector per flow; argmax picks the class.
original = []
y_pred = []
for i in result:
    print(f'{count} {np.argmax(i)} {dict_class[np.argmax(i)]} {name_convert(y[count])}')
    y_pred.append(np.argmax(i))
    # Classes unseen at training time are lumped into index 7 (see the note
    # after KNN/predict.py above).
    if name_convert(y[count]) not in dict_class:
        original.append(7)
    else:
        original.append(dict_class.index(name_convert(y[count])))
    count += 1

print(f"accuracy: {accu(original, result)}")
print(confusion_matrix(original, y_pred))
cm_analysis(original, y_pred, dict_class, ymap=None, title="Confusion Matrix", figsize=(10, 10))
plot_confusion_matrix(original, y_pred, classes=np.asarray(dict_class), normalize=False, title="Confusion Matrix", cmap=plt.cm.Reds)
# Save before show, otherwise the saved figure may be blank.
plt.savefig("cm_1.png")
plt.show()

--------------------------------------------------------------------------------