├── .gitignore
├── Dataset
│   ├── .gitattributes
│   └── How to get the data.txt
├── DNN
│   ├── cm.png
│   ├── model_1.h5
│   ├── dnn_model.sav
│   ├── dnn_model1.sav
│   ├── dnn.py
│   └── predict.py
├── KNN
│   ├── cm.png
│   ├── knn_3_model.sav
│   ├── knn.py
│   └── predict.py
├── DecisionTree
│   ├── cm.png
│   ├── 7_cm_dt.png
│   ├── decisiontree_model.sav
│   ├── decisiontree.py
│   └── predict.py
├── RandomForest
│   ├── cm.png
│   ├── randomforest_model.sav
│   ├── randomforest.py
│   └── predict.py
├── dictionary.py
├── README.md
└── test.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | network_traffic.csv
2 |
--------------------------------------------------------------------------------
/Dataset/.gitattributes:
--------------------------------------------------------------------------------
1 | *.csv filter=lfs diff=lfs merge=lfs -text
2 |
--------------------------------------------------------------------------------
/DNN/cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DNN/cm.png
--------------------------------------------------------------------------------
/KNN/cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/KNN/cm.png
--------------------------------------------------------------------------------
/DNN/model_1.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DNN/model_1.h5
--------------------------------------------------------------------------------
/DNN/dnn_model.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DNN/dnn_model.sav
--------------------------------------------------------------------------------
/DNN/dnn_model1.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DNN/dnn_model1.sav
--------------------------------------------------------------------------------
/DecisionTree/cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DecisionTree/cm.png
--------------------------------------------------------------------------------
/KNN/knn_3_model.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/KNN/knn_3_model.sav
--------------------------------------------------------------------------------
/RandomForest/cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/RandomForest/cm.png
--------------------------------------------------------------------------------
/DecisionTree/7_cm_dt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DecisionTree/7_cm_dt.png
--------------------------------------------------------------------------------
/DecisionTree/decisiontree_model.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DecisionTree/decisiontree_model.sav
--------------------------------------------------------------------------------
/RandomForest/randomforest_model.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/RandomForest/randomforest_model.sav
--------------------------------------------------------------------------------
/dictionary.py:
--------------------------------------------------------------------------------
1 | def name_convert(x):
2 |     # Map an nDPI protocol name to its traffic class using test.txt at the repository root.
3 |     dict_c = {}
4 |     with open('../test.txt', 'r') as file:
5 |         for i in file.readlines():
6 |             i = i.strip()
7 |             name, cl = i.split(" ")[0], i.split(" ")[1]
8 |             dict_c[name] = cl
9 |     try:
10 |         return dict_c[x]
11 |     except KeyError:
12 |         return "default"
--------------------------------------------------------------------------------
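A quick usage sketch (run from one of the model directories so that '../test.txt' resolves to the mapping file at the repository root; the example protocol names come from test.txt):

    from dictionary import name_convert

    print(name_convert("TLS.Google"))    # -> "transaction"
    print(name_convert("DNS"))           # -> "critical"
    print(name_convert("SomeNewProto"))  # -> "default" for names missing from the map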
/Dataset/How to get the data.txt:
--------------------------------------------------------------------------------
1 | Since .pcap files are large, we processed the data into CSV format; even so, the file is about 150 MB. Please email me at:
2 | pritom007@live.com
3 | or
4 | pritom@sjtu.edu.com, and include a short note explaining why you need the data.
5 | 
6 | This data is available for research purposes only, not for commercial use.
7 |
8 | Please cite my work if you use this dataset for research.
9 |
10 | @article{mondal2021dynamic,
11 | title={A dynamic network traffic classifier using supervised ML for a Docker-based SDN network},
12 | author={Mondal, Pritom Kumar and Aguirre Sanchez, Lizeth P and Benedetto, Emmanuele and Shen, Yao and Guo, Minyi},
13 | journal={Connection Science},
14 | pages={1--26},
15 | year={2021},
16 | publisher={Taylor \& Francis}
17 | }
18 |
19 | read more: https://doi.org/10.1080/09540091.2020.1870437
20 |
--------------------------------------------------------------------------------
/DecisionTree/decisiontree.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.metrics import accuracy_score, classification_report
3 | from sklearn.model_selection import train_test_split
4 | import numpy as np
5 | from sklearn.tree import DecisionTreeClassifier
6 | import pickle
7 |
8 | file_dir = "..\\Data\\"  # adjust to wherever total_class.csv lives
9 |
10 | names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
11 | 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto', 'class']
12 | print("Loading Dataset total_class.csv")
13 | df = pd.read_csv(file_dir + 'total_class.csv', names=names)
14 |
15 | X = np.asarray(
16 | df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
17 | 1:])
18 | y = []
19 |
20 | my_tags = []
21 | classes = open("decisiontree.txt", "w+")  # class-name -> index map, read back by predict.py
22 | for i in df['class'][1:]:  # [1:] skips the header row retained because names= was passed
23 |     if i not in my_tags:
24 |         my_tags.append(i)
25 | for i in df['class'][1:]:
26 |     classes.write(i + " " + str(my_tags.index(i)) + "\n")
27 |     y.append(my_tags.index(i))
28 | y = np.asarray(y)
29 | print("Splitting dataset")
30 | x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)
31 |
32 | clf = DecisionTreeClassifier(random_state=0)
33 |
34 | print("Training Started")
35 | clf.fit(x_train, y_train)
36 |
37 | print("Testing the classifier")
38 | y_pred = clf.predict(x_test)
39 |
40 | print("Saving the model")
41 | filename = 'decisiontree_model.sav'
42 | pickle.dump(clf, open(filename, 'wb'))
43 |
44 | print('accuracy %s' % accuracy_score(y_pred, y_test))
45 | print(classification_report(y_test, y_pred, target_names=my_tags, labels=range(len(my_tags))))
46 |
--------------------------------------------------------------------------------
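To reuse the saved classifier, a minimal sketch (the flow values below are made up for illustration; the column order must match training):

    import pickle
    import numpy as np

    clf = pickle.load(open('decisiontree_model.sav', 'rb'))

    # [protocol, src_port, dst_port, src2dst_packets, src2dst_bytes,
    #  dst2src_packets, dst2src_bytes]
    flow = np.asarray([[6, 51234, 443, 12, 3400, 10, 8900]])
    print(clf.predict(flow))  # integer class index, as recorded in decisiontree.txt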
/RandomForest/randomforest.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.metrics import accuracy_score, classification_report
3 | from sklearn.model_selection import train_test_split
4 | import numpy as np
5 | from sklearn.ensemble import RandomForestClassifier
6 | import pickle
7 |
8 |
9 | file_dir = "D:\\SDN Project\\Data\\"
10 |
11 | names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
12 | 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto', 'class']
13 | print("Loading Dataset total_class.csv")
14 | df = pd.read_csv(file_dir + 'total_class.csv', names=names)
15 |
16 | X = np.asarray(
17 | df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
18 | 1:])
19 | y = []
20 |
21 | my_tags = []
22 | classes = open("randomforest.txt", "w+")
23 | for i in df['class'][1:]:
24 | if i not in my_tags:
25 | my_tags.append(i)
26 | for i in df['class'][1:]:
27 | classes.write(i + " " + str(my_tags.index(i)) + "\n")
28 | y.append(my_tags.index(i))
29 | y = np.asarray(y)
30 | print("Splitting dataset")
31 | x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.30, shuffle=True)
32 |
33 | clf = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=1)
34 | print("Training Started")
35 | clf.fit(x_train, y_train)
36 |
37 | print("Testing the classifier")
38 | y_pred = clf.predict(x_test)
39 |
40 | print("Saving the model")
41 | filename = 'randomforest_model.sav'
42 | pickle.dump(clf, open(filename, 'wb'))
43 |
44 | print('accuracy %s' % accuracy_score(y_pred, y_test))
45 | print(classification_report(y_test, y_pred, target_names=my_tags, labels=range(len(my_tags))))
46 |
--------------------------------------------------------------------------------
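A fitted random forest exposes per-feature importances; a short sketch (assuming the model file saved by randomforest.py above) shows which of the 7 flow features drive the classification:

    import pickle

    features = ['protocol', 'src_port', 'dst_port', 'src2dst_packets',
                'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']

    clf = pickle.load(open('randomforest_model.sav', 'rb'))
    for name, score in sorted(zip(features, clf.feature_importances_),
                              key=lambda t: t[1], reverse=True):
        print(f'{name}: {score:.3f}')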
/KNN/knn.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.metrics import accuracy_score, classification_report
3 | from sklearn.model_selection import train_test_split
4 | import numpy as np
5 | from sklearn.neighbors import KNeighborsClassifier
6 | #from sklearn.neighbors.nca import NeighborhoodComponentsAnalysis
7 |
8 | import pickle
9 | file_dir = "D:\\SDN Project\\Data\\"
10 |
11 | names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
12 | 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto', 'class']
13 | print("Loading Dataset total_class.csv")
14 | df = pd.read_csv(file_dir + 'total_class.csv', names=names)
15 |
16 | X = np.asarray(
17 | df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
18 | 1:])
19 | y = []
20 |
21 | my_tags = []
22 | classes = open("knn.txt", "w+")
23 | for i in df['class'][1:]:
24 | if i not in my_tags:
25 | my_tags.append(i)
26 | for i in df['class'][1:]:
27 | classes.write(i + " " + str(my_tags.index(i)) + "\n")
28 | y.append(my_tags.index(i))
29 | y = np.asarray(y)
30 | print("Splitting dataset")
31 | x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)
32 | best_acc = 0  # best accuracy seen so far in the K sweep
33 | for i in range(2, 15):
34 | knn = KNeighborsClassifier(n_neighbors=i)
35 |
36 | #print("Training Started")
37 | knn.fit(x_train, y_train)
38 |
39 | #print("Testing the classifier")
40 | y_pred = knn.predict(x_test)
41 |     acc = accuracy_score(y_test, y_pred)
42 |     print(f'K={i} accuracy {acc}')
43 |     if acc >= best_acc:
44 |         print("Saving the model")
45 |         filename = 'knn_' + str(i) + '_model.sav'
46 |         pickle.dump(knn, open(filename, 'wb'))
47 |         best_acc = acc
48 |         print('saved acc: ', best_acc)
49 |
50 |
51 | #print(classification_report(y_test, y_pred, target_names=my_tags, labels=range(len(my_tags))))
--------------------------------------------------------------------------------
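KNN is distance-based, and these 7 features live on very different scales (ports go up to 65535, byte counts far higher), so standardizing before the neighbor search is often worth trying. A sketch, assuming the x_train/x_test split from knn.py above:

    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import accuracy_score

    # Scale every feature to zero mean / unit variance, then classify in the scaled space.
    pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
    pipe.fit(x_train, y_train)
    print(accuracy_score(y_test, pipe.predict(x_test)))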
/DNN/dnn.py:
--------------------------------------------------------------------------------
1 | import ast
2 | import matplotlib.pyplot as plt
3 | import keras
4 | import pandas as pd
5 | import tensorflow as tf
6 |
7 | # Helper libraries
8 | import numpy as np
9 |
10 | from keras import backend as K
11 | from matplotlib import pyplot
12 |
13 | from sklearn.model_selection import train_test_split
14 | import pickle
15 |
16 | def recall_m(y_true, y_pred):
17 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
18 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
19 | recall = true_positives / (possible_positives + K.epsilon())
20 | return recall
21 |
22 |
23 | def precision_m(y_true, y_pred):
24 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
25 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
26 | precision = true_positives / (predicted_positives + K.epsilon())
27 | return precision
28 |
29 |
30 | def f1_m(y_true, y_pred):
31 | precision = precision_m(y_true, y_pred)
32 | recall = recall_m(y_true, y_pred)
33 | return 2 * ((precision * recall) / (precision + recall + K.epsilon()))
34 |
35 |
36 | file_dir = "D:\\SDN Project\\Data\\"  # adjust to your local dataset directory
37 |
38 |
39 | names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
40 |          'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto', 'class']  # same 13 columns as the other scripts
41 | df = pd.read_csv(file_dir + 'total_class.csv', names=names)
42 | array = df.values
43 |
44 | X = np.asarray(df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets','dst2src_bytes']][1:])
45 | y = []
46 |
47 | my_tags = []
48 | classes = open("dnn.txt", "w+")
49 | for i in df['class'][1:]:
50 | if i not in my_tags:
51 | my_tags.append(i)
52 | for i in df['class'][1:]:
53 | classes.write(i+" "+str(my_tags.index(i))+"\n")
54 | y.append(my_tags.index(i))
55 | y = np.asarray(y)
56 | print(X.shape, y.shape)
57 |
58 |
59 | x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)
60 | features = len(x_train[0])
61 |
62 | model = keras.Sequential([
63 | keras.layers.Dense(features, kernel_regularizer=tf.keras.regularizers.l1(0.1)),
64 | keras.layers.Dense(512, activation=tf.nn.relu, kernel_regularizer=tf.keras.regularizers.l2(0.1)),
65 | keras.layers.Dense(256, activation=tf.nn.relu, kernel_regularizer=tf.keras.regularizers.l1(0.1)),
66 | keras.layers.Dense(128, activation=tf.nn.relu, kernel_regularizer=tf.keras.regularizers.l2(0.1)),
67 |     keras.layers.Dense(10, activation=tf.nn.softmax)  # one output per traffic class (10 classes)
68 | ])
69 | model.compile(optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001),
70 | loss='sparse_categorical_crossentropy',
71 | metrics=['acc', f1_m, precision_m, recall_m])
72 |
73 | history = model.fit(x_train, y_train, batch_size=1000, epochs=400)
74 | loss, accuracy, f1_score, precision, recall = model.evaluate(x_test, np.array(y_test), verbose=0)
75 | model.save("dnn_model1.sav")  # Keras-native save; DNN/predict.py reloads this file
76 | y_predict = model.predict(x_test)
77 |
78 | print("Saving the model")
79 | filename = 'dnn_model.sav'
80 | pickle.dump(model, open(filename, 'wb'))  # kept for parity with the other models; model.save above is the reliable way to persist a Keras model
81 |
82 | for i in y_predict:
83 | print(np.argmax(i))
84 | print(f'loss: {loss}, acc: {accuracy}, f1_score: {f1_score}, precision: {precision}, recall: {recall}')
85 | print(model.summary())
86 | print(x_test[0])
87 | pyplot.subplot(212)
88 | pyplot.title('Accuracy')
89 | pyplot.plot(history.history['acc'], label='train')
90 | pyplot.legend()
91 | pyplot.show()
92 |
--------------------------------------------------------------------------------
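Reloading the Keras-native save later requires handling the custom metrics the model was compiled with; passing compile=False is the simplest route for inference. A sketch, assuming the dnn_model1.sav file saved above:

    from keras.models import load_model

    # Inference only: skipping compilation avoids re-registering the custom
    # f1_m / precision_m / recall_m metric functions.
    model = load_model("dnn_model1.sav", compile=False)
    probs = model.predict(x_test)        # softmax over the 10 classes
    pred_classes = probs.argmax(axis=1)  # integer class indices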
/README.md:
--------------------------------------------------------------------------------
1 | # Network-Traffic-Classification
2 |
3 | This is a research project on classifying network traffic. We collected more than 300,000 flows from our network and used nDPI to analyze them, which yielded more than 100 application types. We then grouped those applications into 10 classes and tried different ML algorithms to classify them.
4 |
5 | Our current results:
6 | 
7 | - Decision tree: 95.8% accuracy
8 | - Random forest: 96.69% accuracy
9 | - KNN: 97.24% accuracy
10 | - PAA: 99.29% accuracy (read the paper to learn more)
11 | 
12 | (I have added a new file with clean code for the decision tree:
13 | https://github.com/pritom007/Network-Traffic-Classification/blob/master/DecisionTree/DecisionTree.ipynb
14 | You can follow it and adapt it for KNN and RF.)
15 | 
16 | To get the dataset, check out the instructions in the Dataset folder.
17 | 
18 |
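For a quick start, here is a minimal sketch mirroring `DecisionTree/decisiontree.py` (assuming you have obtained `total_class.csv` and that its header row uses the column names shown in the scripts):

```python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# The 7 flow features used throughout this repo.
features = ['protocol', 'src_port', 'dst_port', 'src2dst_packets',
            'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']

df = pd.read_csv('total_class.csv')
X, y = df[features], df['class']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)

clf = DecisionTreeClassifier(random_state=0).fit(x_train, y_train)
print(accuracy_score(y_test, clf.predict(x_test)))
```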
19 | # How We Collected the Data
20 |
21 | We used Wireshark to collect the packets. Since we wanted lab-environment data for this project, we first redirected our lab network to one personal computer (PC) and ran Wireshark on it. After collecting the packets (as a .pcap file), we used nDPI to analyze them and extract the flow information, and then exported that data as a CSV file. The `data.csv` contains information on all parameters. However, for our project we only used the top 7 most important parameters as features.
22 |
23 | GitHub has limited the download, so I am sharing a Google Drive link for the raw data: https://drive.google.com/file/d/1lcQmYyZutjsW_yJoHgx3Vles8eCgwQeD/view?usp=sharing
24 |
25 | After you download it, you have to pre-process the data; in the paper, we show in a table how we grouped the applications into 10 classes.
26 |
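A minimal pre-processing sketch, assuming the raw export has an `ndpi_proto` column and that `test.txt` at the repository root holds the application-to-class mapping described in the paper:

```python
import pandas as pd

# Build the application -> class map from test.txt (same idea as dictionary.py).
mapping = {}
with open('test.txt') as f:
    for line in f:
        parts = line.split()
        if len(parts) == 2:
            mapping[parts[0]] = parts[1]

df = pd.read_csv('data.csv')
df['class'] = df['ndpi_proto'].map(mapping).fillna('default')
df.to_csv('total_class.csv', index=False)
```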
27 | Please read the following paper to learn more: https://doi.org/10.1080/09540091.2020.1870437
28 |
29 | ## To cite the paper and code:
30 |
31 | ```bibtex
32 | @article{mondal2021dynamic,
33 |   title={A dynamic network traffic classifier using supervised ML for a Docker-based SDN network},
34 |   author={Mondal, Pritom Kumar and Aguirre Sanchez, Lizeth P and Benedetto, Emmanuele and Shen, Yao and Guo, Minyi},
35 |   journal={Connection Science},
36 |   pages={1--26},
37 |   year={2021},
38 |   publisher={Taylor \& Francis}
39 | }
40 | ```
41 | 
--------------------------------------------------------------------------------
/test.txt:
--------------------------------------------------------------------------------
1 | DHCPV6 critical
2 | DHCP critical
3 | ApplePush p2p
4 | Google transaction
5 | HTTP_Proxy transaction
6 | HTTP.GenericProtocol transaction
7 | HTTP.Microsoft transaction
8 | HTTP.Office365 transaction
9 | HTTP.WindowsUpdate transaction
10 | HTTP.UbuntuONE transaction
11 | HTTP.MSN transaction
12 | HTTP.QQ transaction
13 | HTTP.Amazon transaction
14 | Amazon transaction
15 | Dropbox transaction
16 | HTTP transaction
17 | HTTP.Cloudflare transaction
18 | HTTP.Sina(Weibo) transaction
19 | HTTP.Google transaction
20 | HTTP.ApplePush transaction
21 | HTTP.GoogleServices transaction
22 | HTTP.Facebook transaction
23 | HTTP.MS_OneDrive transaction
24 | HTTP.Apple transaction
25 | HTTP.Skype transaction
26 | HTTP.AppleStore transaction
27 | HTTP.Github transaction
28 | NETBIOS critical
29 | FACEBOOK bulk
30 | QQ bulk
31 | TLS.WeChat bulk
32 | Viber bulk
33 | HTTP.WeChat bulk
34 | TLS.WeChat bulk
35 | TLS.Twitter bulk
36 | Facebook.chat bulk
37 | Wechat bulk
38 | Skype bulk
39 | SMTPS bulk
40 | SMTP bulk
41 | IMAPS.Tor bulk
42 | IMAPS bulk
43 | IMAP bulk
44 | IMAPS.QQ bulk
45 | IMAPS.Yahoo bulk
46 | TLS transaction
47 | TLS.MSN transaction
48 | TLS.WindowsUpdate transaction
49 | TLS.Dropbox transaction
50 | TLS.Facebook transaction
51 | TLS.TLS_No_Cert transaction
52 | TLS.Signal transaction
53 | TLS.Github transaction
54 | TLS.Microsoft transaction
55 | TLS_No_Cert transaction
56 | TLS.GenericProtocol transaction
57 | TLS.Google transaction
58 | TLS.Cloudflare transaction
59 | TLS.Office365 transaction
60 | TLS.Amazon transaction
61 | TLS.LinkedIn transaction
62 | TLS.QQ transaction
63 | TLS.MS_OneDrive transaction
64 | TLS.Skype transaction
65 | TLS.Sina(Weibo) transaction
66 | TLS.GoogleServices transaction
67 | TLS.Spotify transaction
68 | TLS.Yahoo transaction
69 | TLS.UbuntuONE transaction
70 | TLS.Steam transaction
71 | TLS.Apple transaction
72 | TLS.AppleiCloud transaction
73 | TLS.Wikipedia transaction
74 | TLS.Starcraft transaction
75 | TLS.GoogleDocs transaction
76 | TLS.ApplePush transaction
77 | TLS.AppleiTunes transaction
78 | TLS.AppleStore transaction
79 | TLS.IMAPS transaction
80 | QUIC.Google transaction
81 | QUIC.GoogleServices transaction
82 | QUIC transaction
83 | DNS.GoogleServices critical
84 | DNS.Google critical
85 | DNS.Office365 critical
86 | DNS.MS_OneDrive critical
87 | DNS.GenericProtocol critical
88 | DNS.LinkedIn critical
89 | DNS.Microsoft critical
90 | DNS.WindowsUpdate critical
91 | DNS.QQ critical
92 | DNS.AmazonVideo critical
93 | DNS.UbuntuONE critical
94 | DNS.Facebook critical
95 | DNS.Dropbox critical
96 | DNS.PlayStore critical
97 | DNS.MSN critical
98 | DNS.Skype critical
99 | DNS.Amazon critical
100 | DNS critical
101 | DNS.Wikipedia critical
102 | DNS.Instagram critical
103 | DNS.YouTube critical
104 | DNS.Sina(Weibo) critical
105 | DNS.Yahoo critical
106 | DNS.Steam critical
107 | DNS.ApplePush critical
108 | DNS.Xbox critical
109 | DNS.Twitter critical
110 | DNS.GoogleMaps critical
111 | DNS.AppleiTunes critical
112 | DNS.Github critical
113 | DNS.Spotify critical
114 | DNS.GoogleDocs critical
115 | DNS.Apple critical
116 | DNS.GoogleDrive critical
117 | DNS.GMail critical
118 | DNS.GooglePlus critical
119 | DNS.AppleiCloud critical
120 | DNS.NetFlix critical
121 | MDNS critical
122 | LLMNR critical
123 | UPnP OAM
124 | SSDP OAM
125 | BJNP OAM
126 | Starcraft p2p
127 | Playstation p2p
128 | steam p2p
129 | TLS.Xbox p2p
130 | Xbox p2p
131 | Steam p2p
132 | BitTorrent p2p
133 | eDonkey p2p
134 | Pando_Media_Booster p2p
135 | Thunder p2p
136 | TLS.AmazonVideo video
137 | HTTP.AmazonVideo video
138 | TLS.YouTube video
139 | TeamViewer transaction
140 | RDP transaction
141 | RTP control
142 | BGP control
143 | ICMP control
144 | NTP control
145 | ICMPv6 control
146 | IGMP control
147 | RX signaling
148 | STUN signaling
149 | H323 signaling
150 | STUN.SkypeCall signaling
151 | STUN.Signal signaling
152 | STUN.WhatsAppVoice signaling
153 | STUN.GoogleHangoutDuo signaling
154 | Skype.SkypeCall VoIP
155 | WhatsAppVoice VoIP
156 | FTP_CONTROL bulk
157 | SSH.Google OAM
158 | SSH OAM
159 | TELNET OAM
160 | CHECKMK OAM
161 | SNMP OAM
162 | Whois-DAS bulk
163 | MsSQL-TDS bulk
164 | PostgreSQL bulk
165 | IPsec OAM
166 | Citrix default
167 | Redis default
168 | Targus Dataspeed default
169 | Tor default
170 | SOCKS default
171 | Mining default
172 | SOCKS default
173 | COAP default
174 | MQTT default
175 | CiscoVPN default
176 | Unknown default
177 |
--------------------------------------------------------------------------------
/KNN/predict.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 | # Helper libraries
4 | import numpy as np
5 | import pickle
6 | from sklearn.metrics import accuracy_score, confusion_matrix
7 | from sklearn.utils.multiclass import unique_labels
8 | import seaborn as sns
9 | from dictionary import name_convert
10 |
11 |
12 | def cm_analysis(y_true, y_pred, labels, ymap=None, title="", figsize=(10, 10)):
13 | """
14 | Generate matrix plot of confusion matrix with pretty annotations.
15 | The plot image is saved to disk.
16 | args:
17 | y_true: true label of the data, with shape (nsamples,)
18 | y_pred: prediction of the data, with shape (nsamples,)
19 |       (the figure is saved as cm.png in the working directory)
20 | labels: string array, name the order of class labels in the confusion matrix.
21 | use `clf.classes_` if using scikit-learn models.
22 | with shape (nclass,).
23 | ymap: dict: any -> string, length == nclass.
24 | if not None, map the labels & ys to more understandable strings.
25 | Caution: original y_true, y_pred and labels must align.
26 | figsize: the size of the figure plotted.
27 | """
28 | if ymap is not None:
29 | y_pred = [ymap[yi] for yi in y_pred]
30 | y_true = [ymap[yi] for yi in y_true]
31 | labels = [ymap[yi] for yi in labels]
32 | l = []
33 | for i in labels:
34 | l.append(labels.index(i))
35 | cm = confusion_matrix(y_true, y_pred, labels=l)
36 | cm_sum = np.sum(cm, axis=1, keepdims=True)
37 | cm_perc = cm / cm_sum.astype(float) * 100
38 | annot = np.empty_like(cm).astype(str)
39 | nrows, ncols = cm.shape
40 | for i in range(nrows):
41 | for j in range(ncols):
42 | c = cm[i, j]
43 | p = cm_perc[i, j]
44 | if i == j:
45 | s = cm_sum[i]
46 | annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
47 | elif c == 0:
48 | annot[i, j] = ''
49 | else:
50 | annot[i, j] = '%.1f%%\n%d' % (p, c)
51 | cm = pd.DataFrame(cm, index=labels, columns=labels)
52 | cm.index.name = 'Actual'
53 | cm.columns.name = 'Predicted'
54 | fig, ax = plt.subplots(figsize=figsize)
55 | ax.set(title=title)
56 | sns.heatmap(cm, annot=annot, fmt='', ax=ax)
57 | plt.savefig("cm.png")
58 | plt.show()
59 |
60 |
61 | def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, cmap=plt.cm.Blues):
62 | """
63 | This function prints and plots the confusion matrix.
64 | Normalization can be applied by setting `normalize=True`.
65 | """
66 | if not title:
67 | if normalize:
68 | title = 'Normalized confusion matrix'
69 | else:
70 | title = 'Confusion matrix, without normalization'
71 |
72 | # Compute confusion matrix
73 | cm = confusion_matrix(y_true, y_pred)
74 | # Only use the labels that appear in the data
75 | classes = classes[unique_labels(y_true, y_pred)]
76 | if normalize:
77 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
78 | print("Normalized confusion matrix")
79 | else:
80 | print('Confusion matrix, without normalization')
81 |
82 | print(cm)
83 |
84 | fig, ax = plt.subplots()
85 | im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
86 | ax.figure.colorbar(im, ax=ax)
87 | # We want to show all ticks...
88 | ax.set(xticks=np.arange(cm.shape[1]),
89 | yticks=np.arange(cm.shape[0]),
90 | # ... and label them with the respective list entries
91 | xticklabels=classes, yticklabels=classes,
92 | title=title,
93 | ylabel='True label',
94 | xlabel='Predicted label')
95 |
96 | # Rotate the tick labels and set their alignment.
97 | plt.setp(ax.get_xticklabels(), rotation=90, ha="right",
98 | rotation_mode="anchor")
99 |
100 | # Loop over data dimensions and create text annotations.
101 | fmt = '.2f' if normalize else 'd'
102 | thresh = cm.max() / 2.
103 | for i in range(cm.shape[0]):
104 | for j in range(cm.shape[1]):
105 | ax.text(j, i, format(cm[i, j], fmt),
106 | ha="center", va="center",
107 | color="white" if cm[i, j] > thresh else "black")
108 | fig.tight_layout()
109 | return ax
110 |
111 |
112 | file_dir = "D:\\SDN Project\\Data\\Processed\\"
113 |
114 | names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
115 | 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto']
116 | df = pd.read_csv(file_dir + '26sep19.csv', names=names)
117 | X = np.asarray(
118 | df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
119 | 1:])
120 | y = df['ndpi_proto'][1:]
121 |
122 | # read the dictionary
123 |
124 | read_file = open("knn.txt", 'r')
125 | dict_class = []
126 | for i in read_file.readlines():
127 | name, index = str(i).split(" ")[0], str(i).split(" ")[1]
128 | if name not in dict_class:
129 | dict_class.append(name)
130 | print(dict_class)
131 | # Recreate the exact same model
132 | count = 1
133 | loaded_model = pickle.load(open('knn_3_model.sav', 'rb'))
134 | result = loaded_model.predict(X)
135 | original = []
136 | for i in result:
137 | print(f'{count} {i} {dict_class[i]} {name_convert(y[count])}')
138 | if name_convert(y[count]) not in dict_class:
139 |         original.append(7)  # class name unseen in training; fall back to a fixed bucket index
140 | else:
141 | original.append(dict_class.index(name_convert(y[count])))
142 | count += 1
143 | print(accuracy_score(original, result))
144 | print(confusion_matrix(original, result))
145 | cm_analysis(original, result, dict_class, ymap=None, title="Confusion Matrix", figsize=(10, 10))
146 | plot_confusion_matrix(original, result, classes= np.asarray(dict_class), normalize=False, title="Confusion Matrix",cmap=plt.cm.Reds)
147 | plt.savefig("cm_1.png")  # save before show(); the figure may be blank afterwards
148 | plt.show()
--------------------------------------------------------------------------------
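For per-class precision and recall on the same predictions, a compact sketch (assuming `original`, `result`, and `dict_class` as computed in this script; zero_division=0 needs scikit-learn >= 0.22):

    from sklearn.metrics import classification_report

    print(classification_report(original, result,
                                labels=list(range(len(dict_class))),
                                target_names=dict_class, zero_division=0))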
/RandomForest/predict.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 | # Helper libraries
4 | import numpy as np
5 | import pickle
6 | from sklearn.metrics import accuracy_score, confusion_matrix
7 | from sklearn.utils.multiclass import unique_labels
8 | import seaborn as sns
9 | from dictionary import name_convert
10 |
11 |
12 | def cm_analysis(y_true, y_pred, labels, ymap=None, title="", figsize=(10, 10)):
13 | """
14 | Generate matrix plot of confusion matrix with pretty annotations.
15 | The plot image is saved to disk.
16 | args:
17 | y_true: true label of the data, with shape (nsamples,)
18 | y_pred: prediction of the data, with shape (nsamples,)
19 |       (the figure is saved as cm.png in the working directory)
20 | labels: string array, name the order of class labels in the confusion matrix.
21 | use `clf.classes_` if using scikit-learn models.
22 | with shape (nclass,).
23 | ymap: dict: any -> string, length == nclass.
24 | if not None, map the labels & ys to more understandable strings.
25 | Caution: original y_true, y_pred and labels must align.
26 | figsize: the size of the figure plotted.
27 | """
28 | if ymap is not None:
29 | y_pred = [ymap[yi] for yi in y_pred]
30 | y_true = [ymap[yi] for yi in y_true]
31 | labels = [ymap[yi] for yi in labels]
32 | l = []
33 | for i in labels:
34 | l.append(labels.index(i))
35 | cm = confusion_matrix(y_true, y_pred, labels=l)
36 | cm_sum = np.sum(cm, axis=1, keepdims=True)
37 | cm_perc = cm / cm_sum.astype(float) * 100
38 | annot = np.empty_like(cm).astype(str)
39 | nrows, ncols = cm.shape
40 | for i in range(nrows):
41 | for j in range(ncols):
42 | c = cm[i, j]
43 | p = cm_perc[i, j]
44 | if i == j:
45 | s = cm_sum[i]
46 | annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
47 | elif c == 0:
48 | annot[i, j] = ''
49 | else:
50 | annot[i, j] = '%.1f%%\n%d' % (p, c)
51 | cm = pd.DataFrame(cm, index=labels, columns=labels)
52 | cm.index.name = 'Actual'
53 | cm.columns.name = 'Predicted'
54 | fig, ax = plt.subplots(figsize=figsize)
55 | ax.set(title=title)
56 | sns.heatmap(cm, annot=annot, fmt='', ax=ax)
57 | plt.savefig("cm.png")
58 | plt.show()
59 |
60 |
61 | def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, cmap=plt.cm.Blues):
62 | """
63 | This function prints and plots the confusion matrix.
64 | Normalization can be applied by setting `normalize=True`.
65 | """
66 | if not title:
67 | if normalize:
68 | title = 'Normalized confusion matrix'
69 | else:
70 | title = 'Confusion matrix, without normalization'
71 |
72 | # Compute confusion matrix
73 | cm = confusion_matrix(y_true, y_pred)
74 | # Only use the labels that appear in the data
75 | classes = classes[unique_labels(y_true, y_pred)]
76 | if normalize:
77 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
78 | print("Normalized confusion matrix")
79 | else:
80 | print('Confusion matrix, without normalization')
81 |
82 | print(cm)
83 |
84 | fig, ax = plt.subplots()
85 | im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
86 | ax.figure.colorbar(im, ax=ax)
87 | # We want to show all ticks...
88 | ax.set(xticks=np.arange(cm.shape[1]),
89 | yticks=np.arange(cm.shape[0]),
90 | # ... and label them with the respective list entries
91 | xticklabels=classes, yticklabels=classes,
92 | title=title,
93 | ylabel='True label',
94 | xlabel='Predicted label')
95 |
96 | # Rotate the tick labels and set their alignment.
97 | plt.setp(ax.get_xticklabels(), rotation=90, ha="right",
98 | rotation_mode="anchor")
99 |
100 | # Loop over data dimensions and create text annotations.
101 | fmt = '.2f' if normalize else 'd'
102 | thresh = cm.max() / 2.
103 | for i in range(cm.shape[0]):
104 | for j in range(cm.shape[1]):
105 | ax.text(j, i, format(cm[i, j], fmt),
106 | ha="center", va="center",
107 | color="white" if cm[i, j] > thresh else "black")
108 | fig.tight_layout()
109 | return ax
110 |
111 |
112 | file_dir = "D:\\SDN Project\\Data\\Processed\\"
113 |
114 | names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
115 | 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto']
116 | df = pd.read_csv(file_dir + '26sep19.csv', names=names)
117 | X = np.asarray(
118 | df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
119 | 1:])
120 | y = df['ndpi_proto'][1:]
121 |
122 | # read the dictionary
123 |
124 | read_file = open("randomforest.txt", 'r')
125 | dict_class = []
126 | for i in read_file.readlines():
127 | name, index = str(i).split(" ")[0], str(i).split(" ")[1]
128 | if name not in dict_class:
129 | dict_class.append(name)
130 | # Recreate the exact same model
131 | count = 1
132 | loaded_model = pickle.load(open('randomforest_model.sav', 'rb'))
133 | result = loaded_model.predict(X)
134 | original = []
135 | for i in result:
136 | print(f'{count} {i} {dict_class[i]} {name_convert(y[count])}')
137 | if name_convert(y[count]) not in dict_class:
138 |         original.append(7)  # class name unseen in training; fall back to a fixed bucket index
139 | else:
140 | original.append(dict_class.index(name_convert(y[count])))
141 | count += 1
142 | print(accuracy_score(original, result))
143 | print(confusion_matrix(original, result))
144 | cm_analysis(original, result, dict_class, ymap=None, title="Confusion Matrix", figsize=(10, 10))
145 | plot_confusion_matrix(original, result, classes= np.asarray(dict_class), normalize=False, title="Confusion Matrix",cmap=plt.cm.Reds)
146 | plt.savefig("cm_1.png")  # save before show(); the figure may be blank afterwards
147 | plt.show()
--------------------------------------------------------------------------------
/DecisionTree/predict.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 | # Helper libraries
4 | import numpy as np
5 | import pickle
6 | from sklearn.metrics import accuracy_score, confusion_matrix
7 | from sklearn.utils.multiclass import unique_labels
8 | import seaborn as sns
9 | from dictionary import name_convert
10 |
11 |
12 | def cm_analysis(y_true, y_pred, labels, ymap=None, title="", figsize=(10, 10)):
13 | """
14 | Generate matrix plot of confusion matrix with pretty annotations.
15 | The plot image is saved to disk.
16 | args:
17 | y_true: true label of the data, with shape (nsamples,)
18 | y_pred: prediction of the data, with shape (nsamples,)
19 |       (the figure is saved as cm.png in the working directory)
20 | labels: string array, name the order of class labels in the confusion matrix.
21 | use `clf.classes_` if using scikit-learn models.
22 | with shape (nclass,).
23 | ymap: dict: any -> string, length == nclass.
24 | if not None, map the labels & ys to more understandable strings.
25 | Caution: original y_true, y_pred and labels must align.
26 | figsize: the size of the figure plotted.
27 | """
28 | if ymap is not None:
29 | y_pred = [ymap[yi] for yi in y_pred]
30 | y_true = [ymap[yi] for yi in y_true]
31 | labels = [ymap[yi] for yi in labels]
32 | l = []
33 | for i in labels:
34 | l.append(labels.index(i))
35 | cm = confusion_matrix(y_true, y_pred, labels=l)
36 | cm_sum = np.sum(cm, axis=1, keepdims=True)
37 | cm_perc = cm / cm_sum.astype(float) * 100
38 | annot = np.empty_like(cm).astype(str)
39 | nrows, ncols = cm.shape
40 | for i in range(nrows):
41 | for j in range(ncols):
42 | c = cm[i, j]
43 | p = cm_perc[i, j]
44 | if i == j:
45 | s = cm_sum[i]
46 | annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
47 | elif c == 0:
48 | annot[i, j] = ''
49 | else:
50 | annot[i, j] = '%.1f%%\n%d' % (p, c)
51 | cm = pd.DataFrame(cm, index=labels, columns=labels)
52 | cm.index.name = 'Actual'
53 | cm.columns.name = 'Predicted'
54 | fig, ax = plt.subplots(figsize=figsize)
55 | ax.set(title=title)
56 | sns.heatmap(cm, annot=annot, fmt='', ax=ax)
57 | plt.savefig("cm.png")
58 | plt.show()
59 |
60 |
61 | def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, cmap=plt.cm.Blues):
62 | """
63 | This function prints and plots the confusion matrix.
64 | Normalization can be applied by setting `normalize=True`.
65 | """
66 | if not title:
67 | if normalize:
68 | title = 'Normalized confusion matrix'
69 | else:
70 | title = 'Confusion matrix, without normalization'
71 |
72 | # Compute confusion matrix
73 | cm = confusion_matrix(y_true, y_pred)
74 | # Only use the labels that appear in the data
75 | classes = classes[unique_labels(y_true, y_pred)]
76 | if normalize:
77 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
78 | print("Normalized confusion matrix")
79 | else:
80 | print('Confusion matrix, without normalization')
81 |
82 | print(cm)
83 |
84 | fig, ax = plt.subplots()
85 | im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
86 | ax.figure.colorbar(im, ax=ax)
87 | # We want to show all ticks...
88 | ax.set(xticks=np.arange(cm.shape[1]),
89 | yticks=np.arange(cm.shape[0]),
90 | # ... and label them with the respective list entries
91 | xticklabels=classes, yticklabels=classes,
92 | title=title,
93 | ylabel='True label',
94 | xlabel='Predicted label')
95 |
96 | # Rotate the tick labels and set their alignment.
97 | plt.setp(ax.get_xticklabels(), rotation=90, ha="right",
98 | rotation_mode="anchor")
99 |
100 | # Loop over data dimensions and create text annotations.
101 | fmt = '.2f' if normalize else 'd'
102 | thresh = cm.max() / 2.
103 | for i in range(cm.shape[0]):
104 | for j in range(cm.shape[1]):
105 | ax.text(j, i, format(cm[i, j], fmt),
106 | ha="center", va="center",
107 | color="white" if cm[i, j] > thresh else "black")
108 | fig.tight_layout()
109 | return ax
110 |
111 |
112 | file_dir = "D:\\SDN Project\\Data\\Processed\\"
113 |
114 | names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
115 | 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto']
116 | df = pd.read_csv(file_dir + '26sep19.csv', names=names)
117 | X = np.asarray(
118 | df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
119 | 1:])
120 | y = df['ndpi_proto'][1:]
121 |
122 | # read the dictionary
123 |
124 | read_file = open("decisiontree.txt", 'r')
125 | dict_class = []
126 | for i in read_file.readlines():
127 | name, index = str(i).split(" ")[0], str(i).split(" ")[1]
128 | if name not in dict_class:
129 | dict_class.append(name)
130 | print(dict_class)
131 | # Recreate the exact same model
132 | count = 1
133 | loaded_model = pickle.load(open('decisiontree_model.sav', 'rb'))
134 | result = loaded_model.predict(X)
135 | original = []
136 | for i in result:
137 | print(f'{count} {i} {dict_class[i]} {name_convert(y[count])}')
138 | if name_convert(y[count]) not in dict_class:
139 |         original.append(7)  # class name unseen in training; fall back to a fixed bucket index
140 | else:
141 | original.append(dict_class.index(name_convert(y[count])))
142 | count += 1
143 | print(accuracy_score(original, result))
144 | print(confusion_matrix(original, result))
145 | cm_analysis(original, result, dict_class, ymap=None, title="Confusion Matrix", figsize=(10, 10))
146 | plot_confusion_matrix(original, result, classes= np.asarray(dict_class), normalize=False, title="Confusion Matrix",cmap=plt.cm.Reds)
147 | plt.savefig("cm_1.png")  # save before show(); the figure may be blank afterwards
148 | plt.show()
149 |
--------------------------------------------------------------------------------
/DNN/predict.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import pandas as pd
3 | import keras
4 | # Helper libraries
5 | import numpy as np
6 | import pickle
7 | from sklearn.metrics import accuracy_score, confusion_matrix
8 | from sklearn.utils.multiclass import unique_labels
9 | import seaborn as sns
10 | from dictionary import name_convert
11 | from keras.models import load_model
12 |
13 |
14 | def accu(y_true=[], y_pred=[]):
15 | good = 0
16 | for i in range(0, len(y_true)):
17 | if y_true[i]== np.argmax(y_pred[i]):
18 | good+=1
19 | return good/len(y_true)
20 |
21 | def cm_analysis(y_true, y_pred, labels, ymap=None, title="", figsize=(10, 10)):
22 | """
23 | Generate matrix plot of confusion matrix with pretty annotations.
24 | The plot image is saved to disk.
25 | args:
26 | y_true: true label of the data, with shape (nsamples,)
27 | y_pred: prediction of the data, with shape (nsamples,)
28 |       (the figure is saved as cm.png in the working directory)
29 | labels: string array, name the order of class labels in the confusion matrix.
30 | use `clf.classes_` if using scikit-learn models.
31 | with shape (nclass,).
32 | ymap: dict: any -> string, length == nclass.
33 | if not None, map the labels & ys to more understandable strings.
34 | Caution: original y_true, y_pred and labels must align.
35 | figsize: the size of the figure plotted.
36 | """
37 | if ymap is not None:
38 | y_pred = [ymap[yi] for yi in y_pred]
39 | y_true = [ymap[yi] for yi in y_true]
40 | labels = [ymap[yi] for yi in labels]
41 | l = []
42 | for i in labels:
43 | l.append(labels.index(i))
44 | cm = confusion_matrix(y_true, y_pred, labels=l)
45 | cm_sum = np.sum(cm, axis=1, keepdims=True)
46 | cm_perc = cm / cm_sum.astype(float) * 100
47 | annot = np.empty_like(cm).astype(str)
48 | nrows, ncols = cm.shape
49 | for i in range(nrows):
50 | for j in range(ncols):
51 | c = cm[i, j]
52 | p = cm_perc[i, j]
53 | if i == j:
54 | s = cm_sum[i]
55 | annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
56 | elif c == 0:
57 | annot[i, j] = ''
58 | else:
59 | annot[i, j] = '%.1f%%\n%d' % (p, c)
60 | cm = pd.DataFrame(cm, index=labels, columns=labels)
61 | cm.index.name = 'Actual'
62 | cm.columns.name = 'Predicted'
63 | fig, ax = plt.subplots(figsize=figsize)
64 | ax.set(title=title)
65 | sns.heatmap(cm, annot=annot, fmt='', ax=ax)
66 | plt.savefig("cm.png")
67 | plt.show()
68 |
69 |
70 | def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, cmap=plt.cm.Blues):
71 | """
72 | This function prints and plots the confusion matrix.
73 | Normalization can be applied by setting `normalize=True`.
74 | """
75 | if not title:
76 | if normalize:
77 | title = 'Normalized confusion matrix'
78 | else:
79 | title = 'Confusion matrix, without normalization'
80 |
81 | # Compute confusion matrix
82 | cm = confusion_matrix(y_true, y_pred)
83 | # Only use the labels that appear in the data
84 | classes = classes[unique_labels(y_true, y_pred)]
85 | if normalize:
86 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
87 | print("Normalized confusion matrix")
88 | else:
89 | print('Confusion matrix, without normalization')
90 |
91 | print(cm)
92 |
93 | fig, ax = plt.subplots()
94 | im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
95 | ax.figure.colorbar(im, ax=ax)
96 | # We want to show all ticks...
97 | ax.set(xticks=np.arange(cm.shape[1]),
98 | yticks=np.arange(cm.shape[0]),
99 | # ... and label them with the respective list entries
100 | xticklabels=classes, yticklabels=classes,
101 | title=title,
102 | ylabel='True label',
103 | xlabel='Predicted label')
104 |
105 | # Rotate the tick labels and set their alignment.
106 | plt.setp(ax.get_xticklabels(), rotation=90, ha="right",
107 | rotation_mode="anchor")
108 |
109 | # Loop over data dimensions and create text annotations.
110 | fmt = '.2f' if normalize else 'd'
111 | thresh = cm.max() / 2.
112 | for i in range(cm.shape[0]):
113 | for j in range(cm.shape[1]):
114 | ax.text(j, i, format(cm[i, j], fmt),
115 | ha="center", va="center",
116 | color="white" if cm[i, j] > thresh else "black")
117 | fig.tight_layout()
118 | return ax
119 |
120 |
121 | file_dir = "D:\\SDN Project\\Data\\Processed\\"
122 |
123 | names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
124 | 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto']
125 | df = pd.read_csv(file_dir + '26sep19.csv', names=names)
126 | X = np.asarray(
127 | df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
128 | 1:])
129 | y = df['ndpi_proto'][1:]
130 |
131 | # read the dictionary
132 |
133 | read_file = open("dnn.txt", 'r')
134 | dict_class = []
135 | for i in read_file.readlines():
136 | name, index = str(i).split(" ")[0], str(i).split(" ")[1]
137 | if name not in dict_class:
138 | dict_class.append(name)
139 | print(dict_class)
140 | # Recreate the exact same model
141 | count = 1
142 | loaded_model = load_model('dnn_model1.sav', compile=False)  # inference only; avoids re-registering the custom f1_m/precision_m/recall_m metrics
143 | #loaded_model = pickle.load(open('dnn_model.sav', 'rb'))
144 | result = loaded_model.predict(X)
145 |
146 | original = []
147 | y_pred = []
148 | for i in result:
149 | #print(np.argmax(i))
150 | print(f'{count} {np.argmax(i)} {dict_class[np.argmax(i)]} {name_convert(y[count])}')
151 | y_pred.append(np.argmax(i))
152 | if name_convert(y[count]) not in dict_class:
153 |         original.append(7)  # class name unseen in training; fall back to a fixed bucket index
154 | else:
155 | original.append(dict_class.index(name_convert(y[count])))
156 | count += 1
157 |
158 |
159 | #print(f"accuracy: {accu(original,result)}")
160 | #print(keras.metrics.sparse_categorical_accuracy(original, result))
161 | #print(accuracy_score(y_true, y_pred))
162 | print(confusion_matrix(original, y_pred))
163 | cm_analysis(original, y_pred, dict_class, ymap=None, title="Confusion Matrix", figsize=(10, 10))
164 | plot_confusion_matrix(original, y_pred, classes=np.asarray(dict_class), normalize=False, title="Confusion Matrix",cmap=plt.cm.Reds)
165 | plt.savefig("cm_1.png")  # save before show(); the figure may be blank afterwards
166 | plt.show()
167 |
--------------------------------------------------------------------------------