├── .gitignore
├── Dataset
│   ├── .gitattributes
│   └── How to get the data.txt
├── DNN
│   ├── cm.png
│   ├── model_1.h5
│   ├── dnn_model.sav
│   ├── dnn_model1.sav
│   ├── dnn.py
│   └── predict.py
├── KNN
│   ├── cm.png
│   ├── knn_3_model.sav
│   ├── knn.py
│   └── predict.py
├── DecisionTree
│   ├── cm.png
│   ├── 7_cm_dt.png
│   ├── decisiontree_model.sav
│   ├── decisiontree.py
│   └── predict.py
├── RandomForest
│   ├── cm.png
│   ├── randomforest_model.sav
│   ├── randomforest.py
│   └── predict.py
├── dictionary.py
├── README.md
└── test.txt

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
network_traffic.csv

--------------------------------------------------------------------------------
/Dataset/.gitattributes:
--------------------------------------------------------------------------------
*.csv filter=lfs diff=lfs merge=lfs -text

--------------------------------------------------------------------------------
/DNN/cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DNN/cm.png

--------------------------------------------------------------------------------
/KNN/cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/KNN/cm.png

--------------------------------------------------------------------------------
/DNN/model_1.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DNN/model_1.h5

--------------------------------------------------------------------------------
/DNN/dnn_model.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DNN/dnn_model.sav

--------------------------------------------------------------------------------
/DNN/dnn_model1.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DNN/dnn_model1.sav

--------------------------------------------------------------------------------
/DecisionTree/cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DecisionTree/cm.png

--------------------------------------------------------------------------------
/KNN/knn_3_model.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/KNN/knn_3_model.sav

--------------------------------------------------------------------------------
/RandomForest/cm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/RandomForest/cm.png

--------------------------------------------------------------------------------
/DecisionTree/7_cm_dt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DecisionTree/7_cm_dt.png

--------------------------------------------------------------------------------
/DecisionTree/decisiontree_model.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/DecisionTree/decisiontree_model.sav

--------------------------------------------------------------------------------
/RandomForest/randomforest_model.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pritom007/Network-Traffic-Classification/HEAD/RandomForest/randomforest_model.sav

--------------------------------------------------------------------------------
/dictionary.py:
--------------------------------------------------------------------------------
def name_convert(x):
    # Map an nDPI application name to one of the traffic classes listed in
    # test.txt; unknown application names fall back to the "default" class.
    # The raw string keeps the backslash in the Windows-style relative path
    # literal, and rsplit keeps application names that contain spaces intact.
    dict_c = {}
    with open(r'..\test.txt', 'r') as file:
        for i in file.readlines():
            i = i.strip()
            if not i:
                continue
            name, cl = i.rsplit(" ", 1)
            dict_c[name] = cl
    try:
        return dict_c[x]
    except KeyError:
        return "default"
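A quick way to sanity-check `name_convert` is to look it up against a few entries straight from test.txt. The snippet below is only an illustrative sketch: it assumes the script runs from one of the model folders (so the relative path inside `name_convert` resolves) and that dictionary.py is importable, e.g. because the repo root is on sys.path.

```python
from dictionary import name_convert

# Application names taken from test.txt; expected classes shown inline.
print(name_convert("DNS"))         # critical
print(name_convert("BitTorrent"))  # p2p
print(name_convert("NoSuchApp"))   # default (unknown names fall back)
```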
--------------------------------------------------------------------------------
/Dataset/How to get the data.txt:
--------------------------------------------------------------------------------
Since .pcap files are large, we processed the data into CSV format, and even then the file is about 150 MB. Please email me at:
pritom007@live.com
or
pritom@sjtu.edu.com, and include a short note on why you need the data.

This data is only available for research purposes, not for commercial use.

Please cite my work if you use this dataset for research.

@article{mondal2021dynamic,
  title={A dynamic network traffic classifier using supervised ML for a Docker-based SDN network},
  author={Mondal, Pritom Kumar and Aguirre Sanchez, Lizeth P and Benedetto, Emmanuele and Shen, Yao and Guo, Minyi},
  journal={Connection Science},
  pages={1--26},
  year={2021},
  publisher={Taylor \& Francis}
}

Read more: https://doi.org/10.1080/09540091.2020.1870437

--------------------------------------------------------------------------------
/DecisionTree/decisiontree.py:
--------------------------------------------------------------------------------
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.tree import DecisionTreeClassifier
import pickle

file_dir = "..\\Data\\"

names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
         'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto', 'class']
print("Loading Dataset total_class.csv")
df = pd.read_csv(file_dir + 'total_class.csv', names=names)

# The CSV's header row is read as data, so [1:] skips it everywhere below.
X = np.asarray(
    df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
        1:])
y = []

# Encode each class name as the index of its first appearance, and persist the
# name-to-index mapping to decisiontree.txt so predict.py can decode predictions.
my_tags = []
classes = open("decisiontree.txt", "w+")
for i in df['class'][1:]:
    if i not in my_tags:
        my_tags.append(i)
for i in df['class'][1:]:
    classes.write(i + " " + str(my_tags.index(i)) + "\n")
    y.append(my_tags.index(i))
classes.close()
y = np.asarray(y)
print("Splitting dataset")
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)

clf = DecisionTreeClassifier(random_state=0)

print("Training Started")
clf.fit(x_train, y_train)

print("Testing the classifier")
y_pred = clf.predict(x_test)

print("Saving the model")
filename = 'decisiontree_model.sav'
pickle.dump(clf, open(filename, 'wb'))

print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=my_tags, labels=range(len(my_tags))))

--------------------------------------------------------------------------------
/RandomForest/randomforest.py:
--------------------------------------------------------------------------------
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pickle

file_dir = "D:\\SDN Project\\Data\\"

names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
         'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto', 'class']
print("Loading Dataset total_class.csv")
df = pd.read_csv(file_dir + 'total_class.csv', names=names)

X = np.asarray(
    df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
        1:])
y = []

# Same first-appearance label encoding as in decisiontree.py, persisted to
# randomforest.txt for use by predict.py.
my_tags = []
classes = open("randomforest.txt", "w+")
for i in df['class'][1:]:
    if i not in my_tags:
        my_tags.append(i)
for i in df['class'][1:]:
    classes.write(i + " " + str(my_tags.index(i)) + "\n")
    y.append(my_tags.index(i))
classes.close()
y = np.asarray(y)
print("Splitting dataset")
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.30, shuffle=True)

clf = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=1)
print("Training Started")
clf.fit(x_train, y_train)

print("Testing the classifier")
y_pred = clf.predict(x_test)

print("Saving the model")
filename = 'randomforest_model.sav'
pickle.dump(clf, open(filename, 'wb'))

print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=my_tags, labels=range(len(my_tags))))
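The first-appearance encoding loop above is duplicated across decisiontree.py, randomforest.py, knn.py, and dnn.py. As a design note, scikit-learn's LabelEncoder does the same job in two lines; the sketch below is only an illustration that assumes `df` is loaded as above. One caveat: LabelEncoder orders classes alphabetically rather than by first appearance, so the saved mapping file would have to be regenerated to match.

```python
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df['class'][1:])  # integer labels, one per flow
my_tags = list(le.classes_)            # index -> class name (alphabetical)
```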
--------------------------------------------------------------------------------
/KNN/knn.py:
--------------------------------------------------------------------------------
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import pickle

file_dir = "D:\\SDN Project\\Data\\"

names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
         'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto', 'class']
print("Loading Dataset total_class.csv")
df = pd.read_csv(file_dir + 'total_class.csv', names=names)

X = np.asarray(
    df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
        1:])
y = []

# Same first-appearance label encoding as in the other training scripts,
# persisted to knn.txt for use by predict.py.
my_tags = []
classes = open("knn.txt", "w+")
for i in df['class'][1:]:
    if i not in my_tags:
        my_tags.append(i)
for i in df['class'][1:]:
    classes.write(i + " " + str(my_tags.index(i)) + "\n")
    y.append(my_tags.index(i))
classes.close()
y = np.asarray(y)
print("Splitting dataset")
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)

# Try K = 2..14 and keep whichever model scores best on the held-out split.
best_acc = 0
for i in range(2, 15):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    print(f'K={i} accuracy {acc}')
    if acc >= best_acc:
        print("Saving the model")
        filename = 'knn_' + str(i) + '_model.sav'
        pickle.dump(knn, open(filename, 'wb'))
        best_acc = acc
        print('saved acc: ', best_acc)

# print(classification_report(y_test, y_pred, target_names=my_tags, labels=range(len(my_tags))))
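Because the loop above both selects K and saves the model using the same test split, the reported best accuracy is slightly optimistic. A more standard alternative is to pick K by cross-validation on the training data and score the winner once on the test set, as sketched below under the assumption that `x_train` and `y_train` come from the split above.

```python
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# 5-fold cross-validation over the same range of K values.
search = GridSearchCV(KNeighborsClassifier(),
                      param_grid={'n_neighbors': list(range(2, 15))},
                      cv=5, scoring='accuracy')
search.fit(x_train, y_train)
print(search.best_params_, search.best_score_)
best_knn = search.best_estimator_  # then evaluate once on x_test / y_test
```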
--------------------------------------------------------------------------------
/DNN/dnn.py:
--------------------------------------------------------------------------------
import keras
import pandas as pd
import tensorflow as tf

# Helper libraries
import numpy as np

from keras import backend as K
from matplotlib import pyplot

from sklearn.model_selection import train_test_split
import pickle


def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall


def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))


file_dir = "D:\\SDN Project\\Data\\"

# Only 12 column names are given here, so pandas uses the first CSV column
# (the flow id) as the index and aligns these names with the remaining columns.
names = ['protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
         'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto', 'class']
df = pd.read_csv(file_dir + 'total_class.csv', names=names)

X = np.asarray(
    df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
        1:])
y = []

# Same first-appearance label encoding as in the other training scripts,
# persisted to dnn.txt for use by predict.py.
my_tags = []
classes = open("dnn.txt", "w+")
for i in df['class'][1:]:
    if i not in my_tags:
        my_tags.append(i)
for i in df['class'][1:]:
    classes.write(i + " " + str(my_tags.index(i)) + "\n")
    y.append(my_tags.index(i))
classes.close()
y = np.asarray(y)
print(X.shape, y.shape)


x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)
features = len(x_train[0])

# Fully connected network: 7 input features -> 512 -> 256 -> 128 -> 10 classes,
# with L1/L2 regularization on the hidden layers.
model = keras.Sequential([
    keras.layers.Dense(features, kernel_regularizer=tf.keras.regularizers.l1(0.1)),
    keras.layers.Dense(512, activation=tf.nn.relu, kernel_regularizer=tf.keras.regularizers.l2(0.1)),
    keras.layers.Dense(256, activation=tf.nn.relu, kernel_regularizer=tf.keras.regularizers.l1(0.1)),
    keras.layers.Dense(128, activation=tf.nn.relu, kernel_regularizer=tf.keras.regularizers.l2(0.1)),
    keras.layers.Dense(10, activation=tf.nn.softmax)
])
model.compile(optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['acc', f1_m, precision_m, recall_m])

history = model.fit(x_train, y_train, batch_size=1000, epochs=400)
loss, accuracy, f1_score, precision, recall = model.evaluate(x_test, np.array(y_test), verbose=0)
# Keras's own save format is the reliable way to persist this model;
# predict.py reloads it with keras.models.load_model.
model.save("dnn_model1.sav")
y_predict = model.predict(x_test)

print("Saving the model")
filename = 'dnn_model.sav'
pickle.dump(model, open(filename, 'wb'))

for i in y_predict:
    print(np.argmax(i))
print(f'loss: {loss}, acc: {accuracy}, f1_score: {f1_score}, precision: {precision}, recall: {recall}')
print(model.summary())
print(x_test[0])
pyplot.subplot(212)
pyplot.title('Accuracy')
pyplot.plot(history.history['acc'], label='train')
pyplot.legend()
pyplot.show()

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Network-Traffic-Classification

This is a research project for classifying network traffic. We collected more than 300,000 flows from the network and used nDPI to analyze them. nDPI identified more than 100 types of applications, which we grouped into 10 classes. We then tried different ML algorithms to classify them.

Our current results:

Decision tree: 95.8% accuracy

(I have added a new file https://github.com/pritom007/Network-Traffic-Classification/blob/master/DecisionTree/DecisionTree.ipynb with clean code. You can follow this code and implement the same approach for KNN and RF.)

Random forest: 96.69% accuracy

KNN: 97.24% accuracy

PAA: 99.29% accuracy (read the paper to know more)

To get the dataset, check out the instructions in the Dataset folder.

# How Did we collect Data

We used Wireshark to collect the packets. Since we wanted lab-environment data for this project, we first redirected our lab network to one personal computer (PC) and ran Wireshark on that PC. After collecting the packets (as a .pcap file), we used nDPI to analyze them and extract flow information, and then exported that data as a CSV file. The `data.csv` contains information on all parameters. However, for our project, we only used the top 7 most important parameters as features.

GitHub has limited the download, so I am sharing a Google Drive link for downloading the raw data: https://drive.google.com/file/d/1lcQmYyZutjsW_yJoHgx3Vles8eCgwQeD/view?usp=sharing

After you download it, you have to pre-process the data; in the paper, we show in a table how we grouped the applications into 10 classes. A rough sketch of this step is shown below.
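The snippet below is only an illustrative sketch of that pre-processing, not the exact pipeline from the paper: it assumes `data.csv` uses the column names found in this repo's scripts and reuses the application-to-class mapping in `test.txt` (the grouping table in the paper is authoritative).

```python
import pandas as pd

# The seven flow features used throughout this repo.
features = ['protocol', 'src_port', 'dst_port', 'src2dst_packets',
            'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']

# test.txt lines look like "DNS.Google critical"; rsplit keeps application
# names that contain spaces intact.
with open('test.txt') as f:
    mapping = dict(line.strip().rsplit(' ', 1) for line in f if line.strip())

df = pd.read_csv('data.csv')
df['class'] = df['ndpi_proto'].map(lambda app: mapping.get(app, 'default'))
df[features + ['class']].to_csv('total_class.csv', index=False)
```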
Please read the following paper to know more: https://doi.org/10.1080/09540091.2020.1870437

## To cite the paper and code:

@article{mondal2021dynamic,<br>
title={A dynamic network traffic classifier using supervised ML for a Docker-based SDN network},<br>
author={Mondal, Pritom Kumar and Aguirre Sanchez, Lizeth P and Benedetto, Emmanuele and Shen, Yao and Guo, Minyi},<br>
journal={Connection Science},<br>
pages={1--26},<br>
year={2021},<br>
publisher={Taylor \& Francis}<br>
}
# 中文版
# 网络流量分类

这是一个对网络流量进行分类的研究项目。我们从网络中收集了超过 300000 个流,然后使用 nDPI 对流量进行分析,识别出 100 多种类型的应用程序。我们将这些应用程序分为 10 个类,之后尝试了不同的 ML 算法来对它们进行分类。

我们目前的结果:

Decision tree 95.8% 准确率

(我用干净的代码添加了一个新文件 https://github.com/pritom007/Network-Traffic-Classification/blob/master/DecisionTree/DecisionTree.ipynb。您只需按照此代码并为 KNN、RF 实现它)

Random forest 96.69% 准确率

KNN 97.24% 准确率

PAA 99.29% 准确率(阅读论文了解更多)

要获取数据集,请查看数据集文件夹中的说明。

# 如何收集数据

我们使用 Wireshark 来收集数据包。由于这个项目需要实验室环境的数据,我们首先将实验室网络重定向到一台个人计算机(PC),并在该 PC 上运行 Wireshark。收集数据包(保存为 .pcap 文件)后,我们使用 nDPI 分析数据包并提取流信息,然后将数据导出为 CSV 文件。`data.csv` 包含所有参数的信息。然而,对于我们的项目,我们只使用了前 7 个最重要的参数作为特征。

GitHub 限制了下载,所以我分享了一个用于下载原始数据的 Google Drive 链接:https://drive.google.com/file/d/1lcQmYyZutjsW_yJoHgx3Vles8eCgwQeD/view?usp=sharing

下载后,您需要对数据进行预处理;在论文中,我们用表格展示了如何将应用程序分为 10 个类。

请阅读以下论文了解更多信息:https://doi.org/10.1080/09540091.2020.1870437

## 引用论文和代码:

@article{mondal2021dynamic,<br>
title={A dynamic network traffic classifier using supervised ML for a Docker-based SDN network},<br>
author={Mondal, Pritom Kumar and Aguirre Sanchez, Lizeth P and Benedetto, Emmanuele and Shen, Yao and Guo, Minyi},<br>
journal={Connection Science},<br>
pages={1--26},<br>
year={2021},<br>
publisher={Taylor \& Francis}<br>
}

--------------------------------------------------------------------------------
/test.txt:
--------------------------------------------------------------------------------
DHCPV6 critical
DHCP critical
ApplePush p2p
Google transaction
HTTP_Proxy transaction
HTTP.GenericProtocol transaction
HTTP.Microsoft transaction
HTTP.Office365 transaction
HTTP.WindowsUpdate transaction
HTTP.UbuntuONE transaction
HTTP.MSN transaction
HTTP.QQ transaction
HTTP.Amazon transaction
Amazon transaction
Dropbox transaction
HTTP transaction
HTTP.Cloudflare transaction
HTTP.Sina(Weibo) transaction
HTTP.Google transaction
HTTP.ApplePush transaction
HTTP.GoogleServices transaction
HTTP.Facebook transaction
HTTP.MS_OneDrive transaction
HTTP.Apple transaction
HTTP.Skype transaction
HTTP.AppleStore transaction
HTTP.Github transaction
NETBIOS critical
FACEBOOK bulk
QQ bulk
TLS.WeChat bulk
Viber bulk
HTTP.WeChat bulk
TLS.WeChat bulk
TLS.Twitter bulk
Facebook.chat bulk
Wechat bulk
Skype bulk
SMTPS bulk
SMTP bulk
IMAPS.Tor bulk
IMAPS bulk
IMAP bulk
IMAPS.QQ bulk
IMAPS.Yahoo bulk
TLS transaction
TLS.MSN transaction
TLS.WindowsUpdate transaction
TLS.Dropbox transaction
TLS.Facebook transaction
TLS.TLS_No_Cert transaction
TLS.Signal transaction
TLS.Github transaction
TLS.Microsoft transaction
TLS_No_Cert transaction
TLS.GenericProtocol transaction
TLS.Google transaction
TLS.Cloudflare transaction
TLS.Office365 transaction
TLS.Amazon transaction
TLS.LinkedIn transaction
TLS.QQ transaction
TLS.MS_OneDrive transaction
TLS.Skype transaction
TLS.Sina(Weibo) transaction
TLS.GoogleServices transaction
TLS.Spotify transaction
TLS.Yahoo transaction
TLS.UbuntuONE transaction
TLS.Steam transaction
TLS.Apple transaction
TLS.AppleiCloud transaction
TLS.Wikipedia transaction
TLS.Starcraft transaction
TLS.GoogleDocs transaction
TLS.ApplePush transaction
TLS.AppleiTunes transaction
TLS.AppleStore transaction
TLS.IMAPS transaction
QUIC.Google transaction
QUIC.GoogleServices transaction
QUIC transaction
DNS.GoogleServices critical
DNS.Google critical
DNS.Office365 critical
DNS.MS_OneDrive critical
DNS.GenericProtocol critical
DNS.LinkedIn critical
DNS.Microsoft critical
DNS.WindowsUpdate critical
DNS.QQ critical
DNS.AmazonVideo critical
DNS.UbuntuONE critical
DNS.Facebook critical
DNS.Dropbox critical
DNS.PlayStore critical
DNS.MSN critical
DNS.Skype critical
DNS.Amazon critical
DNS critical
DNS.Wikipedia critical
DNS.Instagram critical
DNS.YouTube critical
DNS.Sina(Weibo) critical
DNS.Yahoo critical
DNS.Steam critical
DNS.ApplePush critical
DNS.Xbox critical
DNS.Twitter critical
DNS.GoogleMaps critical
DNS.AppleiTunes critical
DNS.Github critical
DNS.Spotify critical
DNS.GoogleDocs critical
DNS.Apple critical
DNS.GoogleDrive critical
DNS.GMail critical
DNS.GooglePlus critical
DNS.AppleiCloud critical
DNS.NetFlix critical
MDNS critical
LLMNR critical
UPnP OAM
SSDP OAM
BJNP OAM
Starcraft p2p
Playstation p2p
steam p2p
TLS.Xbox p2p
Xbox p2p
Steam p2p
BitTorrent p2p
eDonkey p2p
Pando_Media_Booster p2p
Thunder p2p
TLS.AmazonVideo video
HTTP.AmazonVideo video
TLS.YouTube video
TeamViewer transaction
RDP transaction
RTP control
BGP control
ICMP control
NTP control
ICMPv6 control
IGMP control
RX signaling
STUN signaling
H323 signaling
STUN.SkypeCall signaling
STUN.Signal signaling
STUN.WhatsAppVoice signaling
STUN.GoogleHangoutDuo signaling
Skype.SkypeCall VoIP
WhatsAppVoice VoIP
FTP_CONTROL bulk
SSH.Google OAM
SSH OAM
TELNET OAM
CHECKMK OAM
SNMP OAM
Whois-DAS bulk
MsSQL-TDS bulk
PostgreSQL bulk
IPsec OAM
Citrix default
Redis default
Targus Dataspeed default
Tor default
SOCKS default
Mining default
SOCKS default
COAP default
MQTT default
CiscoVPN default
Unknown default

--------------------------------------------------------------------------------
/KNN/predict.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import pandas as pd
# Helper libraries
import numpy as np
import pickle
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils.multiclass import unique_labels
import seaborn as sns
from dictionary import name_convert


def cm_analysis(y_true, y_pred, labels, ymap=None, title="", figsize=(10, 10)):
    """
    Generate matrix plot of confusion matrix with pretty annotations.
    The plot image is saved to disk as cm.png.
    args:
      y_true:  true label of the data, with shape (nsamples,)
      y_pred:  prediction of the data, with shape (nsamples,)
      labels:  string array, name the order of class labels in the confusion matrix.
               use `clf.classes_` if using scikit-learn models. with shape (nclass,).
      ymap:    dict: any -> string, length == nclass.
               if not None, map the labels & ys to more understandable strings.
               Caution: original y_true, y_pred and labels must align.
      figsize: the size of the figure plotted.
    """
    if ymap is not None:
        y_pred = [ymap[yi] for yi in y_pred]
        y_true = [ymap[yi] for yi in y_true]
        labels = [ymap[yi] for yi in labels]
    # The class labels are their own indices here, so this is just 0..nclass-1.
    l = list(range(len(labels)))
    cm = confusion_matrix(y_true, y_pred, labels=l)
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100
    annot = np.empty_like(cm).astype(str)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                s = cm_sum[i]
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
            elif c == 0:
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)
    cm = pd.DataFrame(cm, index=labels, columns=labels)
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    ax.set(title=title)
    sns.heatmap(cm, annot=annot, fmt='', ax=ax)
    plt.savefig("cm.png")
    plt.show()


def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=90, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax


file_dir = "D:\\SDN Project\\Data\\Processed\\"

names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
         'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto']
df = pd.read_csv(file_dir + '26sep19.csv', names=names)
X = np.asarray(
    df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
        1:])
y = df['ndpi_proto'][1:]

# Read the class-name-to-index mapping written by knn.py during training.
read_file = open("knn.txt", 'r')
dict_class = []
for i in read_file.readlines():
    name = i.split(" ")[0]
    if name not in dict_class:
        dict_class.append(name)
read_file.close()
print(dict_class)

# Reload the saved model and classify the new capture.
count = 1
loaded_model = pickle.load(open('knn_3_model.sav', 'rb'))
result = loaded_model.predict(X)
original = []
for i in result:
    print(f'{count} {i} {dict_class[i]} {name_convert(y[count])}')
    # Classes unseen at training time are lumped into index 7 (see the note below).
    if name_convert(y[count]) not in dict_class:
        original.append(7)
    else:
        original.append(dict_class.index(name_convert(y[count])))
    count += 1
print(accuracy_score(original, result))
print(confusion_matrix(original, result))
cm_analysis(original, result, dict_class, ymap=None, title="Confusion Matrix", figsize=(10, 10))
plot_confusion_matrix(original, result, classes=np.asarray(dict_class), normalize=False, title="Confusion Matrix", cmap=plt.cm.Reds)
# Save before show: show() hands the figure to the GUI, and a later savefig
# would write out a blank canvas.
plt.savefig("cm_1.png")
plt.show()
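One fragile spot in these predict scripts: flows whose class never appeared during training are hard-coded to index 7, which only means "default" if that class happened to be the eighth one encountered. The sketch below shows a safer lookup; it assumes `dict_class` was read from the mapping file as above, and `app_name` stands in for an `ndpi_proto` value from the capture.

```python
# Fall back to the "default" class's real index instead of a magic number.
fallback = dict_class.index('default') if 'default' in dict_class else 0

label = name_convert(app_name)
idx = dict_class.index(label) if label in dict_class else fallback
```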
--------------------------------------------------------------------------------
/RandomForest/predict.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import pandas as pd
# Helper libraries
import numpy as np
import pickle
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils.multiclass import unique_labels
import seaborn as sns
from dictionary import name_convert


def cm_analysis(y_true, y_pred, labels, ymap=None, title="", figsize=(10, 10)):
    """
    Generate matrix plot of confusion matrix with pretty annotations.
    The plot image is saved to disk as cm.png.
    args:
      y_true:  true label of the data, with shape (nsamples,)
      y_pred:  prediction of the data, with shape (nsamples,)
      labels:  string array, name the order of class labels in the confusion matrix.
               use `clf.classes_` if using scikit-learn models. with shape (nclass,).
      ymap:    dict: any -> string, length == nclass.
               if not None, map the labels & ys to more understandable strings.
               Caution: original y_true, y_pred and labels must align.
      figsize: the size of the figure plotted.
    """
    if ymap is not None:
        y_pred = [ymap[yi] for yi in y_pred]
        y_true = [ymap[yi] for yi in y_true]
        labels = [ymap[yi] for yi in labels]
    # The class labels are their own indices here, so this is just 0..nclass-1.
    l = list(range(len(labels)))
    cm = confusion_matrix(y_true, y_pred, labels=l)
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100
    annot = np.empty_like(cm).astype(str)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                s = cm_sum[i]
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
            elif c == 0:
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)
    cm = pd.DataFrame(cm, index=labels, columns=labels)
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    ax.set(title=title)
    sns.heatmap(cm, annot=annot, fmt='', ax=ax)
    plt.savefig("cm.png")
    plt.show()


def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=90, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax


file_dir = "D:\\SDN Project\\Data\\Processed\\"

names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
         'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto']
df = pd.read_csv(file_dir + '26sep19.csv', names=names)
X = np.asarray(
    df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
        1:])
y = df['ndpi_proto'][1:]

# Read the class-name-to-index mapping written by randomforest.py during training.
read_file = open("randomforest.txt", 'r')
dict_class = []
for i in read_file.readlines():
    name = i.split(" ")[0]
    if name not in dict_class:
        dict_class.append(name)
read_file.close()

# Reload the saved model and classify the new capture.
count = 1
loaded_model = pickle.load(open('randomforest_model.sav', 'rb'))
result = loaded_model.predict(X)
original = []
for i in result:
    print(f'{count} {i} {dict_class[i]} {name_convert(y[count])}')
    # Classes unseen at training time are lumped into index 7 (see the note
    # after KNN/predict.py above).
    if name_convert(y[count]) not in dict_class:
        original.append(7)
    else:
        original.append(dict_class.index(name_convert(y[count])))
    count += 1
print(accuracy_score(original, result))
print(confusion_matrix(original, result))
cm_analysis(original, result, dict_class, ymap=None, title="Confusion Matrix", figsize=(10, 10))
plot_confusion_matrix(original, result, classes=np.asarray(dict_class), normalize=False, title="Confusion Matrix", cmap=plt.cm.Reds)
# Save before show, otherwise the saved figure may be blank.
plt.savefig("cm_1.png")
plt.show()

--------------------------------------------------------------------------------
/DecisionTree/predict.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import pandas as pd
# Helper libraries
import numpy as np
import pickle
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils.multiclass import unique_labels
import seaborn as sns
from dictionary import name_convert


def cm_analysis(y_true, y_pred, labels, ymap=None, title="", figsize=(10, 10)):
    """
    Generate matrix plot of confusion matrix with pretty annotations.
    The plot image is saved to disk as cm.png.
    args:
      y_true:  true label of the data, with shape (nsamples,)
      y_pred:  prediction of the data, with shape (nsamples,)
      labels:  string array, name the order of class labels in the confusion matrix.
               use `clf.classes_` if using scikit-learn models. with shape (nclass,).
      ymap:    dict: any -> string, length == nclass.
               if not None, map the labels & ys to more understandable strings.
               Caution: original y_true, y_pred and labels must align.
      figsize: the size of the figure plotted.
27 | """ 28 | if ymap is not None: 29 | y_pred = [ymap[yi] for yi in y_pred] 30 | y_true = [ymap[yi] for yi in y_true] 31 | labels = [ymap[yi] for yi in labels] 32 | l = [] 33 | for i in labels: 34 | l.append(labels.index(i)) 35 | cm = confusion_matrix(y_true, y_pred, labels=l) 36 | cm_sum = np.sum(cm, axis=1, keepdims=True) 37 | cm_perc = cm / cm_sum.astype(float) * 100 38 | annot = np.empty_like(cm).astype(str) 39 | nrows, ncols = cm.shape 40 | for i in range(nrows): 41 | for j in range(ncols): 42 | c = cm[i, j] 43 | p = cm_perc[i, j] 44 | if i == j: 45 | s = cm_sum[i] 46 | annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s) 47 | elif c == 0: 48 | annot[i, j] = '' 49 | else: 50 | annot[i, j] = '%.1f%%\n%d' % (p, c) 51 | cm = pd.DataFrame(cm, index=labels, columns=labels) 52 | cm.index.name = 'Actual' 53 | cm.columns.name = 'Predicted' 54 | fig, ax = plt.subplots(figsize=figsize) 55 | ax.set(title=title) 56 | sns.heatmap(cm, annot=annot, fmt='', ax=ax) 57 | plt.savefig("cm.png") 58 | plt.show() 59 | 60 | 61 | def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, cmap=plt.cm.Blues): 62 | """ 63 | This function prints and plots the confusion matrix. 64 | Normalization can be applied by setting `normalize=True`. 65 | """ 66 | if not title: 67 | if normalize: 68 | title = 'Normalized confusion matrix' 69 | else: 70 | title = 'Confusion matrix, without normalization' 71 | 72 | # Compute confusion matrix 73 | cm = confusion_matrix(y_true, y_pred) 74 | # Only use the labels that appear in the data 75 | classes = classes[unique_labels(y_true, y_pred)] 76 | if normalize: 77 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 78 | print("Normalized confusion matrix") 79 | else: 80 | print('Confusion matrix, without normalization') 81 | 82 | print(cm) 83 | 84 | fig, ax = plt.subplots() 85 | im = ax.imshow(cm, interpolation='nearest', cmap=cmap) 86 | ax.figure.colorbar(im, ax=ax) 87 | # We want to show all ticks... 88 | ax.set(xticks=np.arange(cm.shape[1]), 89 | yticks=np.arange(cm.shape[0]), 90 | # ... and label them with the respective list entries 91 | xticklabels=classes, yticklabels=classes, 92 | title=title, 93 | ylabel='True label', 94 | xlabel='Predicted label') 95 | 96 | # Rotate the tick labels and set their alignment. 97 | plt.setp(ax.get_xticklabels(), rotation=90, ha="right", 98 | rotation_mode="anchor") 99 | 100 | # Loop over data dimensions and create text annotations. 101 | fmt = '.2f' if normalize else 'd' 102 | thresh = cm.max() / 2. 
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax


file_dir = "D:\\SDN Project\\Data\\Processed\\"

names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
         'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto']
df = pd.read_csv(file_dir + '26sep19.csv', names=names)
X = np.asarray(
    df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
        1:])
y = df['ndpi_proto'][1:]

# Read the class-name-to-index mapping written by decisiontree.py during training.
read_file = open("decisiontree.txt", 'r')
dict_class = []
for i in read_file.readlines():
    name = i.split(" ")[0]
    if name not in dict_class:
        dict_class.append(name)
read_file.close()
print(dict_class)

# Reload the saved model and classify the new capture.
count = 1
loaded_model = pickle.load(open('decisiontree_model.sav', 'rb'))
result = loaded_model.predict(X)
original = []
for i in result:
    print(f'{count} {i} {dict_class[i]} {name_convert(y[count])}')
    # Classes unseen at training time are lumped into index 7 (see the note
    # after KNN/predict.py above).
    if name_convert(y[count]) not in dict_class:
        original.append(7)
    else:
        original.append(dict_class.index(name_convert(y[count])))
    count += 1
print(accuracy_score(original, result))
print(confusion_matrix(original, result))
cm_analysis(original, result, dict_class, ymap=None, title="Confusion Matrix", figsize=(10, 10))
plot_confusion_matrix(original, result, classes=np.asarray(dict_class), normalize=False, title="Confusion Matrix", cmap=plt.cm.Reds)
# Save before show, otherwise the saved figure may be blank.
plt.savefig("cm_1.png")
plt.show()

--------------------------------------------------------------------------------
/DNN/predict.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import pandas as pd
# Helper libraries
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import seaborn as sns
from dictionary import name_convert
from keras.models import load_model


def accu(y_true, y_pred):
    # Fraction of samples whose true class index matches the argmax of the
    # predicted probability vector.
    good = 0
    for i in range(0, len(y_true)):
        if y_true[i] == np.argmax(y_pred[i]):
            good += 1
    return good / len(y_true)


def cm_analysis(y_true, y_pred, labels, ymap=None, title="", figsize=(10, 10)):
    """
    Generate matrix plot of confusion matrix with pretty annotations.
    The plot image is saved to disk as cm.png.
    args:
      y_true:  true label of the data, with shape (nsamples,)
      y_pred:  prediction of the data, with shape (nsamples,)
      labels:  string array, name the order of class labels in the confusion matrix.
               use `clf.classes_` if using scikit-learn models. with shape (nclass,).
      ymap:    dict: any -> string, length == nclass.
               if not None, map the labels & ys to more understandable strings.
               Caution: original y_true, y_pred and labels must align.
      figsize: the size of the figure plotted.
36 | """ 37 | if ymap is not None: 38 | y_pred = [ymap[yi] for yi in y_pred] 39 | y_true = [ymap[yi] for yi in y_true] 40 | labels = [ymap[yi] for yi in labels] 41 | l = [] 42 | for i in labels: 43 | l.append(labels.index(i)) 44 | cm = confusion_matrix(y_true, y_pred, labels=l) 45 | cm_sum = np.sum(cm, axis=1, keepdims=True) 46 | cm_perc = cm / cm_sum.astype(float) * 100 47 | annot = np.empty_like(cm).astype(str) 48 | nrows, ncols = cm.shape 49 | for i in range(nrows): 50 | for j in range(ncols): 51 | c = cm[i, j] 52 | p = cm_perc[i, j] 53 | if i == j: 54 | s = cm_sum[i] 55 | annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s) 56 | elif c == 0: 57 | annot[i, j] = '' 58 | else: 59 | annot[i, j] = '%.1f%%\n%d' % (p, c) 60 | cm = pd.DataFrame(cm, index=labels, columns=labels) 61 | cm.index.name = 'Actual' 62 | cm.columns.name = 'Predicted' 63 | fig, ax = plt.subplots(figsize=figsize) 64 | ax.set(title=title) 65 | sns.heatmap(cm, annot=annot, fmt='', ax=ax) 66 | plt.savefig("cm.png") 67 | plt.show() 68 | 69 | 70 | def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, cmap=plt.cm.Blues): 71 | """ 72 | This function prints and plots the confusion matrix. 73 | Normalization can be applied by setting `normalize=True`. 74 | """ 75 | if not title: 76 | if normalize: 77 | title = 'Normalized confusion matrix' 78 | else: 79 | title = 'Confusion matrix, without normalization' 80 | 81 | # Compute confusion matrix 82 | cm = confusion_matrix(y_true, y_pred) 83 | # Only use the labels that appear in the data 84 | classes = classes[unique_labels(y_true, y_pred)] 85 | if normalize: 86 | cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] 87 | print("Normalized confusion matrix") 88 | else: 89 | print('Confusion matrix, without normalization') 90 | 91 | print(cm) 92 | 93 | fig, ax = plt.subplots() 94 | im = ax.imshow(cm, interpolation='nearest', cmap=cmap) 95 | ax.figure.colorbar(im, ax=ax) 96 | # We want to show all ticks... 97 | ax.set(xticks=np.arange(cm.shape[1]), 98 | yticks=np.arange(cm.shape[0]), 99 | # ... and label them with the respective list entries 100 | xticklabels=classes, yticklabels=classes, 101 | title=title, 102 | ylabel='True label', 103 | xlabel='Predicted label') 104 | 105 | # Rotate the tick labels and set their alignment. 106 | plt.setp(ax.get_xticklabels(), rotation=90, ha="right", 107 | rotation_mode="anchor") 108 | 109 | # Loop over data dimensions and create text annotations. 110 | fmt = '.2f' if normalize else 'd' 111 | thresh = cm.max() / 2. 
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax


file_dir = "D:\\SDN Project\\Data\\Processed\\"

names = ['#flow_id', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'ndpi_proto_num', 'src2dst_packets',
         'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes', 'ndpi_proto']
df = pd.read_csv(file_dir + '26sep19.csv', names=names)
X = np.asarray(
    df[['protocol', 'src_port', 'dst_port', 'src2dst_packets', 'src2dst_bytes', 'dst2src_packets', 'dst2src_bytes']][
        1:])
y = df['ndpi_proto'][1:]

# Read the class-name-to-index mapping written by dnn.py during training.
read_file = open("dnn.txt", 'r')
dict_class = []
for i in read_file.readlines():
    name = i.split(" ")[0]
    if name not in dict_class:
        dict_class.append(name)
read_file.close()
print(dict_class)

# Recreate the exact same model saved by dnn.py.
count = 1
loaded_model = load_model('dnn_model1.sav')
result = loaded_model.predict(X)

# The network outputs one probability vector per flow; argmax picks the class.
original = []
y_pred = []
for i in result:
    print(f'{count} {np.argmax(i)} {dict_class[np.argmax(i)]} {name_convert(y[count])}')
    y_pred.append(np.argmax(i))
    # Classes unseen at training time are lumped into index 7 (see the note
    # after KNN/predict.py above).
    if name_convert(y[count]) not in dict_class:
        original.append(7)
    else:
        original.append(dict_class.index(name_convert(y[count])))
    count += 1

print(f"accuracy: {accu(original, result)}")
print(confusion_matrix(original, y_pred))
cm_analysis(original, y_pred, dict_class, ymap=None, title="Confusion Matrix", figsize=(10, 10))
plot_confusion_matrix(original, y_pred, classes=np.asarray(dict_class), normalize=False, title="Confusion Matrix", cmap=plt.cm.Reds)
# Save before show, otherwise the saved figure may be blank.
plt.savefig("cm_1.png")
plt.show()

--------------------------------------------------------------------------------