├── In_switch_ETC ├── Offline_model_preparation │ ├── NIMS_IMA_sample_data.csv │ ├── Nims2023_Data_Analysis.ipynb │ ├── generate_table_entries.py │ ├── noms2024_20_5.pkl │ ├── pre-processing │ │ ├── extract_flows.sh │ │ ├── extract_flows_from_txt.py │ │ ├── extract_pkts.sh │ │ └── readme.md │ ├── readme.md │ └── test_data_nimsIMA.csv ├── README.md └── Switch │ ├── controller_digest_noms.py │ ├── noms_20_5_4.p4 │ ├── readme.md │ └── table_entries.py ├── Offline_ETC ├── README.md ├── __init__.py ├── cstnet-tls13_traffic_classifier.py ├── data_preparation │ ├── pcap2csv.sh │ └── pkts2flows.py ├── encrypted_traffic_classification.py ├── iscxvpn2016-vpn-classifier.py ├── netflow_quic_traffic_classifier.py ├── noms2023_ima_only_traffic_classifier.py ├── noms2023_instant_messaging_traffic_classifier.py └── ucdavis_quic_classifier.py ├── README.md └── etc_framework.png /In_switch_ETC/Offline_model_preparation/generate_table_entries.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pickle as pickle 4 | import numpy as np 5 | import pandas as pd 6 | pd.options.mode.chained_assignment = None # default='warn' 7 | from sklearn import tree 8 | import re 9 | from netaddr import IPAddress 10 | from statistics import mode 11 | import random 12 | import ipaddress 13 | 14 | np.random.seed(42) 15 | 16 | ## import and get entries from trained models ## 17 | clf = pd.read_pickle('noms2024_20_5.pkl') 18 | 19 | ## list the feature names 20 | feature_names = ['Flow IAT Min', 'Max Packet Length', 'Flow IAT Max', 'Packet Length Total'] 21 | 22 | print(feature_names) 23 | 24 | ## definition of useful functions 25 | ## gets all splits and conditions 26 | def get_splits(forest, feature_names): 27 | data = [] 28 | #generate dataframe with all thresholds and features 29 | for t in range(len(forest.estimators_)): 30 | clf = forest[t] 31 | n_nodes = clf.tree_.node_count 32 | features = [feature_names[i] for i in clf.tree_.feature] 33 | for i in range(0, n_nodes): 34 | node_id = i 35 | left_child_id = clf.tree_.children_left[i] 36 | right_child_id = clf.tree_.children_right[i] 37 | threshold = clf.tree_.threshold[i] 38 | feature = features[i] 39 | if threshold != -2.0: 40 | data.append([t, node_id, left_child_id, 41 | right_child_id, threshold, feature]) 42 | data = pd.DataFrame(data) 43 | data.columns = ["Tree","NodeID","LeftID","RightID","Threshold","Feature"] 44 | return data 45 | 46 | ## gets the feature table of each feature from the splits 47 | def get_feature_table(splits_data, feature_name): 48 | feature_data = splits_data[splits_data["Feature"]==feature_name] 49 | feature_data = feature_data.sort_values(by="Threshold") 50 | feature_data = feature_data.reset_index(drop=True) 51 | ## 52 | # feature_data["Threshold"] = (feature_data["Threshold"]).astype(int) 53 | feature_data["Threshold"] = feature_data["Threshold"].astype(int) 54 | ## 55 | code_table = pd.DataFrame() 56 | code_table["Threshold"] = feature_data["Threshold"] 57 | #print(feature_data) 58 | #create a column for each split in each tree 59 | for tree_id, node in zip(list(feature_data["Tree"]), list(feature_data["NodeID"])): 60 | colname = "s"+str(tree_id)+"_"+str(node) 61 | code_table[colname] = np.where((code_table["Threshold"] <= 62 | feature_data[(feature_data["NodeID"]== node) & 63 | (feature_data["Tree"]==tree_id)]["Threshold"].values[0]), 0, 1) 64 | #add a row to represent the values above the largest threshold 65 | temp = [max(code_table["Threshold"])+1] 66 | 
temp.extend(list([1]*(len(code_table.columns)-1))) 67 | code_table.loc[len(code_table)] = temp 68 | code_table = code_table.drop_duplicates(subset=['Threshold']) 69 | code_table = code_table.reset_index(drop=True) 70 | return code_table 71 | 72 | ## get feature tables with ranges and codes only 73 | def get_feature_codes_with_ranges(feature_table, num_of_trees): 74 | Codes = pd.DataFrame() 75 | for tree_id in range(num_of_trees): 76 | colname = "code"+str(tree_id) 77 | Codes[colname] = feature_table[feature_table[[col for col in feature_table.columns if ('s'+str(tree_id)+'_') in col]].columns[0:]].apply(lambda x: ''.join(x.dropna().astype(str)),axis=1) 78 | Codes[colname] = ["0b" + x for x in Codes[colname]] 79 | feature_table["Range"] = [0]*len(feature_table) 80 | feature_table["Range"].loc[0] = "0,"+str(feature_table["Threshold"].loc[0]) 81 | for i in range(1, len(feature_table)): 82 | if (i==(len(feature_table))-1): 83 | feature_table["Range"].loc[i] = str(feature_table["Threshold"].loc[i])+","+str(feature_table["Threshold"].loc[i]) 84 | else: 85 | feature_table["Range"].loc[i] = str(feature_table["Threshold"].loc[i-1]+1) + ","+str(feature_table["Threshold"].loc[i]) 86 | Ranges = feature_table["Range"] 87 | return Ranges, Codes 88 | 89 | ## get list of splits crossed to get to leaves 90 | def retrieve_branches(estimator): 91 | number_nodes = estimator.tree_.node_count 92 | children_left_list = estimator.tree_.children_left 93 | children_right_list = estimator.tree_.children_right 94 | feature = estimator.tree_.feature 95 | threshold = estimator.tree_.threshold 96 | # Calculate if a node is a leaf 97 | is_leaves_list = [(False if cl != cr else True) for cl, cr in zip(children_left_list, children_right_list)] 98 | # Store the branches paths 99 | paths = [] 100 | for i in range(number_nodes): 101 | if is_leaves_list[i]: 102 | # Search leaf node in previous paths 103 | end_node = [path[-1] for path in paths] 104 | # If it is a leave node yield the path 105 | if i in end_node: 106 | output = paths.pop(np.argwhere(i == np.array(end_node))[0][0]) 107 | yield output 108 | else: 109 | # Origin and end nodes 110 | origin, end_l, end_r = i, children_left_list[i], children_right_list[i] 111 | # Iterate over previous paths to add nodes 112 | for index, path in enumerate(paths): 113 | if origin == path[-1]: 114 | paths[index] = path + [end_l] 115 | paths.append(path + [end_r]) 116 | # Initialize path in first iteration 117 | if i == 0: 118 | paths.append([i, children_left_list[i]]) 119 | paths.append([i, children_right_list[i]]) 120 | 121 | ## get classes and certainties 122 | def get_classes(clf): 123 | leaves = [] 124 | classes = [] 125 | certainties = [] 126 | for branch in list(retrieve_branches(clf)): 127 | leaves.append(branch[-1]) 128 | for leaf in leaves: 129 | if clf.tree_.n_outputs == 1: 130 | value = clf.tree_.value[leaf][0] 131 | else: 132 | value = clf.tree_.value[leaf].T[0] 133 | class_name = np.argmax(value) 134 | certainty = int(round(max(value)/sum(value),2)*100) 135 | classes.append(class_name) 136 | certainties.append(certainty) 137 | return classes, certainties 138 | 139 | ## get the codes corresponging to the branches followed 140 | def get_leaf_paths(clf): 141 | depth = clf.max_depth 142 | branch_codes = [] 143 | for branch in list(retrieve_branches(clf)): 144 | code = [0]*len(branch) 145 | for i in range(1, len(branch)): 146 | if (branch[i]==clf.tree_.children_left[branch[i-1]]): 147 | code[i] = 0 148 | elif (branch[i]==clf.tree_.children_right[branch[i-1]]): 149 | code[i] = 1 
150 | branch_codes.append(list(code[1:])) 151 | return branch_codes 152 | 153 | ## get the order of the splits to enable code generation 154 | def get_order_of_splits(data, feature_names): 155 | splits_order = [] 156 | for feature_name in feature_names: 157 | feature_data = data[data.iloc[:,4]==feature_name] 158 | feature_data = feature_data.sort_values(by="Threshold") 159 | for node in list(feature_data.iloc[:,0]): 160 | splits_order.append(node) 161 | return splits_order 162 | 163 | def get_splits_per_tree(clf, feature_names): 164 | data = [] 165 | n_nodes = clf.tree_.node_count 166 | #set feature names 167 | features = [feature_names[i] for i in clf.tree_.feature] 168 | #generate dataframe with all thresholds and features 169 | for i in range(0,n_nodes): 170 | node_id = i 171 | left_child_id = clf.tree_.children_left[i] 172 | right_child_id = clf.tree_.children_right[i] 173 | threshold = clf.tree_.threshold[i] 174 | feature = features[i] 175 | if threshold != -2.0: 176 | data.append([node_id, left_child_id, 177 | right_child_id, threshold, feature]) 178 | data = pd.DataFrame(data) 179 | data.columns = ["NodeID","LeftID","RightID","Threshold","Feature"] 180 | return data 181 | 182 | ## Get codes and masks 183 | def get_codes_and_masks(clf, feature_names): 184 | splits = get_order_of_splits(get_splits_per_tree(clf, feature_names), feature_names) 185 | depth = clf.max_depth 186 | codes = [] 187 | masks = [] 188 | for branch, coded in zip(list(retrieve_branches(clf)), get_leaf_paths(clf)): 189 | code = [0]*len(splits) 190 | mask = [0]*len(splits) 191 | for index, split in enumerate(splits): 192 | if split in branch: 193 | mask[index] = 1 194 | masks.append(mask) 195 | codes.append(code) 196 | masks = pd.DataFrame(masks) 197 | masks['Mask'] = masks[masks.columns[0:]].apply(lambda x: ''.join(x.dropna().astype(str)),axis=1) 198 | masks = ["0b" + x for x in masks['Mask']] 199 | indices = range(0,len(splits)) 200 | temp = pd.DataFrame(columns=["split", "index"],dtype=object) 201 | temp["split"] = splits 202 | temp["index"] = indices 203 | final_codes = [] 204 | for branch, code, coded in zip(list(retrieve_branches(clf)), codes, get_leaf_paths(clf)): 205 | indices_to_use = temp[temp["split"].isin(branch)].sort_values(by="split")["index"] 206 | for i, j in zip(range(0,len(coded)), list(indices_to_use)): 207 | code[j] = coded[i] 208 | final_codes.append(code) 209 | final_codes = pd.DataFrame(final_codes) 210 | final_codes["Code"] = final_codes[final_codes.columns[0:]].apply(lambda x: ''.join(x.dropna().astype(str)),axis=1) 211 | final_codes = ["0b" + x for x in final_codes["Code"]] 212 | return final_codes, masks 213 | ## End of model manipulation ## 214 | 215 | 216 | # Get table entries and generate file with table entries 217 | with open("table_entries.py", "w") as entries_file: 218 | 219 | print("from netaddr import IPAddress\n", file=entries_file) 220 | 221 | print("p4 = bfrt.noms_20_5_4.pipe\n", file=entries_file) 222 | 223 | clear_tables = """ 224 | def clear_all(verbose=True, batching=True): 225 | global p4 226 | global bfrt 227 | for table_types in (['MATCH_DIRECT', 'MATCH_INDIRECT_SELECTOR'], 228 | ['SELECTOR'], 229 | ['ACTION_PROFILE']): 230 | for table in p4.info(return_info=True, print_info=False): 231 | if table['type'] in table_types: 232 | if verbose: 233 | print("Clearing table {:<40} ... ". 
234 | format(table['full_name']), end='', flush=True) 235 | table['node'].clear(batch=batching) 236 | if verbose: 237 | print('Done') 238 | """ 239 | 240 | port_setup = """ 241 | # This script configures QSFP ports automatically on the TOFINO Switch 242 | # Adapted from ICA-1131 Intel Connectivity Academy Course 243 | for qsfp_cage in [1, 5]: 244 | for lane in range(0, 1): 245 | dp = bfrt.port.port_hdl_info.get(CONN_ID = qsfp_cage, CHNL_ID = lane, print_ents = False).data[b'$DEV_PORT'] 246 | bfrt.port.port.add(DEV_PORT= dp, SPEED = "BF_SPEED_100G", FEC = "BF_FEC_TYP_NONE", AUTO_NEGOTIATION = "PM_AN_FORCE_DISABLE", PORT_ENABLE = True) 247 | """ 248 | print(port_setup, file=entries_file) 249 | 250 | print(clear_tables, file=entries_file) 251 | 252 | print("clear_all(verbose=True)\n", file=entries_file) 253 | print("voting_table = p4.Ingress.voting_table", file=entries_file) 254 | print("target_flows_table = p4.Ingress.target_flows_table", file=entries_file) 255 | 256 | for num_feat in range(len(feature_names)): 257 | print("table_feature"+str(num_feat)+" = p4.Ingress.table_feature"+str(num_feat), file=entries_file) 258 | print('', file=entries_file) 259 | 260 | for num_tree in range(len(clf.estimators_)): 261 | print("code_table"+str(num_tree)+" = p4.Ingress.code_table"+str(num_tree), file=entries_file) 262 | print('', file=entries_file) 263 | 264 | # Get entries for feature tables 265 | tree_code0 = [] 266 | tree_code1 = [] 267 | tree_code2 = [] 268 | tree_code3 = [] 269 | tree_code4 = [] 270 | 271 | for fea in range(0,len(feature_names)): 272 | Ranges, Codes = get_feature_codes_with_ranges(get_feature_table(get_splits(clf, feature_names), feature_names[fea]), len(clf.estimators_)) 273 | for ran, cods0, cods1, cods2, cods3, cods4 in zip(Ranges, Codes.iloc[:,0], Codes.iloc[:,1], Codes.iloc[:,2], Codes.iloc[:,3], Codes.iloc[:,4]): 274 | if(ran == Ranges[len(Ranges)-1]): 275 | print("table_feature"+str(fea)+".add_with_SetCode"+str(fea)+"(feature"+str(fea)+"_start="+str(ran.split(",")[0])+ \ 276 | ", feature"+str(fea)+"_end="+str(65535)+", code0="+str(cods0) + ", code1=" + str(cods1) + ", code2=" + str(cods2) + \ 277 | ", code3=" + str(cods3) + ", code4=" + str(cods4) + ")", file = entries_file) 278 | # change 65535 to the maximum value of the feature in cases where the feature is not 16 bits 279 | else: 280 | print("table_feature"+str(fea)+".add_with_SetCode"+str(fea)+"(feature"+str(fea)+"_start="+str(ran.split(",")[0])+ \ 281 | ", feature"+str(fea)+"_end="+str(ran.split(",")[1])+", code0="+str(cods0)+", code1="+str(cods1)+", code2=" +str(cods2) + \ 282 | ", code3=" + str(cods3) + ", code4=" + str(cods4) + ")", file = entries_file) 283 | # change 65535 to the maximum value of the feature in cases where the feature is not 16 bits 284 | 285 | tree_code0.append(len(cods0)-2) 286 | tree_code1.append(len(cods1)-2) 287 | tree_code2.append(len(cods2)-2) 288 | tree_code3.append(len(cods3)-2) 289 | tree_code4.append(len(cods4)-2) 290 | 291 | print('', file=entries_file) 292 | 293 | tree_code_sizes = [tree_code0, tree_code1, tree_code2, tree_code3, tree_code4] 294 | 295 | print(tree_code_sizes) 296 | 297 | print('print("******************* ENTERED FEATURE TABLE RULES *****************")\n', file=entries_file) 298 | 299 | for tree_id in range(0, len(clf.estimators_)): 300 | Final_Codes, Final_Masks = get_codes_and_masks(clf.estimators_[tree_id], feature_names) 301 | Classe, Certain = get_classes(clf.estimators_[tree_id]) 302 | for cod, mas, cla, cer in zip(Final_Codes, Final_Masks, Classe, Certain): 303 
| print("code_table"+str(tree_id)+".add_with_SetClass"+str(tree_id)+"(codeword"+str(tree_id)+"=", cod, ", codeword"+str(tree_id)+"_mask=", mas, ", classe=",cla+1,")", file=entries_file) 304 | print('', file=entries_file) 305 | 306 | # Get voting table entries 307 | for i in range(1, 7): 308 | for j in range(1, 7): 309 | for k in range(1, 7): 310 | for l in range(1, 7): 311 | for m in range(1, 7): 312 | try: 313 | choices = [i, j, k, l, m] 314 | mode_number = mode(choices) 315 | print("voting_table.add_with_set_final_class(" + "class0=" + str(i) + ", class1=" + str(j) + \ 316 | ", class2=" + str(k) + ", class3=" + str(l) + ", class4=" + str(m) + \ 317 | ", class_result=" + str(mode_number) + ")", file=entries_file) 318 | except: 319 | pass 320 | 321 | print(" ", file=entries_file) 322 | 323 | # Forwarding: 0 Inference: 1 324 | flow_id_info = pd.read_csv("test_data_nimsIMA.csv",usecols=['Flow ID','Label']) 325 | flow_id_info = flow_id_info.drop_duplicates(subset=['Flow ID']) 326 | for index, flow in flow_id_info.iterrows(): 327 | flow_id = flow['Flow ID'] 328 | id_values = flow_id.split(" ") 329 | # With all tuple elements 330 | try: 331 | print("target_flows_table.add_with_set_flow_class("+"src_addr="+str(int(ipaddress.ip_address(id_values[0])))+ \ 332 | ", dst_addr="+str(int(ipaddress.ip_address(id_values[1])))+ \ 333 | ", hdr_srcport="+str(id_values[2])+ \ 334 | ", hdr_dstport="+str(id_values[3])+ \ 335 | ", protocol="+str(id_values[4])+ \ 336 | ", f_class="+str(0)+")", file=entries_file) 337 | except: 338 | continue 339 | 340 | print("bfrt.complete_operations()", file=entries_file) 341 | 342 | print("** TABLE ENTRIES GENERATED AND STORED IN DESIGNATED FILE **") 343 | -------------------------------------------------------------------------------- /In_switch_ETC/Offline_model_preparation/noms2024_20_5.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nds-group/ETC_NOMS_2024/1ad4bbd4e1c097e87625b185e6d0ba14fa201bbb/In_switch_ETC/Offline_model_preparation/noms2024_20_5.pkl -------------------------------------------------------------------------------- /In_switch_ETC/Offline_model_preparation/pre-processing/extract_flows.sh: -------------------------------------------------------------------------------- 1 | for f in ./txt_files/*.txt 2 | do 3 | echo $f 4 | python3 extract_flows_from_txt.py $f $f.csv 8 5 | done 6 | -------------------------------------------------------------------------------- /In_switch_ETC/Offline_model_preparation/pre-processing/extract_flows_from_txt.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import sys 4 | 5 | filename_in = sys.argv[1] 6 | filename_out = sys.argv[2] 7 | npkts = int(sys.argv[3]) 8 | 9 | packet_data = pd.DataFrame() 10 | 11 | packet_data = pd.read_csv(filename_in, sep = '|', header=None) 12 | 13 | packet_data.columns = ['timestamp', 'ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport', 'ip.proto', 'ip.len','udp.srcport', 'udp.dstport'] 14 | 15 | packet_data = packet_data[(packet_data["ip.proto"] != "1,17") & (packet_data["ip.proto"] != "1,6")].reset_index(drop=True) 16 | packet_data = packet_data.dropna(subset=['ip.proto']) 17 | packet_data["ip.src"] = packet_data["ip.src"].astype(str) 18 | packet_data["ip.dst"] = packet_data["ip.dst"].astype(str) 19 | packet_data["ip.len"] = packet_data["ip.len"].astype("int") 20 | ## 21 | packet_data["tcp.srcport"] = packet_data["tcp.srcport"] 22 | 
packet_data["tcp.dstport"] = packet_data["tcp.dstport"] 23 | packet_data["udp.srcport"] = packet_data["udp.srcport"].astype('Int64') 24 | packet_data["udp.dstport"] = packet_data["udp.dstport"].astype('Int64') 25 | # 26 | packet_data["srcport"] = np.where(packet_data["ip.proto"] == "6", packet_data["tcp.srcport"], packet_data["udp.srcport"]) 27 | packet_data["dstport"] = np.where(packet_data["ip.proto"] == "6", packet_data["tcp.dstport"], packet_data["udp.dstport"]) 28 | # 29 | packet_data["srcport"] = np.where(packet_data["ip.proto"] == 6, packet_data["tcp.srcport"], packet_data["udp.srcport"]) 30 | packet_data["dstport"] = np.where(packet_data["ip.proto"] == 6, packet_data["tcp.dstport"], packet_data["udp.dstport"]) 31 | # 32 | packet_data["srcport"] = packet_data["srcport"].astype('Int64') 33 | packet_data["dstport"] = packet_data["dstport"].astype('Int64') 34 | 35 | #===============================CREATE THE FLOW IDs AND DROP UNWANTED COLUMNS =============================================# 36 | packet_data = packet_data.drop(["tcp.srcport","tcp.dstport","udp.srcport","udp.dstport"],axis=1) 37 | packet_data = packet_data.reset_index(drop=True) 38 | 39 | packet_data["flow.id"] = packet_data["ip.src"].astype(str) + " " + packet_data["ip.dst"].astype(str) + " " + packet_data["srcport"].astype(str) + " " + packet_data["dstport"].astype(str) + " " + packet_data["ip.proto"].astype(str) 40 | 41 | 42 | # Labeling 43 | filename_patterns = {"background" : "Background", 44 | "webbrowsing" : "WebBrowsing", 45 | "youtube" : "YouTube", 46 | "gmail" : "Gmail", 47 | "discord" : "Discord", 48 | "whatsapp" : "WhatsApp", 49 | "signal" : "Signal", 50 | "telegram" : "Telegram", 51 | "messenger" : "Messenger", 52 | "teams" : "Teams" 53 | } 54 | 55 | for pattern, labeld in filename_patterns.items(): 56 | if pattern in filename_in: 57 | label = labeld 58 | 59 | number_of_pkts_limit, min_number_of_packets = npkts, npkts 60 | #===============================Extract flows from packets and calculate features=============================================# 61 | main_packet_size = {} # dictionary to store list of packet sizes for each flow (Here key = flowID, value = list of packet sizes) 62 | flow_list = [] # contains the flowIDs (a combination of SIP,DIP,srcPort, dstPort, proto) 63 | main_inter_arrival_time = {} # dictionary to store list of IATs for each flow (Here key = flowID, value = list of IATs) 64 | last_time = {} # for each flow we store timestamp of the last packet arrival 65 | 66 | avg_pkt_sizes = {} # contains the flowID and their calculated average packet sizes 67 | string = {} # For each flow, we have a string of feature values (just for printing purpose, on screen) 68 | 69 | labels = {} # contains the flowID and their labels 70 | packet_count = {} # contains flowID as key and number of packets as valu 71 | 72 | # ==============================================================================================================================# 73 | print("NOW: COLLECTING PACKETS INTO FLOWS...") 74 | for row in packet_data.itertuples(index=True, name='Pandas'): 75 | time = float(row[1]) # timestamp of the packet 76 | srcip = row[2] #src ip 77 | dstip = row[3] #dst ip 78 | pktsize = row[5] #packet size 79 | proto = row[4] #protocol 80 | srcport = row[6] #source port 81 | dstport = row[7] #destination port 82 | key = row[8] #key which is a concatenation of the 5-tuple to identify the flow 83 | 84 | if key in flow_list: # check if the packet belongs to already existing flow ? 
85 | if (len(main_packet_size[key]) < number_of_pkts_limit ): 86 | packet_count[key] = packet_count[key] + 1 # increment packet count 87 | main_packet_size[key].append(pktsize) # append its packet size to the packet size list for this flow 88 | lasttime = last_time[key] 89 | diff = round(float(time) - float(lasttime), 9) # calculate inter-arrival time (seconds) 90 | main_inter_arrival_time[key].append(diff) # append IAT 91 | ## 92 | labels[key] = label 93 | ## 94 | last_time[key] = time # update last time for the flow, to the timestamp of this packet 95 | 96 | 97 | else: # if this packet is the first one in this NEW flow 98 | flow_list.append(key) # make its entry in the existing flow List 99 | packet_count[key] = 1 # first packet arrived for this flow, set count =1 100 | main_packet_size[key] = [pktsize] # make its entry in the packet size dictionary 101 | ## 102 | labels[key] = label 103 | ## 104 | main_inter_arrival_time[key] = [] # create a blank list in this dictionary, as it is the first packet 105 | 106 | last_time[key] = time 107 | 108 | # ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 109 | print("NOW: COMPUTING AND WRITING FLOW FEATURES INTO CSV...") 110 | header = "Flow ID,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Total,Packet Count,Current Packet Length,Flow IAT Min,Flow IAT Max,Flow IAT Mean,Flow Duration,Label" 111 | 112 | with open(filename_out, "w") as text_file: 113 | text_file.write(header) 114 | text_file.write("\n") 115 | # ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 116 | # Calculate features related to packet size 117 | for key in flow_list: 118 | packet_list = main_packet_size[key] # packet_list contains the list of packet sizes for the flow in consideration 119 | length = len(packet_list) # number of packets 120 | avg_pkt_sizes[key] = sum(packet_list) / length # calculate avg packet size, and store 121 | min_pkt_size = min(packet_list) 122 | max_pkt_size = max(packet_list) 123 | 124 | string[key] = key + "," + str(min_pkt_size) + "," + str(max_pkt_size) + "," + str(avg_pkt_sizes[key]) + "," + str(sum(packet_list)) + "," + str(len(packet_list)) + "," + str(packet_list[len(packet_list)-1]) # concatenate features in string format 125 | # ------------------- --------------------------------------------------------------------------------------------------------------------------------------------------- 126 | # Now calculate IAT-related features 127 | inter_arrival_time_list = main_inter_arrival_time[key] # a list containing IATs for the flow 128 | length = len(inter_arrival_time_list) 129 | if length == 0: 130 | min_IAT = 0 131 | max_IAT = 0 132 | else: 133 | min_IAT = min(inter_arrival_time_list) 134 | min_IAT_ms = round(1000000000*min_IAT, 9) # convert in nanoseconds 135 | max_IAT = max(inter_arrival_time_list) 136 | max_IAT_ms = round(1000000000*max_IAT, 9) # convert in nanoseconds 137 | 138 | if length > 0: 139 | flow_duration = sum(inter_arrival_time_list) # flow duration seconds 140 | flow_duration_ms = round(1000000000*flow_duration, 9) # convert in nanoseconds 141 | avg_iat = flow_duration / length # Average IAT 142 | avg_iat_in_ms = round(1000000000*avg_iat, 9) # convert in nanoseconds 143 | 144 | if(len(main_packet_size[key]) >= min_number_of_packets): 145 | string[key] = string[key] + "," + 
str(min_IAT_ms) + "," + str(max_IAT_ms) + "," + str(avg_iat_in_ms) + "," + str(flow_duration_ms) 146 | string[key] = string[key] + "," + str(labels[key]) 147 | text_file.write(string[key]) 148 | text_file.write("\n") -------------------------------------------------------------------------------- /In_switch_ETC/Offline_model_preparation/pre-processing/extract_pkts.sh: -------------------------------------------------------------------------------- 1 | for f in *.pcap 2 | do 3 | echo $f 4 | tshark -r $f -Y 'ip.proto == 6 or ip.proto == 17' -T fields -e frame.time_relative -e ip.src -e ip.dst -e tcp.srcport -e tcp.dstport -e ip.proto -e ip.len -e udp.srcport -e udp.dstport -E separator='|' > ./txt_files/$f.txt 5 | done 6 | -------------------------------------------------------------------------------- /In_switch_ETC/Offline_model_preparation/pre-processing/readme.md: -------------------------------------------------------------------------------- 1 | The scripts in this folder are useful for extracting the data from the pcap files. 2 | - run the _extract_pkts.sh_ script in the folder containing the downloaded pcap files to extract the packet features. 3 | - the packet files are saved in a _txt_files_ folder 4 | - run the _extract_flows.sh_ script in the folder containing the downloaded pcap files to aggregate the packet data in the _txt_files_ folder into flow data saved in .csv files. 5 | - this bash script makes use of the _extract_flows_from_txt.py_ script which takes as input the txt file, the csv file which is the output, and the number of packets to consider in each flow. 6 | - merge the generated flow files into a single csv -------------------------------------------------------------------------------- /In_switch_ETC/Offline_model_preparation/readme.md: -------------------------------------------------------------------------------- 1 | - use the _Nims2023_Data_Analysis.ipynb_ file to analyze, train and save models. 2 | - use the _generate_table_entries.py_ file to convert the trained and saved model into table entries for the switch. 3 | - the _NIMS_IMA_sample_data.csv_ contains one day of data (8 November 2022) used for the analysis to shorten duration of in-switch experiments. 4 | - the _test_data_nimsIMA.csv_ contains the test data needed to create table entries for the flow table in the switch. 5 | - the noms2024_20_5.pkl is a sample trained and saved RF model with trees of maximum depth 20, 5 trees and 4 features. 6 | -------------------------------------------------------------------------------- /In_switch_ETC/README.md: -------------------------------------------------------------------------------- 1 | ## Organization of the folder 2 | There are two folders: 3 | 4 | - _Switch_ : the P4 code for the Tofino switch, the M/A table entries, and the runtime controller code. 5 | - _Offline_ : the jupyter notebooks for training the machine learning models and for offline evaluation, and the scripts for generating the M/A table entries from trained models. 
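For reference, a minimal sketch of the offline step that produces the pickled model consumed by _generate_table_entries.py_ is given below. This is only an illustration under stated assumptions, not the actual training pipeline (which lives in _Nims2023_Data_Analysis.ipynb_): it assumes a merged flow CSV with the four feature columns used in this work and a 'Label' column, and it mirrors the sample model _noms2024_20_5.pkl_ (5 trees of maximum depth 20).

```python
import pickle
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# The four features used by the in-switch model (same names as the flow CSV header)
feature_names = ['Flow IAT Min', 'Max Packet Length', 'Flow IAT Max', 'Packet Length Total']

# Assumption: a merged flow CSV that contains these feature columns and a 'Label' column
flows = pd.read_csv('NIMS_IMA_sample_data.csv')
X, y = flows[feature_names], flows['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 5 trees of maximum depth 20, matching the sample model noms2024_20_5.pkl
rf = RandomForestClassifier(n_estimators=5, max_depth=20, random_state=42)
rf.fit(X_train, y_train)
print('Held-out accuracy:', rf.score(X_test, y_test))

# Save the forest so that generate_table_entries.py can reload it
with open('noms2024_20_5.pkl', 'wb') as f:
    pickle.dump(rf, f)
```

_generate_table_entries.py_ later reloads this file with `pd.read_pickle('noms2024_20_5.pkl')` and walks `clf.estimators_` to derive the feature-table and code-table entries for the switch.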
-------------------------------------------------------------------------------- /In_switch_ETC/Switch/controller_digest_noms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | from __future__ import print_function 4 | 5 | import os 6 | import sys 7 | import pdb 8 | 9 | SDE_INSTALL = os.environ['SDE_INSTALL'] 10 | SDE_PYTHON2 = os.path.join(SDE_INSTALL, 'lib', 'python2.7', 'site-packages') 11 | sys.path.append(SDE_PYTHON2) 12 | sys.path.append(os.path.join(SDE_PYTHON2, 'tofino')) 13 | 14 | PYTHON3_VER = '{}.{}'.format( 15 | sys.version_info.major, 16 | sys.version_info.minor) 17 | SDE_PYTHON3 = os.path.join(SDE_INSTALL, 'lib', 'python' + PYTHON3_VER, 'site-packages') 18 | sys.path.append(SDE_PYTHON3) 19 | sys.path.append(os.path.join(SDE_PYTHON3, 'tofino')) 20 | sys.path.append(os.path.join(SDE_PYTHON3, 'tofino', 'bfrt_grpc')) 21 | 22 | import grpc 23 | import bfrt_grpc.bfruntime_pb2 as bfruntime_pb2 24 | import bfrt_grpc.client as bfrt_client 25 | import pandas as pd 26 | import time 27 | import socket, struct 28 | 29 | filename_out = sys.argv[1] #output csv file with classification results 30 | 31 | # Connect to the BF Runtime Server 32 | interface = bfrt_client.ClientInterface( 33 | grpc_addr = 'localhost:50052', 34 | client_id = 1, 35 | device_id = 0) 36 | print('Connected to BF Runtime Server') 37 | 38 | 39 | # Get the information about the running program 40 | bfrt_info = interface.bfrt_info_get() 41 | print('The target runs the program ', bfrt_info.p4_name_get()) 42 | 43 | # Establish that we are using this program on the given connection 44 | interface.bind_pipeline_config(bfrt_info.p4_name_get()) 45 | 46 | # Get digest 47 | learn_filter = bfrt_info.learn_get("digest") 48 | 49 | # List of registers in P4 program 50 | registers = ['Ingress.reg_flow_ID','Ingress.reg_status','Ingress.reg_pkt_count', 'Ingress.reg_classified_flag', 'Ingress.reg_flow_iat_max', 'Ingress.reg_flow_iat_min', 'Ingress.reg_pkt_len_max', 'Ingress.reg_pkt_len_total','Ingress.reg_time_last_pkt'] 51 | 52 | # Getting info about the flow table 53 | flow_act_tbl = bfrt_info.table_get('Ingress.target_flows_table') 54 | print('Table max packet length info:', flow_act_tbl) 55 | 56 | target = bfrt_client.Target(device_id=0, pipe_id=0xffff) 57 | 58 | header = 'source_addr,destin_addr,source_port,destin_port,protocol,pkt_count,flow_packet_class' 59 | 60 | count = 0 61 | 62 | with open(filename_out, "w") as text_file: 63 | text_file.write(header) 64 | text_file.write("\n") 65 | 66 | flow_counter = 0 67 | while True: 68 | try: 69 | digest = interface.digest_get(timeout=400) 70 | except: 71 | f = open("x.txt", "a") 72 | f.write('---- \n') 73 | f.close() 74 | break 75 | 76 | recv_target = digest.target 77 | 78 | digest_type = 1 79 | data_list = learn_filter.make_data_list(digest) 80 | 81 | if digest_type == 1: 82 | count = count + 1 83 | keys_reg = {'Ingress.reg_flow_ID': [],'Ingress.reg_status': [], 84 | 'Ingress.reg_pkt_count': [], 'Ingress.reg_classified_flag': [], 85 | 'Ingress.reg_flow_iat_min': [], 'Ingress.reg_pkt_len_max': [], 86 | 'Ingress.reg_flow_iat_max': [], 87 | 'Ingress.reg_pkt_len_total': [], 'Ingress.reg_time_last_pkt': []} 88 | datas_reg = {'Ingress.reg_flow_ID': [],'Ingress.reg_status': [], 89 | 'Ingress.reg_pkt_count': [], 'Ingress.reg_classified_flag': [], 90 | 'Ingress.reg_flow_iat_min': [], 'Ingress.reg_pkt_len_max': [], 91 | 'Ingress.reg_flow_iat_max': [], 92 | 'Ingress.reg_pkt_len_total': [], 'Ingress.reg_time_last_pkt': []} 93 | 
keys_table = [] 94 | datas_table = [] 95 | for dd in data_list: 96 | data_dict = dd.to_dict() 97 | # convert ip address into normal format 98 | source_addr = socket.inet_ntoa(struct.pack('!L', data_dict['source_addr'])) 99 | destin_addr = socket.inet_ntoa(struct.pack('!L', data_dict['destin_addr'])) 100 | source_port = str(data_dict['source_port']) 101 | destin_port = str(data_dict['destin_port']) 102 | protocol = str(data_dict['protocol']) 103 | flow_packet_class = data_dict['class_value'] 104 | pkt_count = str(data_dict['packet_num']) 105 | register_index = data_dict['register_index'] 106 | # 107 | FlowID = source_addr + ' ' + destin_addr + ' ' + source_port + ' ' + destin_port + ' ' + protocol 108 | # 109 | if (pkt_count == '0'): 110 | csv_row = source_addr + ',' + destin_addr + ',' + source_port + ',' + destin_port + ',' + protocol + ',' + pkt_count + ',' + str(-1) 111 | else: 112 | csv_row = source_addr + ',' + destin_addr + ',' + source_port + ',' + destin_port + ',' + protocol + ',' + pkt_count + ',' + str(flow_packet_class) 113 | 114 | with open(filename_out, "a") as text_file: 115 | text_file.write(csv_row) 116 | text_file.write("\n") 117 | 118 | if (data_dict['packet_num'] == 8): 119 | keys_table.append(flow_act_tbl.make_key( 120 | [bfrt_client.KeyTuple('hdr.ipv4.src_addr', data_dict['source_addr']), bfrt_client.KeyTuple('hdr.ipv4.dst_addr', data_dict['destin_addr']), 121 | bfrt_client.KeyTuple('meta.hdr_dstport', data_dict['destin_port']), bfrt_client.KeyTuple('meta.hdr_srcport', data_dict['source_port']), 122 | bfrt_client.KeyTuple('hdr.ipv4.protocol', data_dict['protocol'])])) 123 | 124 | datas_table.append(flow_act_tbl.make_data([ 125 | bfrt_client.DataTuple('f_class', flow_packet_class) 126 | ], 'Ingress.set_flow_class')) 127 | 128 | for reg_name in registers: 129 | reg_tbl = bfrt_info.table_get(reg_name) 130 | keys_reg[reg_name].append(reg_tbl.make_key([bfrt_client.KeyTuple('$REGISTER_INDEX', register_index)])) 131 | datas_reg[reg_name].append(reg_tbl.make_data([bfrt_client.DataTuple(reg_name+'.f1', 0)])) 132 | 133 | try: 134 | flow_act_tbl.entry_mod(target, keys_table, datas_table, p4_name=bfrt_info.p4_name_get()) 135 | print("Flow table entry modified") 136 | except: 137 | print("Error in flow_act_tbl.entry_mod") 138 | for reg_name in registers: 139 | reg_tbl = bfrt_info.table_get(reg_name) 140 | reg_tbl.entry_mod(target, key_list=keys_reg[reg_name], data_list=datas_reg[reg_name], flags={"from_hw":True}, p4_name=bfrt_info.p4_name_get()) 141 | print("Register table entry modified") 142 | -------------------------------------------------------------------------------- /In_switch_ETC/Switch/noms_20_5_4.p4: -------------------------------------------------------------------------------- 1 | /* -*- P4_16 -*- */ 2 | #include 3 | #include 4 | /************************************************************************* 5 | ************* C O N S T A N T S A N D T Y P E S ******************* 6 | **************************************************************************/ 7 | typedef bit<48> mac_addr_t; 8 | typedef bit<32> ipv4_addr_t; 9 | typedef bit<16> ether_type_t; 10 | const bit<16> TYPE_IPV4 = 0x800; 11 | const bit<16> TYPE_RECIRC = 0x88B5; 12 | const bit<8> TYPE_TCP = 6; 13 | const bit<8> TYPE_UDP = 17; 14 | const bit<32> MAX_REGISTER_ENTRIES = 2048; 15 | #define INDEX_WIDTH 11 16 | /************************************************************************* 17 | *********************** H E A D E R S ********************************* 18 | 
*************************************************************************/ 19 | /* Standard ethernet header */ 20 | header ethernet_h { 21 | mac_addr_t dst_addr; 22 | mac_addr_t src_addr; 23 | ether_type_t ether_type; 24 | } 25 | /* IPV4 header */ 26 | header ipv4_h { 27 | bit<4> version; 28 | bit<4> ihl; 29 | bit<8> diffserv; 30 | bit<16> total_len; 31 | bit<16> identification; 32 | bit<3> flags; 33 | bit<13> frag_offset; 34 | bit<8> ttl; 35 | bit<8> protocol; 36 | bit<16> hdr_checksum; 37 | ipv4_addr_t src_addr; 38 | ipv4_addr_t dst_addr; 39 | } 40 | /* TCP header */ 41 | header tcp_h { 42 | bit<16> src_port; 43 | bit<16> dst_port; 44 | bit<32> seq_no; 45 | bit<32> ack_no; 46 | bit<4> data_offset; 47 | bit<4> res; 48 | bit<1> cwr; 49 | bit<1> ece; 50 | bit<1> urg; 51 | bit<1> ack; 52 | bit<1> psh; 53 | bit<1> rst; 54 | bit<1> syn; 55 | bit<1> fin; 56 | bit<16> window; 57 | bit<16> checksum; 58 | bit<16> urgent_ptr; 59 | } 60 | /* UDP header */ 61 | header udp_h { 62 | bit<16> src_port; 63 | bit<16> dst_port; 64 | bit<16> udp_total_len; 65 | bit<16> checksum; 66 | } 67 | 68 | /*Custom header for recirculation*/ 69 | header recirc_h { 70 | bit<8> class_result; 71 | } 72 | 73 | /*********************** H E A D E R S ************************/ 74 | struct my_ingress_headers_t { 75 | ethernet_h ethernet; 76 | recirc_h recirc; 77 | ipv4_h ipv4; 78 | tcp_h tcp; 79 | udp_h udp; 80 | } 81 | 82 | /****** G L O B A L I N G R E S S M E T A D A T A *********/ 83 | struct my_ingress_metadata_t { 84 | bit<1> is_first; 85 | bit<8> classified_flag; 86 | bit<1> is_hash_collision; 87 | 88 | bit<1> reg_status; 89 | bit<32> flow_ID; 90 | bit<(INDEX_WIDTH)> register_index; 91 | 92 | bit<16> hdr_srcport; 93 | bit<16> hdr_dstport; 94 | 95 | bit<8> pkt_count; 96 | bit<32> time_last_pkt; 97 | 98 | bit<32> iat; 99 | bit<16> pkt_len_max; 100 | bit<16> pkt_len_total; 101 | 102 | bit<32> flow_iat_max; 103 | bit<32> flow_iat_min; 104 | 105 | bit<8> class0; 106 | bit<8> class1; 107 | bit<8> class2; 108 | bit<8> class3; 109 | bit<8> class4; 110 | 111 | bit<8> final_class; 112 | 113 | bit<202> codeword0; 114 | bit<220> codeword1; 115 | bit<205> codeword2; 116 | bit<221> codeword3; 117 | bit<204> codeword4; 118 | } 119 | 120 | struct flow_class_digest { // maximum size allowed is 47 bytes 121 | 122 | ipv4_addr_t source_addr; // 32 bits 123 | ipv4_addr_t destin_addr; // 32 bits 124 | bit<16> source_port; 125 | bit<16> destin_port; 126 | bit<8> protocol; 127 | bit<8> class_value; 128 | bit<8> packet_num; 129 | bit<(INDEX_WIDTH)> register_index; // To send info to the controller 130 | } 131 | 132 | /************************************************************************* 133 | *********************** P A R S E R *********************************** 134 | *************************************************************************/ 135 | parser TofinoIngressParser( 136 | packet_in pkt, 137 | out ingress_intrinsic_metadata_t ig_intr_md) { 138 | state start { 139 | pkt.extract(ig_intr_md); 140 | transition select(ig_intr_md.resubmit_flag) { 141 | 1 : parse_resubmit; 142 | 0 : parse_port_metadata; 143 | } 144 | } 145 | state parse_resubmit { 146 | // Parse resubmitted packet here. 
147 | transition reject; 148 | } 149 | state parse_port_metadata { 150 | pkt.advance(PORT_METADATA_SIZE); 151 | transition accept; 152 | } 153 | } 154 | 155 | parser IngressParser(packet_in pkt, 156 | /* User */ 157 | out my_ingress_headers_t hdr, 158 | out my_ingress_metadata_t meta, 159 | /* Intrinsic */ 160 | out ingress_intrinsic_metadata_t ig_intr_md) 161 | { 162 | /* This is a mandatory state, required by Tofino Architecture */ 163 | TofinoIngressParser() tofino_parser; 164 | 165 | state start { 166 | tofino_parser.apply(pkt, ig_intr_md); 167 | transition parse_ethernet; 168 | } 169 | 170 | state parse_ethernet { 171 | pkt.extract(hdr.ethernet); 172 | transition select(hdr.ethernet.ether_type) { 173 | TYPE_RECIRC : parse_recirc; 174 | TYPE_IPV4: parse_ipv4; 175 | default: accept; 176 | } 177 | } 178 | 179 | state parse_recirc { 180 | pkt.extract(hdr.recirc); 181 | transition parse_ipv4; 182 | } 183 | 184 | state parse_ipv4 { 185 | pkt.extract(hdr.ipv4); 186 | meta.final_class=10; 187 | transition select(hdr.ipv4.protocol) { 188 | TYPE_TCP: parse_tcp; 189 | TYPE_UDP: parse_udp; 190 | default: accept; 191 | } 192 | } 193 | 194 | state parse_tcp { 195 | pkt.extract(hdr.tcp); 196 | meta.hdr_dstport = hdr.tcp.dst_port; 197 | meta.hdr_srcport = hdr.tcp.src_port; 198 | transition accept; 199 | } 200 | 201 | state parse_udp { 202 | pkt.extract(hdr.udp); 203 | meta.hdr_dstport = hdr.udp.dst_port; 204 | meta.hdr_srcport = hdr.udp.src_port; 205 | transition accept; 206 | } 207 | } 208 | 209 | /************************************************************************* 210 | ************** I N G R E S S P R O C E S S I N G ******************* 211 | *************************************************************************/ 212 | /***************** M A T C H - A C T I O N *********************/ 213 | control Ingress( 214 | /* User */ 215 | inout my_ingress_headers_t hdr, 216 | inout my_ingress_metadata_t meta, 217 | /* Intrinsic */ 218 | in ingress_intrinsic_metadata_t ig_intr_md, 219 | in ingress_intrinsic_metadata_from_parser_t ig_prsr_md, 220 | inout ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md, 221 | inout ingress_intrinsic_metadata_for_tm_t ig_tm_md) 222 | { 223 | action drop() { 224 | ig_dprsr_md.drop_ctl = 1; 225 | } 226 | 227 | /* Registers for flow management */ 228 | Register,bit<(INDEX_WIDTH)>>(MAX_REGISTER_ENTRIES) reg_classified_flag; 229 | /* Register read action */ 230 | RegisterAction,bit<(INDEX_WIDTH)>,bit<8>>(reg_classified_flag) 231 | update_classified_flag = { 232 | void apply(inout bit<8> classified_flag, out bit<8> output) { 233 | if (hdr.recirc.isValid()){ 234 | classified_flag = hdr.ipv4.ttl; 235 | } 236 | output = classified_flag; 237 | } 238 | }; 239 | 240 | Register,bit<(INDEX_WIDTH)>>(MAX_REGISTER_ENTRIES) reg_status; 241 | /* Register read action */ 242 | RegisterAction,bit<(INDEX_WIDTH)>,bit<1>>(reg_status) 243 | read_reg_status = { 244 | void apply(inout bit<1> status, out bit<1> output) { 245 | output = status; 246 | status = 1; 247 | } 248 | }; 249 | 250 | Register,bit<(INDEX_WIDTH)>>(MAX_REGISTER_ENTRIES) reg_flow_ID; 251 | /* Register read action */ 252 | RegisterAction,bit<(INDEX_WIDTH)>,bit<32>>(reg_flow_ID) 253 | update_flow_ID = { 254 | void apply(inout bit<32> flow_ID) { 255 | flow_ID = meta.flow_ID; 256 | } 257 | }; 258 | /* Register read action */ 259 | RegisterAction,bit<(INDEX_WIDTH)>,bit<32>>(reg_flow_ID) 260 | read_only_flow_ID = { 261 | void apply(inout bit<32> flow_ID, out bit<32> output) { 262 | output = flow_ID; 263 | } 264 | }; 265 | 266 | 
Register,bit<(INDEX_WIDTH)>>(MAX_REGISTER_ENTRIES) reg_time_last_pkt; 267 | /* Register read action */ 268 | RegisterAction,bit<(INDEX_WIDTH)>,bit<32>>(reg_time_last_pkt) 269 | read_time_last_pkt = { 270 | void apply(inout bit<32> time_last_pkt, out bit<32> output) { 271 | output = time_last_pkt; 272 | time_last_pkt = ig_prsr_md.global_tstamp[31:0]; 273 | } 274 | }; 275 | 276 | //registers for ML inference - features 277 | Register,bit<(INDEX_WIDTH)>>(MAX_REGISTER_ENTRIES) reg_pkt_count; 278 | /* Register read action */ 279 | RegisterAction,bit<(INDEX_WIDTH)>,bit<8>>(reg_pkt_count) 280 | read_pkt_count = { 281 | void apply(inout bit<8> pkt_count, out bit<8> output) { 282 | pkt_count = pkt_count + 1; 283 | output = pkt_count; 284 | } 285 | }; 286 | 287 | Register,bit<(INDEX_WIDTH)>>(MAX_REGISTER_ENTRIES) reg_pkt_len_max; 288 | /* Register read action */ 289 | RegisterAction,bit<(INDEX_WIDTH)>,bit<16>>(reg_pkt_len_max) 290 | read_pkt_len_max = { 291 | void apply(inout bit<16> pkt_len_max, out bit<16> output) { 292 | if (meta.is_first == 1){ 293 | pkt_len_max = hdr.ipv4.total_len; 294 | } 295 | else if (hdr.ipv4.total_len > pkt_len_max){ 296 | pkt_len_max = hdr.ipv4.total_len; 297 | } 298 | output = pkt_len_max; 299 | } 300 | }; 301 | 302 | Register,bit<(INDEX_WIDTH)>>(MAX_REGISTER_ENTRIES) reg_pkt_len_total; 303 | /* Register read action */ 304 | RegisterAction,bit<(INDEX_WIDTH)>,bit<16>>(reg_pkt_len_total) 305 | read_pkt_len_total = { 306 | void apply(inout bit<16> pkt_len_total, out bit<16> output) { 307 | if (meta.is_first == 1){ 308 | pkt_len_total = hdr.ipv4.total_len; 309 | } 310 | else{ 311 | pkt_len_total = pkt_len_total + hdr.ipv4.total_len; 312 | } 313 | output = pkt_len_total; 314 | } 315 | }; 316 | 317 | Register,bit<(INDEX_WIDTH)>>(MAX_REGISTER_ENTRIES) reg_flow_iat_max; 318 | /* Register read action */ 319 | RegisterAction,bit<(INDEX_WIDTH)>,bit<32>>(reg_flow_iat_max) 320 | read_flow_iat_max = { 321 | void apply(inout bit<32> flow_iat_max, out bit<32> output) { 322 | if (meta.is_first != 1){ 323 | if(meta.iat > flow_iat_max){ 324 | flow_iat_max = meta.iat; 325 | } 326 | } 327 | output = flow_iat_max; 328 | } 329 | }; 330 | 331 | Register,bit<(INDEX_WIDTH)>>(MAX_REGISTER_ENTRIES) reg_flow_iat_min; 332 | /* Register read action */ 333 | RegisterAction,bit<(INDEX_WIDTH)>,bit<32>>(reg_flow_iat_min) 334 | read_flow_iat_min = { 335 | void apply(inout bit<32> flow_iat_min, out bit<32> output) { 336 | if (meta.pkt_count <= 2){ 337 | flow_iat_min = meta.iat; 338 | } 339 | else if(meta.iat < flow_iat_min){ 340 | flow_iat_min = meta.iat; 341 | } 342 | output = flow_iat_min; 343 | } 344 | }; 345 | 346 | 347 | /* Declaration of the hashes*/ 348 | Hash>(HashAlgorithm_t.CRC32) flow_id_calc; 349 | Hash>(HashAlgorithm_t.CRC16) idx_calc; 350 | 351 | /* Calculate hash of the 5-tuple to represent the flow ID */ 352 | action get_flow_ID(bit<16> srcPort, bit<16> dstPort) { 353 | meta.flow_ID = flow_id_calc.get({hdr.ipv4.src_addr, 354 | hdr.ipv4.dst_addr,srcPort, dstPort, hdr.ipv4.protocol}); 355 | } 356 | /* Calculate hash of the 5-tuple to use as 1st register index */ 357 | action get_register_index(bit<16> srcPort, bit<16> dstPort) { 358 | meta.register_index = idx_calc.get({hdr.ipv4.src_addr, 359 | hdr.ipv4.dst_addr,srcPort, dstPort, hdr.ipv4.protocol}); 360 | } 361 | 362 | /* Assign class if at leaf node */ 363 | action SetClass0(bit<8> classe) { 364 | meta.class0 = classe; 365 | } 366 | action SetClass1(bit<8> classe) { 367 | meta.class1 = classe; 368 | } 369 | action SetClass2(bit<8> classe) { 
370 | meta.class2 = classe; 371 | } 372 | action SetClass3(bit<8> classe) { 373 | meta.class3 = classe; 374 | } 375 | action SetClass4(bit<8> classe) { 376 | meta.class4 = classe; 377 | } 378 | 379 | /* Compute packet interarrival time (IAT)*/ 380 | action get_iat_value(){ 381 | meta.iat = ig_prsr_md.global_tstamp[31:0] - meta.time_last_pkt; 382 | } 383 | 384 | /* Forward to a specific port upon classification */ 385 | action ipv4_forward(PortId_t port) { 386 | ig_tm_md.ucast_egress_port = port; 387 | } 388 | 389 | /* Custom Do Nothing Action */ 390 | action nop(){} 391 | 392 | /* Recirculate packet via loopback port 68 */ 393 | action recirculate(bit<7> recirc_port) { 394 | ig_tm_md.ucast_egress_port[8:7] = ig_intr_md.ingress_port[8:7]; 395 | ig_tm_md.ucast_egress_port[6:0] = recirc_port; 396 | hdr.recirc.setValid(); 397 | hdr.recirc.class_result = meta.final_class; 398 | hdr.ethernet.ether_type = TYPE_RECIRC; 399 | } 400 | 401 | /* Feature table actions */ 402 | action SetCode0(bit<29> code0, bit<30> code1, bit<23> code2, bit<31> code3, bit<28> code4) { 403 | meta.codeword0[201:173] = code0; 404 | meta.codeword1[219:190] = code1; 405 | meta.codeword2[204:182] = code2; 406 | meta.codeword3[220:190] = code3; 407 | meta.codeword4[203:176] = code4; 408 | } 409 | action SetCode1(bit<53> code0, bit<64> code1, bit<63> code2, bit<60> code3, bit<42> code4) { 410 | meta.codeword0[172:120] = code0; 411 | meta.codeword1[189:126] = code1; 412 | meta.codeword2[181:119] = code2; 413 | meta.codeword3[189:130] = code3; 414 | meta.codeword4[175:134] = code4; 415 | } 416 | action SetCode2(bit<57> code0, bit<48> code1, bit<60> code2, bit<54> code3, bit<52> code4) { 417 | meta.codeword0[119:63] = code0; 418 | meta.codeword1[125:78] = code1; 419 | meta.codeword2[118:59] = code2; 420 | meta.codeword3[129:76] = code3; 421 | meta.codeword4[133:82] = code4; 422 | } 423 | action SetCode3(bit<63> code0, bit<78> code1, bit<59> code2, bit<76> code3, bit<82> code4) { 424 | meta.codeword0[62:0] = code0; 425 | meta.codeword1[77:0] = code1; 426 | meta.codeword2[58:0] = code2; 427 | meta.codeword3[75:0] = code3; 428 | meta.codeword4[81:0] = code4; 429 | } 430 | 431 | /* Feature tables */ 432 | table table_feature0{ 433 | key = {meta.flow_iat_min[31:17]: range @name("feature0");} 434 | actions = {@defaultonly nop; SetCode0;} 435 | size = 64; 436 | const default_action = nop(); 437 | } 438 | table table_feature1{ 439 | key = {meta.pkt_len_max: range @name("feature1");} 440 | actions = {@defaultonly nop; SetCode1;} 441 | size = 160; 442 | const default_action = nop(); 443 | } 444 | table table_feature2{ 445 | key = {meta.flow_iat_max[31:24]: range @name("feature2");} 446 | actions = {@defaultonly nop; SetCode2;} 447 | size = 112; 448 | const default_action = nop(); 449 | } 450 | table table_feature3{ 451 | key = {meta.pkt_len_total: range @name("feature3");} 452 | actions = {@defaultonly nop; SetCode3;} 453 | size = 244; 454 | const default_action = nop(); 455 | } 456 | 457 | /* Code tables */ 458 | table code_table0{ 459 | key = {meta.codeword0: ternary;} 460 | actions = {@defaultonly nop; SetClass0;} 461 | size = 203; 462 | const default_action = nop(); 463 | } 464 | table code_table1{ 465 | key = {meta.codeword1: ternary;} 466 | actions = {@defaultonly nop; SetClass1;} 467 | size = 221; 468 | const default_action = nop(); 469 | } 470 | table code_table2{ 471 | key = {meta.codeword2: ternary;} 472 | actions = {@defaultonly nop; SetClass2;} 473 | size = 206; 474 | const default_action = nop(); 475 | } 476 | table 
code_table3{ 477 | key = {meta.codeword3: ternary;} 478 | actions = {@defaultonly nop; SetClass3;} 479 | size = 222; 480 | const default_action = nop(); 481 | } 482 | table code_table4{ 483 | key = {meta.codeword4: ternary;} 484 | actions = {@defaultonly nop; SetClass4;} 485 | size = 205; 486 | const default_action = nop(); 487 | } 488 | 489 | action set_default_result() { 490 | meta.final_class = meta.class0; 491 | ig_dprsr_md.digest_type = 1; 492 | recirculate(68); 493 | } 494 | 495 | action set_final_class(bit<8> class_result) { 496 | meta.final_class = class_result; 497 | ig_dprsr_md.digest_type = 1; 498 | recirculate(68); 499 | } 500 | 501 | table voting_table { 502 | key = { 503 | meta.class0: exact; 504 | meta.class1: exact; 505 | meta.class2: exact; 506 | meta.class3: exact; 507 | meta.class4: exact; 508 | } 509 | actions = {set_final_class; @defaultonly set_default_result;} 510 | size = 5256; 511 | const default_action = set_default_result(); 512 | } 513 | 514 | /* Forwarding-Inference Block Table */ 515 | action set_flow_class(bit<8> f_class) { 516 | meta.final_class = f_class; 517 | } 518 | table target_flows_table { 519 | key = { 520 | hdr.ipv4.src_addr: exact; 521 | hdr.ipv4.dst_addr: exact; 522 | meta.hdr_srcport: exact; 523 | meta.hdr_dstport: exact; 524 | hdr.ipv4.protocol: exact; 525 | } 526 | actions = {set_flow_class; @defaultonly drop;} 527 | size = 500; 528 | const default_action = drop(); 529 | } 530 | 531 | apply { 532 | // filter for background or already classified traffic 533 | target_flows_table.apply(); 534 | 535 | // get flow ID and register index 536 | bit<32> tmp_flow_ID; 537 | get_flow_ID(meta.hdr_srcport, meta.hdr_dstport); 538 | get_register_index(meta.hdr_srcport, meta.hdr_dstport); 539 | 540 | if(meta.final_class==0){ //flow not classified 541 | 542 | // check if register for emptiness 543 | meta.reg_status = read_reg_status.execute(meta.register_index); 544 | 545 | // check if register array is empty 546 | if (meta.reg_status == 0){ // we do not yet know this flow 547 | meta.is_first = 1; 548 | update_flow_ID.execute(meta.register_index); 549 | // modify timestamp register 550 | meta.time_last_pkt = read_time_last_pkt.execute(meta.register_index); 551 | meta.pkt_count = read_pkt_count.execute(meta.register_index); 552 | meta.pkt_len_max = read_pkt_len_max.execute(meta.register_index); 553 | meta.pkt_len_total = read_pkt_len_total.execute(meta.register_index); 554 | ipv4_forward(260); 555 | } 556 | else { // not the first packet - get flow_ID from register 557 | meta.is_first = 0; 558 | tmp_flow_ID = read_only_flow_ID.execute(meta.register_index); 559 | if(meta.flow_ID != tmp_flow_ID){ // hash collision 560 | meta.pkt_count = 0; //hash col 561 | // send digest to inform controller of the collision 562 | ig_dprsr_md.digest_type = 1; 563 | ipv4_forward(260); 564 | } 565 | else { // not first packet and not hash collision 566 | //read and update packet count 567 | meta.pkt_count = read_pkt_count.execute(meta.register_index); 568 | 569 | // read and update packet length features 570 | meta.pkt_len_max = read_pkt_len_max.execute(meta.register_index); 571 | meta.pkt_len_total = read_pkt_len_total.execute(meta.register_index); 572 | 573 | // modify timestamp register 574 | meta.time_last_pkt = read_time_last_pkt.execute(meta.register_index); 575 | 576 | // compute IAT value 577 | get_iat_value(); 578 | 579 | //read and update IAT features 580 | meta.flow_iat_max = read_flow_iat_max.execute(meta.register_index); 581 | meta.flow_iat_min = 
read_flow_iat_min.execute(meta.register_index); 582 | 583 | // check if # of packets requirement is met 584 | if(meta.pkt_count == 8){ 585 | 586 | // apply feature tables to assign codes 587 | table_feature0.apply(); 588 | table_feature1.apply(); 589 | table_feature2.apply(); 590 | table_feature3.apply(); 591 | 592 | // apply code tables to assign labels 593 | code_table0.apply(); 594 | code_table1.apply(); 595 | code_table2.apply(); 596 | code_table3.apply(); 597 | code_table4.apply(); 598 | 599 | // decide final class 600 | voting_table.apply(); 601 | } 602 | else{ // this happens to first packets and packet number 5 onwards 603 | meta.classified_flag = update_classified_flag.execute(meta.register_index); 604 | 605 | if (meta.classified_flag != 0) {//No need to check again - already classified 606 | hdr.recirc.setInvalid(); 607 | hdr.ethernet.ether_type = TYPE_IPV4; 608 | //set value of ttl to classification result (stats only) 609 | hdr.ipv4.ttl = meta.classified_flag; 610 | } 611 | ipv4_forward(260); 612 | } //END OF CHECK FOR PREVIOUS CLASSIFICATION 613 | } //END OF CHECK ON IF NO COLLISION 614 | } // END OF CHECK ON WHETHER FIRST CLASS 615 | } 616 | ipv4_forward(260); 617 | } //END OF APPLY 618 | } //END OF INGRESS CONTROL 619 | 620 | /************************************************************************* 621 | *********************** D E P A R S E R ******************************* 622 | *************************************************************************/ 623 | 624 | control IngressDeparser(packet_out pkt, 625 | /* User */ 626 | inout my_ingress_headers_t hdr, 627 | in my_ingress_metadata_t meta, 628 | /* Intrinsic */ 629 | in ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md) 630 | { 631 | Digest() digest; 632 | 633 | apply { 634 | 635 | if (ig_dprsr_md.digest_type == 1) { 636 | // Pack digest and send to controller 637 | digest.pack({hdr.ipv4.src_addr, hdr.ipv4.dst_addr, meta.hdr_srcport, meta.hdr_dstport, hdr.ipv4.protocol, meta.final_class, meta.pkt_count, meta.register_index}); 638 | } 639 | 640 | /* we do not update checksum because we used ttl field for stats*/ 641 | pkt.emit(hdr); 642 | } 643 | } 644 | 645 | /************************************************************************* 646 | **************** E G R E S S P R O C E S S I N G ******************* 647 | *************************************************************************/ 648 | struct my_egress_headers_t { 649 | } 650 | 651 | /******** G L O B A L E G R E S S M E T A D A T A *********/ 652 | 653 | struct my_egress_metadata_t { 654 | } 655 | 656 | /*********************** P A R S E R **************************/ 657 | 658 | parser EgressParser(packet_in pkt, 659 | /* User */ 660 | out my_egress_headers_t hdr, 661 | out my_egress_metadata_t meta, 662 | /* Intrinsic */ 663 | out egress_intrinsic_metadata_t eg_intr_md) 664 | { 665 | /* This is a mandatory state, required by Tofino Architecture */ 666 | state start { 667 | pkt.extract(eg_intr_md); 668 | transition accept; 669 | } 670 | } 671 | 672 | /***************** M A T C H - A C T I O N *********************/ 673 | 674 | control Egress( 675 | /* User */ 676 | inout my_egress_headers_t hdr, 677 | inout my_egress_metadata_t meta, 678 | /* Intrinsic */ 679 | in egress_intrinsic_metadata_t eg_intr_md, 680 | in egress_intrinsic_metadata_from_parser_t eg_prsr_md, 681 | inout egress_intrinsic_metadata_for_deparser_t eg_dprsr_md, 682 | inout egress_intrinsic_metadata_for_output_port_t eg_oport_md) 683 | { 684 | apply { 685 | } 686 | } 687 | 688 | 
/********************* D E P A R S E R ************************/ 689 | 690 | control EgressDeparser(packet_out pkt, 691 | /* User */ 692 | inout my_egress_headers_t hdr, 693 | in my_egress_metadata_t meta, 694 | /* Intrinsic */ 695 | in egress_intrinsic_metadata_for_deparser_t eg_dprsr_md) 696 | { 697 | apply { 698 | pkt.emit(hdr); 699 | } 700 | } 701 | 702 | /************************************************************************* 703 | *********************** S W I T C H ******************************* 704 | *************************************************************************/ 705 | Pipeline( 706 | IngressParser(), 707 | Ingress(), 708 | IngressDeparser(), 709 | EgressParser(), 710 | Egress(), 711 | EgressDeparser() 712 | ) pipe; 713 | 714 | Switch(pipe) main; 715 | -------------------------------------------------------------------------------- /In_switch_ETC/Switch/readme.md: -------------------------------------------------------------------------------- 1 | To run the code: 2 | - check the code and change the forwarding port 260 to the right one in your setup 3 | - compile the P4 code 4 | - we used the Intel SDE version 9.7.0 5 | - load the code onto the switch using _bf_switchd_ 6 | - load the table entries in the _table_entries.py_ file using _bfrt_python_ 7 | - this also configures and brings up ports 1, 5 and 9 (56, 260, 292) which we use. 8 | - modify them according to your setup. 9 | - run the _controller_digest_noms.py_ script to enable the controller to collect packet digests with classification results, clean registers after flows are classified, and update the flow table. 10 | - give your output csv file as an argument to this script when running it. 11 | - send packets from the pcap files through the switch using tcpreplay 12 | - the current configuration has a filter table in the P4 program that will filter only the flows belonging to the test data for classification in order to easily compare them with the offline results. 13 | - use the function at the end of the _NIMS2023_Data_Analysis.ipynb_ notebook to analyze the csv obtained at the controller at the end of the experiment. 14 | -------------------------------------------------------------------------------- /Offline_ETC/README.md: -------------------------------------------------------------------------------- 1 | # ENCRYPTED TRAFFIC CLASSIFICATION 2 | 3 | This repo contains the scripts used for the data preparation and data engineering used in the paper [Encrypted Traffic Classification at Line Rate in Programmable Switches with Machine Learning](https://dspace.networks.imdea.org/handle/20.500.12761/1791) by Aristide Tanyi-Jong Akem, Guillaume Fraysse, Marco Fiore presented at IEEE/IFIP Network Operations and Management Symposium (NOMS) 2024. 4 | 5 | ## Datasets 6 | 5 different datasets containing Encrypted data were considered. Some are public some require a subscription to access them. The first three were kept in the paper: 7 | * The [ISCXVPN2016 Dataset](http://dx.doi.org/10.5220/0005740704070414) dataset (from Draper-Gil, G.; Lashkari, A.; Mamun, M. and A. Ghorbani, A. (2016). Characterization of Encrypted and VPN Traffic using Time-related Features. In Proceedings of the 2nd International Conference on Information Systems Security and Privacy - ICISSP; ISBN 978-989-758-167-0; ISSN 2184-4356, SciTePress, pages 407-414. DOI: 10.5220/0005740704070414) is a popular labeled dataset made available by the Canadian Institute of Cybersecurity (CIC) from University of New Brunswick (UNB). 
It comprises about 28GB of traffic data captured using tcpdump and Wireshark. A subset of this dataset is made of VPN data, generated using an external VPN service. For this work the dataset was processed 8 | from the raw PCAP files using the pipeline described in Section 2 to keep only the VPN subset and aggregate the packets into flows. This results in 4960 flows. This dataset includes 7 classes of encrypted traffic: Browsing, Email, 9 | Chat, Streaming, File Transfer, VoIP, and P2P. Figure 6a of the paper shows the distribution of the samples. 10 | * The **NOMS2023 Encrypted Mobile Instant Messaging Traffic Dataset** (by Zolboo Erdenebaatar, Riyad Alshammari, Nur Zincir-Heywood, Marwa Elsayed, Biswajit Nandy, Nabil Seddigh, January 23, 2023, "Encrypted Mobile Instant Messaging Traffic Dataset", IEEE Dataport) can be downloaded at [https://dx.doi.org/10.21227/aer2-kq52](https://dx.doi.org/10.21227/aer2-kq52). It is divided into 7 files in zip format. Six of these files contain data from traffic to commonly used Instant Messaging applications (Discord, Facebook Messenger, Signal, Microsoft Teams, Telegram and WhatsApp). The last file (non_ima_encrypted_traffic.zip) contains encrypted traffic that does not belong to any of these classes, i.e. traffic that is not from Instant Messaging applications. It contains four classes; the first three are other types of usage: Gmail, WebBrowsing and YouTube. The last class, Background, contains all background 11 | traffic, i.e. traffic recorded during the same period that does not belong to any of the classes identified by the other applications. For this work we considered only the data from the 6 Instant Messaging applications and the classification 12 | of traffic into these six classes. The subset of the dataset that is then considered contains 6 different classes: Discord, Facebook Messenger, Signal, Microsoft Teams, Telegram and WhatsApp. 13 | * The Netflow QUIC dataset from [V. Tong, H. A. Tran, S. Souihi and A. Mellouk, "A Novel QUIC Traffic Classifier Based on Convolutional Neural Networks," 2018 IEEE Global Communications Conference (GLOBECOM), Abu Dhabi, United Arab Emirates, 2018, pp. 1-6, doi: 10.1109/GLOCOM.2018.8647128.](https://ieeexplore.ieee.org/abstract/document/8647128) is a labeled dataset of QUIC traffic to Google services. This dataset is significantly larger than the others, with 365,000 flows and a total of 136 million packets. It contains traffic classified into 5 different classes of Google services: Chat, VoIP, FileTransfer, YouTube video streaming, and Google Play Music. 14 | * [UC Davis](https://doi.org/10.48550/arXiv.1812.09761): The UCDavis QUIC Dataset is a labeled dataset that can be downloaded 15 | at [https://drive.google.com/drive/folders/1Pvev0hJ82usPh6dWDlz7Lv8L6h3JpWhE](https://drive.google.com/drive/folders/1Pvev0hJ82usPh6dWDlz7Lv8L6h3JpWhE) (file pretraining.zip). Traffic on different services offered by Google was captured by the University of California, Davis (UC Davis) team. The data was collected using AutoIt and Selenium WebDriver scripts on different systems running various versions of Windows and Ubuntu Linux. Only the QUIC traffic was kept. This dataset contains 5 classes corresponding to 5 different Google services: Google Drive, YouTube, Google Doc, Google Search and Google Music. 16 | * The CSTNET TLS1.3 dataset (by [Lin, X., Xiong, G., Gou, G., Li, Z., Shi, J. and Yu, J., 2022, April.
Et-bert: A contextualized datagram representation with pre-training transformers for encrypted traffic classification. In Proceedings of the ACM Web Conference 2022 (pp. 633-642)](https://dl.acm.org/doi/abs/10.1145/3485447.3512217)) is a labeled dataset of encrypted traffic to a large number (120) of services. This number of classes is an order of magnitude larger than in the other 4 datasets. It is probably more realistic from the perspective of a network operator, whose customers generate traffic not only to a handful of services but to any service on the Internet. This dataset contains data from 120 classes, each of which is labeled by the domain name of an application (e.g. google.com, elsevier.com, ...). 17 | 18 | ## Data preparation 19 | Most datasets are in raw PCAP format. We have performed two steps: 20 | * convert PCAP to CSV 21 | * compute the flow for each packet. A flow is identified by a 5-tuple (IP src, port src, IP dst, port dst, protocol). Every packet with the same value for the tuple gets associated with a unique flow id. This step adds a new column with this flow id to the CSV file. 22 | 23 | ### PCAP to CSV 24 | After initially using a Python script based on Scapy, we moved to tshark for better performance on the larger datasets. 25 | We used the tshark command, cf. the script **data_preparation/pcap2csv.sh**. Once you have downloaded a dataset, you can run the script on each of the PCAP files and redirect the output to a CSV file: 26 | 27 | ```bash 28 | bash data_preparation/pcap2csv.sh datafile.pcap > datafile.csv 29 | ``` 30 | 31 | ### Add the flow id column 32 | 33 | To add the flow id information to the dataset, we have developed the Python script **data_preparation/pkts2flows.py**; its core flow-id computation is sketched below. 34 | To use it, simply change the placeholder values *inputdir* and *outputdir* in the script. 35 | * *inputdir* must point to the directory where the csv files of the dataset are stored. 36 | * *outputdir* must point to the folder where you want the new files to be written.
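The flow-id assignment in **pkts2flows.py** is a pandas `groupby` over the 5-tuple followed by `ngroup()`, which gives every distinct tuple a unique integer id. The snippet below is a minimal sketch of that step for a single CSV file (file names are placeholders; column names follow the output of **pcap2csv.sh**; the full script additionally renumbers flow ids so that they remain consecutive across all files of a dataset):

```python
import pandas as pd

# Read one CSV file produced by pcap2csv.sh
df = pd.read_csv(
    "datafile.csv",
    names=["packet_id", "timestamp", "iat", "src", "psrc",
           "dst", "pdst", "protocol", "length"],
    header=0,
)

# Packets sharing the same 5-tuple (src IP, src port, dst IP, dst port, protocol)
# belong to the same flow; ngroup() assigns each group a unique integer id.
df["flow_id"] = df.groupby(["src", "psrc", "dst", "pdst", "protocol"]).ngroup()

df.to_csv("datafile_with_flows.csv", index=False)
```

Then run the script on the whole input directory: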
37 | 38 | ```bash 39 | python data_preparation/pkts2flows.py 40 | ``` 41 | -------------------------------------------------------------------------------- /Offline_ETC/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nds-group/ETC_NOMS_2024/1ad4bbd4e1c097e87625b185e6d0ba14fa201bbb/Offline_ETC/__init__.py -------------------------------------------------------------------------------- /Offline_ETC/cstnet-tls13_traffic_classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | from os import listdir 6 | from os.path import isfile, join 7 | import sys 8 | import time 9 | 10 | import pandas as pd 11 | 12 | import numpy as np 13 | 14 | from scipy.stats import kurtosis, skew 15 | 16 | from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures 17 | from sklearn.compose import ColumnTransformer 18 | from sklearn.pipeline import Pipeline 19 | from sklearn.metrics import classification_report, f1_score, confusion_matrix, ConfusionMatrixDisplay 20 | 21 | import seaborn as sns 22 | import matplotlib.pyplot as plt 23 | 24 | from encrypted_traffic_classification import EncryptedTrafficClassifier, EncryptedTrafficClassifierIterator 25 | 26 | ######################################## 27 | # Data preparation: convert RAW data 28 | ######################################## 29 | class CstNetTls13Classifier(EncryptedTrafficClassifier): 30 | def __init__(self, nb_folds, nb_packets_per_flow): 31 | super().__init__( 32 | nb_folds= nb_folds, 33 | nb_packets_per_flow = nb_packets_per_flow, 34 | filename_prefix = "cstnet_tls13", 35 | processed_data_output_dir = "cstnet_tls13_output/", 36 | data_dir = "data/cstnet_tls13/" 37 | ) 38 | 39 | pools = [tuple(pool) for pool in [self.nb_packets_per_flow, range(self.nb_folds)]] 40 | result = [[]] 41 | for pool in pools: 42 | result = [x+[y] for x in result for y in pool] 43 | self.flow_ids = result 44 | 45 | pools = [tuple(pool) for pool in [self.nb_packets_per_flow]] 46 | result = [[]] 47 | for pool in pools: 48 | result = [x+[y] for x in result for y in pool] 49 | self.flow_ids_without_folds = result 50 | 51 | pools = [tuple(pool) for pool in [range(self.nb_folds)]] 52 | result = [[]] 53 | for pool in pools: 54 | result = [x+[y] for x in result for y in pool] 55 | self.packet_ids = result 56 | 57 | ######################################## 58 | # Preprocessing 59 | ######################################## 60 | def _get_flows_with_all_packets(self): 61 | print("_get_flows_with_all_packets") 62 | start_time = time.time() 63 | subdirs = sorted([f for f in listdir(self.data_dir)]) 64 | nb_flows = 0 65 | df_flows = pd.DataFrame() 66 | self.classes = set() 67 | for subdir in subdirs: 68 | # print("subdir", self.data_dir+subdir) 69 | _files = sorted([f for f in listdir(self.data_dir + subdir) if isfile(join(self.data_dir + subdir, f))]) 70 | # print(" files", _files) 71 | for _i in range(len(_files)): 72 | f = self.data_dir + subdir + "/" + _files[_i] 73 | 74 | df_new = pd.read_csv(f, 75 | names = [ 76 | 'flow_id', 77 | 'timestamp', 78 | 'iat', 79 | 'source', 80 | 'sport', 81 | 'dest', 82 | 'dport', 83 | 'protocol', 84 | 'length' 85 | ], 86 | header = 0 87 | ) 88 | print(f, df_new.shape) 89 | 90 | # drop DNS traffic 91 | df_new = df_new.drop(df_new[df_new['sport'] == 53].index) 92 | df_new = df_new.drop(df_new[df_new['dport'] == 53].index) 93 | 94 | found = 
False 95 | for _c in self.all_classes: 96 | if _c in f: 97 | found = True 98 | df_new['class'] = _c 99 | self.classes.add(_c) 100 | break 101 | if found == False: 102 | print("class not identified for", f) 103 | 104 | # extract flow and add statistical features 105 | dfs = [] 106 | for flow_id in df_new['flow_id'].unique(): 107 | nb_flows += 1 108 | d = df_new[df_new['flow_id'] == flow_id].head(n = 1) 109 | d['nb_packets'] = len(df_new[df_new['flow_id'] == flow_id]) 110 | c = d['class'].tolist() 111 | dport = d.dport.tolist() 112 | sport = d.sport.tolist() 113 | #print(d) 114 | _df = df_new.loc[df_new['flow_id'] == flow_id, 'iat'] 115 | d['sum_iat'] = np.sum(_df) 116 | 117 | _df = df_new.loc[df_new['flow_id'] == flow_id, 'length'] 118 | d['sum_length'] = np.sum(_df) 119 | d['src'] = f 120 | dfs.append(d) 121 | _d = pd.concat(dfs) 122 | df_flows = pd.concat([_d, df_flows]) 123 | # For debugging 124 | # break 125 | 126 | print(" processing took ", time.time() - start_time, "seconds.") 127 | print("%d flows processed" % nb_flows) 128 | # Finish processing the data, create the train/tests split and save as pickle files 129 | df_flows = df_flows.fillna(0) 130 | 131 | self.classes = list(self.classes) 132 | self._hotencode_class(df_flows) 133 | return df_flows 134 | 135 | def data_preparation(self): 136 | print("data_preparation") 137 | import warnings 138 | warnings.filterwarnings("ignore") 139 | 140 | df_flows = {} 141 | files = [] 142 | subdirs = [f for f in listdir(self.data_dir)] 143 | for subdir in subdirs: 144 | # print("subdir", self.data_dir+subdir) 145 | _files = [f for f in listdir(self.data_dir + subdir) if isfile(join(self.data_dir + subdir, f))] 146 | # print(" files", _files) 147 | for _i in range(len(_files)): 148 | _files[_i] = self.data_dir + subdir + "/" + _files[_i] 149 | files += _files 150 | 151 | # print(files) 152 | for i in self.nb_packets_per_flow: 153 | self.__generate_pickle_for_n_packets(i, files) 154 | 155 | def __generate_pickle_for_n_packets(self, n, files): 156 | print("__generate_pickle_for_n_packets n =", n) 157 | nb_flows = 0 158 | df_flows = pd.DataFrame() 159 | # dfs = [] 160 | self.classes = set() 161 | for f in files: 162 | # print("f=", f) 163 | df_new = pd.read_csv(f, 164 | names = [ 165 | 'flow_id', 166 | 'timestamp', 167 | 'iat', 168 | 'source', 169 | 'sport', 170 | 'dest', 171 | 'dport', 172 | 'protocol', 173 | 'length' 174 | ], 175 | header = 0 176 | ) 177 | print(n, f, df_new.shape) 178 | 179 | # drop DNS traffic 180 | df_new = df_new.drop(df_new[df_new['sport'] == 53].index) 181 | df_new = df_new.drop(df_new[df_new['dport'] == 53].index) 182 | 183 | found = False 184 | for _c in self.all_classes: 185 | if _c in f: 186 | found = True 187 | df_new['class'] = _c 188 | self.classes.add(_c) 189 | break 190 | if found == False: 191 | print("class not identified for", f) 192 | 193 | # extract flow and add statistical features 194 | for flow_id in df_new['flow_id'].unique(): 195 | nb_flows += 1 196 | _df_new = df_new[df_new['flow_id'] == flow_id].head(n = n) 197 | d = _df_new.head(n = 1) 198 | d['nb_packets'] = len(_df_new) #df_new[df_new['flow_id'] == flow_id]) 199 | c = d['class'].tolist() 200 | dport = d.dport.tolist() 201 | sport = d.sport.tolist() 202 | #print(d) 203 | _df = _df_new['iat'] 204 | d['min_iat'] = np.min(df_new[df_new['iat'] > 0]['iat']) # probably useless as most probably always 0 for the first packet 205 | d['max_iat'] = np.max(_df) 206 | d['sum_iat'] = np.sum(_df) 207 | d['mean_iat'] = np.mean(_df) 208 | d['median_iat'] = 
np.median(_df) 209 | d['std_iat'] = np.std(_df) 210 | d['1stQ_iat'] = np.quantile(_df, 0.25) 211 | d['3rdQ_iat'] = np.quantile(_df, 0.75) 212 | _a = list(_df) 213 | d['skew_iat'] = skew(_a) 214 | d['kurt_iat'] = kurtosis(_a) 215 | 216 | _df = _df_new['length'] 217 | d['min_length'] = np.min(_df) 218 | d['max_length'] = np.max(_df) 219 | d['sum_length'] = np.sum(_df) 220 | d['median_length'] = np.median(_df) 221 | d['mean_length'] = np.mean(_df) 222 | d['std_length'] = np.std(_df) 223 | d['1stQ_length'] = np.quantile(_df, 0.25) 224 | d['3rdQ_length'] = np.quantile(_df, 0.75) 225 | _a = list(_df) 226 | d['skew_length'] = skew(_a) 227 | # d['skew_length'] = skew(np.array(df_new.loc[df_new['flow_id'] == flow_id, 'length'])) 228 | d['kurt_length'] = kurtosis(_a) 229 | d['src'] = f 230 | # dfs.append(d) 231 | df_flows = pd.concat([d, df_flows]) 232 | # if nb_flows > 20: 233 | # break 234 | 235 | print("%d flows processed" % nb_flows) 236 | # Finish processing the data, create the train/tests split and save as pickle files 237 | df_flows = df_flows.fillna(0) 238 | 239 | self.classes = list(self.classes) 240 | self._hotencode_class(df_flows) 241 | 242 | filename = self.filename_prefix + "_" + str(n) + ".pickle" 243 | # filename = "cstnet_tls13_" + str(n) + ".pickle" 244 | self._generate_data_folds(df_flows, filename) 245 | 246 | ######################################## 247 | # Data Analysis 248 | ######################################## 249 | """ 250 | def __show_actual_and_predicted(self, X, y, y_pred, _class): 251 | print(self.classes) 252 | for _i in itertools.product(NB_PACKETS, self.filenames): 253 | i = (_i[0], _i[1], 0) 254 | print(i) 255 | df = X[i].copy() 256 | df['type'] = y[i] 257 | df['type_pred'] = y_pred[i] 258 | print(df.columns) 259 | a4_dims = (23.4, 16.54) 260 | fig, ax = plt.subplots(figsize = a4_dims) 261 | sns.lmplot( 262 | x = 'sum_iat', 263 | y = 'sum_length', 264 | data = df[df['type'] == _class], 265 | hue = 'type', 266 | fit_reg = False, 267 | height = 4, aspect = 5, 268 | # color = 'green', 269 | # scatter_kws = {'alpha': 0.3}, 270 | # ax = ax, 271 | legend = False, 272 | palette = 'viridis' 273 | ) 274 | #ax.set(xlabel='time_delta', ylabel='packet_size') 275 | ax.set(xlabel = 'duration', ylabel = 'sum_packet_size') 276 | plt.legend(title = 'Class', labels =self.classes) 277 | plt.savefig("cstnet_tls13_" + self.classes[_class] + "_"+ str(i[0]) + "_" + i[1]+".png", format = 'png') 278 | fig, ax2 = plt.subplots(figsize = a4_dims) 279 | sns.lmplot( 280 | x = 'sum_iat', 281 | y = 'sum_length', 282 | data = df[df['type_pred'] == _class], 283 | hue = 'type', 284 | fit_reg = False, 285 | height = 4, aspect = 5, 286 | # color = 'orange', 287 | # scatter_kws = {'alpha': 0.3}, 288 | legend = False, 289 | palette = 'viridis', 290 | # ax = ax2 291 | ) 292 | ax2.set(xlabel = 'duration', ylabel = 'sum_packet_size') 293 | plt.legend(title = 'Class', labels =self.classes) 294 | plt.savefig(self.filename_prefix + "_" + self.classes[_class] + "_pred_"+ str(i[0]) + "_" + i[1]+".png", format = 'png') 295 | """ 296 | ######################################## 297 | # Entry point 298 | ######################################## 299 | if __name__ == "__main__": 300 | parser = argparse.ArgumentParser( 301 | prog='cstnet_tls13_instant_messaging_traffic_classifier', 302 | description='Classify packets or flows from CTSNET TTLS1.3 dataset', 303 | epilog='' 304 | ) 305 | parser.add_argument('-p', '--nb_packets', action = 'append', type = int, required = True) #, default = [4, 8]) 306 | 
parser.add_argument('-c', '--classifier', action = 'append', type = str) #, default = ['rf']) 307 | parser.add_argument('-f', '--nb_folds', action = 'store', default = 12, type = int) 308 | parser.add_argument('-v', '--visualization', action = 'store_true', required = False, default = False) 309 | parser.add_argument('-r', '--report', action = 'store_true', required = False, default = False) 310 | parser.add_argument('-F', '--force_rf_classification', action = 'store_true', required = False, default = False) 311 | args = parser.parse_args(sys.argv[1:]) 312 | 313 | VISUALIZATION_ENABLED = False 314 | if args.visualization == True: 315 | VISUALIZATION_ENABLED = True 316 | 317 | RF_ENABLED = False 318 | GB_ENABLED = False 319 | XG_ENABLED = False 320 | for c in args.classifier: 321 | c = c.lower() 322 | if c == "rf": 323 | RF_ENABLED = True 324 | elif c == "gb": 325 | GB_ENABLED = True 326 | elif c == "xg": 327 | XG_ENABLED = True 328 | else: 329 | print("Unknown classifier", c) 330 | 331 | classifier = CstNetTls13Classifier( 332 | nb_folds = args.nb_folds, 333 | nb_packets_per_flow = args.nb_packets 334 | ) 335 | 336 | if args.force_rf_classification == True: 337 | classifier.force_rf_classification = True 338 | 339 | classifier.all_classes = [ 340 | "163.com", 341 | "chia.net", 342 | "github.com", 343 | "leetcode-cn.com", 344 | "qcloud.com", 345 | "toutiao.com", 346 | "51cto.com", 347 | "chinatax.gov.cn", 348 | "gitlab.com", 349 | "media.net", 350 | "qq.com", 351 | "twimg.com", 352 | "51.la", 353 | "cisco.com", 354 | "gmail.com", 355 | "mi.com", 356 | "researchgate.net", 357 | "twitter.com", 358 | "acm.org", 359 | "cloudflare.com", 360 | "goat.com", 361 | "microsoft.com", 362 | "runoob.com", 363 | "unity3d.com", 364 | "adobe.com", 365 | "cloudfront.net", 366 | "google.com", 367 | "mozilla.org", 368 | "sciencedirect.com", 369 | "v2ex.com", 370 | "alibaba.com", 371 | "cnblogs.com", 372 | "grammarly.com", 373 | "msn.com", 374 | "semanticscholar.org", 375 | "vivo.com.cn", 376 | "alicdn.com", 377 | "codepen.io", 378 | "gravatar.com", 379 | "naver.com", 380 | "sina.com.cn", 381 | "vk.com", 382 | "alipay.com", 383 | "crazyegg.com", 384 | "guancha.cn", 385 | "netflix.com", 386 | "smzdm.com", 387 | "vmware.com", 388 | "amap.com", 389 | "criteo.com", 390 | "huanqiu.com", 391 | "nike.com", 392 | "snapchat.com", 393 | "walmart.com", 394 | "amazonaws.com", 395 | "ctrip.com", 396 | "huawei.com", 397 | "notion.so", 398 | "sohu.com", 399 | "weibo.com", 400 | "ampproject.org", 401 | "dailymotion.com", 402 | "hubspot.com", 403 | "nvidia.com", 404 | "springer.com", 405 | "wikimedia.org", 406 | "apple.com", 407 | "deepl.com", 408 | "huya.com", 409 | "office.net", 410 | "spring.io", 411 | "wikipedia.org", 412 | "arxiv.org", 413 | "digitaloceanspaces.com", 414 | "ibm.com", 415 | "onlinedown.net", 416 | "squarespace.com", 417 | "wp.com", 418 | "asus.com", 419 | "duckduckgo.com", 420 | "icloud.com", 421 | "opera.com", 422 | "statcounter.com", 423 | "xiaomi.com", 424 | "atlassian.net", 425 | "eastday.com", 426 | "ieee.org", 427 | "oracle.com", 428 | "steampowered.com", 429 | "ximalaya.com", 430 | "azureedge.net", 431 | "eastmoney.com", 432 | "instagram.com", 433 | "outbrain.com", 434 | "taboola.com", 435 | "yahoo.com", 436 | "baidu.com", 437 | "elsevier.com", 438 | "iqiyi.com", 439 | "overleaf.com", 440 | "t.co", 441 | "yandex.ru", 442 | "bilibili.com", 443 | "facebook.com", 444 | "jb51.net", 445 | "paypal.com", 446 | "teads.tv", 447 | "youtube.com", 448 | "biligame.com", 449 | "feishu.cn", 450 | "jd.com", 451 | 
"pinduoduo.com", 452 | "thepaper.cn", 453 | "yy.com", 454 | "booking.com", 455 | "ggpht.com", 456 | "kugou.com", 457 | "python.org", 458 | "tiktok.com", 459 | "zhihu.com" 460 | ] 461 | 462 | non_needed_features = [ 463 | 'flow_id', 464 | 'class', 465 | 'source', 466 | 'dest', 467 | 'sport', 468 | 'dport', 469 | 'protocol', 470 | 'timestamp', 471 | # 'nb_packets', 472 | 'src', 473 | 'iat', 474 | 'direction', 475 | 'length' 476 | ] 477 | 478 | all_features_flows = [ 479 | 'min_iat', 480 | 'max_iat', 481 | 'sum_iat', 482 | 'mean_iat', 483 | 'median_iat', 484 | 'std_iat', 485 | '1stQ_iat', 486 | '3rdQ_iat', 487 | 'skew_iat', 488 | 'kurt_iat', 489 | 'min_length', 490 | 'max_length', 491 | 'sum_length', 492 | 'median_length', 493 | 'mean_length', 494 | 'std_length', 495 | '1stQ_length', 496 | '3rdQ_length', 497 | 'skew_length', 498 | 'kurt_length', 499 | 'nb_packets', 500 | # 'sport', 501 | # 'dport', 502 | # 'protocol', 503 | # 'direction' 504 | ] 505 | # best_features = [ 506 | # 'max_iat', 507 | # 'sum_iat', 508 | # 'mean_iat', 509 | # 'median_iat', 510 | # 'std_iat', 511 | # '1stQ_iat', 512 | # '3rdQ_iat', 513 | # 'skew_iat', 514 | # 'kurt_iat', 515 | # 'min_length', 516 | # 'max_length', 517 | # 'sum_length', 518 | # 'median_length', 519 | # 'mean_length', 520 | # 'std_length', 521 | # '1stQ_length', 522 | # '3rdQ_length', 523 | # 'skew_length', 524 | # 'kurt_length' 525 | # ] 526 | best_features = ['3rdQ_iat', 'std_iat', 'std_length', 'skew_iat', 'max_iat', 'sum_iat', 'mean_length', '1stQ_length', 'max_length', 'mean_iat', 'min_length', 'sum_length', 'median_length', '1stQ_iat', 'median_iat', '3rdQ_length', 'kurt_iat', 'kurt_length', 'nb_packets'] 527 | online_features=[ 528 | 'sum_iat', 529 | 'sum_length', 530 | 'max_length', 531 | 'mean_iat', 532 | 'max_iat', 533 | 'mean_length', 534 | 'min_length', 535 | 'min_iat' 536 | ] 537 | feats_flows = all_features_flows 538 | 539 | # Preprocessing 540 | if not classifier.data_prepared(): 541 | classifier.data_preparation() 542 | classifier.load_flows() 543 | else: 544 | classifier.load_flows() 545 | classifier.classes = classifier.all_classes 546 | # if not classifier.data_prepared(): 547 | # classifier.data_preparation() 548 | # else: 549 | # classifier.classes = classifier.all_classes 550 | 551 | # classifier.load_flows() 552 | classifier.cleanup_data(classifier.X_train_flows, 553 | classifier.y_train_flows, 554 | classifier.X_test_flows, 555 | classifier.y_test_flows, 556 | classifier.flow_ids, 557 | non_needed_features) 558 | # classifier._cleanup_data(non_needed_features) 559 | # classifier.X_train_flows_fitted, classifier.X_test_flows_fitted = classifier.preprocessing( 560 | # classifier.X_train_flows, 561 | # classifier.y_train_flows, 562 | # classifier.X_test_flows, 563 | # classifier.y_test_flows, 564 | # classifier.flow_ids, 565 | # feats_flows 566 | # ) 567 | classifier.X_train_flows_fitted = classifier.X_train_flows 568 | classifier.X_test_flows_fitted = classifier.X_test_flows 569 | # __correlation() 570 | # feats = all_features 571 | # analyze_models_for_npkts(10, all_features, "all_feats") 572 | 573 | if args.report == True: 574 | classifier._viz(distribution = 0, class_distribution = -1, nb_packets = -1, min_iat = -1, max_iat = -1) 575 | for n in classifier.nb_packets_per_flow: 576 | if n == 4: 577 | classifier._viz(distribution = -1, class_distribution = 0, nb_packets = 0, min_iat = -1, max_iat = -1) 578 | elif n == 8: 579 | classifier._viz(distribution = -1, class_distribution = 10, nb_packets = 0, min_iat = -1, max_iat = -1) 
580 | elif n == 600000: 581 | classifier._viz(distribution = -1, class_distribution = 11, nb_packets = 0, min_iat = 1, max_iat = -1) 582 | sys.exit(1) 583 | if VISUALIZATION_ENABLED: 584 | # f = classifier.filename_prefix + '_datasetflows_distribution.pickle' 585 | # if isfile(classifier.processed_data_output_dir + f): 586 | # print("Loading dataset from pickle file", f) 587 | # _df = classifier._load_pickle(f) 588 | # else: 589 | # print("Creating dataset") 590 | # _df = classifier._get_flows_with_all_packets() 591 | # classifier._pickle_dump(_df, f) 592 | # print("Dataset saved in file", f) 593 | # classifier._class_distribution(_df, classifier.filename_prefix + '_flows_distribution') 594 | # # classifier._nb_packets_distribution(_df, classifier.filename_prefix + "_flows_nb_packets_distribution" ) 595 | # classifier._distribution(_df, classifier.filename_prefix + "_flows_class_split" ) 596 | pkt = classifier.nb_packets_per_flow[0] 597 | fold = 0 598 | _i = pkt, fold 599 | _df1 = classifier.X_train_flows[_i].copy() 600 | # print("_df1", _df1.columns) 601 | # print("y_train", classifier.y_train_flows[_i]) 602 | # print(classifier.y_train_flows[_i][classifier.y_train_flows[_i].index.duplicated()]) 603 | _df1['type'] = classifier.y_train_flows[_i].values 604 | # print("_df1 type", _df1.columns) 605 | _df2 = classifier.X_test_flows[_i].copy() 606 | # print("_df2", _df1.columns) 607 | _df2['type'] = classifier.y_test_flows[_i].values 608 | _df = pd.concat([_df1, _df2]) 609 | _df.reset_index() 610 | print(_df.shape) 611 | print(_df['type'].value_counts().to_string()) 612 | # classifier._distribution(_df, classifier.filename_prefix + "_flows_class_split_" + str(pkt) + '_pkt', xticks = False) 613 | # classifier._class_distribution(_df, classifier.filename_prefix + '_flows_distribution_' + str(pkt) + '_pkt', xticks = False) 614 | classifier._nb_packets_distribution(_df, classifier.filename_prefix + "_flows_nb_packets_distribution_" + str(pkt) + '_pkt', xticks = False) 615 | # classifier._min_iat_distribution(_df, classifier.filename_prefix + "_flows_min_iat_distribution_" + str(pkt) + '_pkt', xticks = False) 616 | 617 | if RF_ENABLED: 618 | print("==== RandomForest =====") 619 | """ 620 | classifier.X_train_flows_fitted, classifier.X_test_flows_fitted = classifier.preprocessing( 621 | classifier.X_train_flows, 622 | classifier.y_train_flows, 623 | classifier.X_test_flows, 624 | classifier.y_test_flows, 625 | classifier.flow_ids, 626 | feats_flows 627 | ) 628 | classifier.X_train_flows_fitted = classifier.X_train_flows 629 | classifier.X_test_flows_fitted = classifier.X_test_flows 630 | """ 631 | rf_regr_flows, rf_y_train_flows_predicted, rf_y_test_flows_predicted = classifier.RF_predict( 632 | classifier.X_train_flows_fitted, 633 | classifier.y_train_flows, 634 | classifier.X_test_flows_fitted, 635 | classifier.y_test_flows, 636 | ) 637 | rf_cm_dict_flows, output = classifier.confusion_matrix(rf_regr_flows, 638 | classifier.y_test_flows, 639 | rf_y_test_flows_predicted, 640 | classifier.flow_ids, 641 | "rf" 642 | ) 643 | print(output) 644 | rf_f1_scores_flows, output = classifier.get_F1_score(rf_cm_dict_flows, 645 | classifier.y_test_flows, 646 | rf_y_test_flows_predicted, 647 | classifier.flow_ids, 648 | "rf_flows") 649 | print(output) 650 | avg_scores, output = classifier.avg_f1_scores(rf_f1_scores_flows, classifier.flow_ids) 651 | print(output) 652 | 653 | if GB_ENABLED: 654 | gb_regr, gb_y_train_predicted, gb_y_test_predicted = classifier.GBoost_predict(feats_flows, df_score) 655 | gb_cm_dict 
= classifier.confusion_matrix(gb_regr, gb_y_test_predicted, False) 656 | gb_f1_scores = classifier.get_F1_score(df_score, gb_cm_dict, y_test, gb_y_test_predicted, "gb", False) 657 | classifier.avg_f1_scores(gb_f1_scores_flows, classifier.flow_ids_without_folds) 658 | classifier.avg_f1_scores(gb_f1_scores) 659 | 660 | if XG_ENABLED: 661 | print("==== XGBoost =====") 662 | xg_regr, xg_y_train_predicted, xg_y_test_flows_predicted = classifier.XGBoost_predict( 663 | classifier.X_train_flows_fitted, 664 | classifier.y_train_flows, 665 | classifier.X_test_flows_fitted, 666 | classifier.y_test_flows 667 | ) 668 | xg_cm_dict_flows, output = classifier.confusion_matrix(xg_regr, 669 | classifier.y_test_flows, 670 | xg_y_test_flows_predicted, 671 | classifier.flow_ids, 672 | "xg" 673 | ) 674 | print(output) 675 | 676 | xg_f1_scores_flows, output = classifier.get_F1_score( 677 | xg_cm_dict_flows, 678 | classifier.y_test_flows, 679 | xg_y_test_flows_predicted, 680 | classifier.flow_ids, 681 | "xg_flows") 682 | print(output) 683 | avg_scores, output = classifier.avg_f1_scores(xg_f1_scores_flows, classifier.flow_ids) 684 | print(output) 685 | 686 | print(classifier.classification_results) 687 | if RF_ENABLED or GB_ENABLED or XG_ENABLED: 688 | classifier.save_results() -------------------------------------------------------------------------------- /Offline_ETC/data_preparation/pcap2csv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pcapfile=$1 3 | 4 | tshark -o gui.column.format:"SP,%uS,DP,%uD" -r "${pcapfile}" -T fields -E header=y -E separator=, -e frame.number -e frame.time_epoch -e frame.time_delta -e ip.src -e _ws.col.SP -e ip.dst -e _ws.col.DP -e ip.proto -e frame.len 5 | -------------------------------------------------------------------------------- /Offline_ETC/data_preparation/pkts2flows.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | from os import listdir 5 | from os.path import join 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | def __pkts2flow(path, outpath, filename, first_flow_id): 11 | print("opening", join(path, filename)) 12 | df = pd.read_csv(join(path, filename), names = ["packet_id", "timestamp", "iat", "src", "psrc", "dst", "pdst", "protocol", "length"], dtype={'packet_id': 'int', 'timestamp': 'float', 'iat': 'float', 'src': 'str', "psrc": 'int', 'dst':'str', 'pdst': 'int', 'protocol': 'int', 'length': 'int'},header = 0) 13 | df['protocol'].replace('', np.nan, inplace = True) 14 | df = df.dropna(axis = 1) 15 | df['flow_id'] = df.groupby(['src', 'psrc', 'dst', 'pdst', 'protocol']).ngroup() 16 | df['flow_id'] = df['flow_id'].astype('int') 17 | 18 | print(df.shape) 19 | 20 | # update flow_id to consecutive values 21 | df = df.sort_values(by = ['flow_id']) 22 | df.flow_id = df.flow_id.ne(df.flow_id.shift()).cumsum().add(first_flow_id).astype('int') 23 | 24 | r = df['flow_id'].max() 25 | df.to_csv(join(outpath, filename), index = False) 26 | return r 27 | 28 | def main(): 29 | # change inputdir to the full name of the directory where the dataset CSV files are stored. 
30 | paths = ["inputdir"] 31 | first_flow_id = 0 32 | # change outputdir to the full name of the directory where you want to store the new CSV files 33 | output_path = "outputdir" 34 | 35 | for path in paths: 36 | for f in listdir(path): 37 | if "csv" in f: 38 | first_flow_id = __pkts2flow(path, output_path, f, first_flow_id) 39 | 40 | if __name__ == "__main__": 41 | main() -------------------------------------------------------------------------------- /Offline_ETC/netflow_quic_traffic_classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import itertools 6 | from os import listdir 7 | from os.path import isfile, join 8 | import sys 9 | import time 10 | 11 | import pandas as pd 12 | 13 | import numpy as np 14 | 15 | from scipy.stats import kurtosis, skew 16 | 17 | from sklearn.metrics import classification_report, f1_score, confusion_matrix 18 | 19 | import seaborn as sns 20 | import matplotlib.pyplot as plt 21 | 22 | from encrypted_traffic_classification import EncryptedTrafficClassifier, EncryptedTrafficClassifierIterator 23 | 24 | filename_patterns = { 25 | "youtube_": "STREAMING", 26 | "Google_Play_Music_": "Google_Play_Music", 27 | "GoogleHangout_VoIP_": "GoogleHangout_VoIP", 28 | "GoogleHangout_Chat_": "GoogleHangout_Chat", 29 | "FileTransfer_": "FileTransfer", 30 | } 31 | 32 | ######################################## 33 | # Data preparation: convert RAW data 34 | ######################################## 35 | class NetFlowQUICClassifier(EncryptedTrafficClassifier): 36 | def __init__(self, nb_folds, nb_packets_per_flow): 37 | super().__init__( 38 | nb_folds= nb_folds, 39 | nb_packets_per_flow = nb_packets_per_flow, 40 | filename_prefix = "netflow_quic", 41 | processed_data_output_dir = "netflow_quic_output/", 42 | data_dir = "data/Netflow-QUIC/" 43 | ) 44 | 45 | pools = [tuple(pool) for pool in [self.nb_packets_per_flow, range(self.nb_folds)]] 46 | result = [[]] 47 | for pool in pools: 48 | result = [x+[y] for x in result for y in pool] 49 | self.flow_ids = result 50 | 51 | pools = [tuple(pool) for pool in [self.nb_packets_per_flow]] 52 | result = [[]] 53 | for pool in pools: 54 | result = [x+[y] for x in result for y in pool] 55 | self.flow_ids_without_folds = result 56 | 57 | pools = [tuple(pool) for pool in [range(self.nb_folds)]] 58 | result = [[]] 59 | for pool in pools: 60 | result = [x+[y] for x in result for y in pool] 61 | self.packet_ids = result 62 | 63 | ######################################## 64 | # Preprocessing 65 | ######################################## 66 | def _get_flows_with_all_packets(self): 67 | print("_get_flows_with_all_packets") 68 | df_flows = {} 69 | start_time = time.time() 70 | nb_flows = 0 71 | df_flows = pd.DataFrame() 72 | self.classes = set() 73 | files = [f for f in listdir(self.data_dir) if isfile(join(self.data_dir, f))] 74 | for _i in range(len(files)): 75 | f = self.data_dir + "/" + files[_i] 76 | print("f=", f) 77 | start_time = time.time() 78 | df_new = pd.read_csv(f, 79 | names = [ 80 | 'packet_id', 81 | 'timestamp', 82 | 'iat', 83 | 'source', 84 | 'sport', 85 | 'dest', 86 | 'dport', 87 | 'protocol', 88 | 'length', 89 | 'flow_id' 90 | ], 91 | dtype = { 92 | 'flow_id': 'Int32', 93 | 'timestamp': np.float64, 94 | 'iat': np.float64, 95 | 'source':str, 96 | 'sport': 'Int32', 97 | 'dest': str, 98 | 'dport': 'Int32', 99 | 'protocol': 'Int32', 100 | 'length': 'Int64', 101 | 'flow_id': 'Int64' 102 | }, 103 | header = 0 104 | 
) 105 | print(f, df_new.shape) 106 | 107 | # drop DNS traffic 108 | df_new = df_new.drop(df_new[df_new['sport'] == 53].index) 109 | df_new = df_new.drop(df_new[df_new['dport'] == 53].index) 110 | 111 | found = False 112 | for k, v in filename_patterns.items(): 113 | if k in f: 114 | df_new['class'] = v 115 | self.classes.add(v) 116 | found = True 117 | break 118 | if found == False: 119 | print("Type for file", f, "not found") 120 | sys.exit(1) 121 | dfs = [] 122 | # extract flow and add statistical features 123 | for flow_id in df_new['flow_id'].unique(): 124 | nb_flows += 1 125 | df_new = df_new.sort_values(by = ['packet_id']) 126 | d = df_new[df_new['flow_id'] == flow_id].head(n = 1) 127 | d['nb_packets'] = len(df_new[df_new['flow_id'] == flow_id]) 128 | c = d['class'].tolist() 129 | dport = d.dport.tolist() 130 | sport = d.sport.tolist() 131 | #print(d) 132 | _df = df_new.loc[df_new['flow_id'] == flow_id, 'iat'] 133 | d['sum_iat'] = np.sum(_df) 134 | 135 | _df = df_new.loc[df_new['flow_id'] == flow_id, 'length'] 136 | d['sum_length'] = np.sum(_df) 137 | dfs.append(d) 138 | _d = pd.concat(dfs) 139 | df_flows = pd.concat([_d, df_flows]) 140 | 141 | print(f, "processed in ", time.time() - start_time, "seconds.") 142 | 143 | # uncomment following line to stop after the first file during debug 144 | # break 145 | 146 | print(" processing took ", time.time() - start_time, "seconds.") 147 | print("%d flows processed" % nb_flows) 148 | # Finish processing the data, create the train/tests split and save as pickle files 149 | df_flows = df_flows.fillna(0) 150 | 151 | self.classes = list(self.classes) 152 | self._hotencode_class(df_flows) 153 | return df_flows 154 | 155 | def data_preparation(self): 156 | print("data_preparation") 157 | import warnings 158 | warnings.filterwarnings("ignore") 159 | 160 | df_flows = {} 161 | files = [f for f in listdir(self.data_dir) if isfile(join(self.data_dir, f))] 162 | for _i in range(len(files)): 163 | files[_i] = self.data_dir + "/" + files[_i] 164 | 165 | # print(files) 166 | for i in self.nb_packets_per_flow: 167 | self.__generate_pickle_for_n_packets(i, files) 168 | 169 | def __generate_pickle_for_n_packets(self, n, files): 170 | print("__generate_pickle_for_n_packets n =", n) 171 | 172 | for fold in range(self.nb_folds): 173 | if self._test_data_prepared((n, fold)): 174 | print("pickle files detected for ", n, "packets") 175 | return 176 | nb_flows = 0 177 | df_flows = pd.DataFrame() 178 | self.classes = set() 179 | for f in files: 180 | # print("f=", f) 181 | start_time = time.time() 182 | df_new = pd.read_csv(f, 183 | names = [ 184 | 'packet_id', 185 | 'timestamp', 186 | 'iat', 187 | 'source', 188 | 'sport', 189 | 'dest', 190 | 'dport', 191 | 'protocol', 192 | 'length', 193 | 'flow_id' 194 | ], 195 | dtype = { 196 | 'flow_id': 'Int32', 197 | 'timestamp': np.float64, 198 | 'iat': np.float64, 199 | 'source':str, 200 | 'sport': 'Int32', 201 | 'dest': str, 202 | 'dport': 'Int32', 203 | 'protocol': 'Int32', 204 | 'length': 'Int64', 205 | 'flow_id': 'Int64' 206 | }, 207 | header = 0 208 | ) 209 | print(n, f, df_new.shape) 210 | 211 | # drop DNS traffic 212 | df_new = df_new.drop(df_new[df_new['sport'] == 53].index) 213 | df_new = df_new.drop(df_new[df_new['dport'] == 53].index) 214 | 215 | found = False 216 | for k, v in filename_patterns.items(): 217 | if k in f: 218 | df_new['class'] = v 219 | self.classes.add(v) 220 | found = True 221 | break 222 | if found == False: 223 | print("Type for file", f, "not found") 224 | sys.exit(1) 225 | 226 | # extract flow 
and add statistical features 227 | for flow_id in df_new['flow_id'].unique(): 228 | nb_flows += 1 229 | df_new = df_new.sort_values(by = ['packet_id']) 230 | _df_new = df_new[df_new['flow_id'] == flow_id].head(n = n) 231 | d = _df_new.head(n = 1) 232 | d['nb_packets'] = len(_df_new) #df_new[df_new['flow_id'] == flow_id]) 233 | c = d['class'].tolist() 234 | dport = d.dport.tolist() 235 | sport = d.sport.tolist() 236 | #print(d) 237 | _df = _df_new['iat'] 238 | d['min_iat'] = 0 239 | d['min_iat'] = np.min(_df_new[_df_new['iat'] > 0]['iat']) 240 | if len(d[d['iat'] < 0]) > 0: 241 | print(_df_new, "has negative iat") 242 | continue 243 | if len(d[d['iat'] > 120]) > 0: 244 | print(_df_new, "has iat > 120") 245 | continue 246 | d['max_iat'] = np.max(_df) 247 | d['sum_iat'] = np.sum(_df) 248 | d['mean_iat'] = np.mean(_df) 249 | d['median_iat'] = np.median(_df) 250 | d['std_iat'] = np.std(_df) 251 | d['1stQ_iat'] = np.quantile(_df, 0.25) 252 | d['3rdQ_iat'] = np.quantile(_df, 0.75) 253 | _a = list(_df) 254 | d['skew_iat'] = skew(_a) 255 | d['kurt_iat'] = kurtosis(_a) 256 | # d['skew_iat'] = skew(np.array(df_new.loc[df_new['flow_id'] == flow_id, 'iat'])) 257 | # d['kurt_iat'] = kurtosis(np.array(df_new.loc[df_new['flow_id'] == flow_id, 'iat'])) 258 | 259 | _df = _df_new['length'] 260 | d['min_length'] = np.min(_df) 261 | d['max_length'] = np.max(_df) 262 | d['sum_length'] = np.sum(_df) 263 | d['median_length'] = np.median(_df) 264 | d['mean_length'] = np.mean(_df) 265 | d['std_length'] = np.std(_df) 266 | d['1stQ_length'] = np.quantile(_df, 0.25) 267 | d['3rdQ_length'] = np.quantile(_df, 0.75) 268 | _a = list(_df) 269 | d['skew_length'] = skew(_a) 270 | # d['skew_length'] = skew(np.array(df_new.loc[df_new['flow_id'] == flow_id, 'length'])) 271 | d['kurt_length'] = kurtosis(_a) 272 | # d['kurt_length'] = kurtosis(np.array(df_new.loc[df_new['flow_id'] == flow_id, 'length'])) 273 | 274 | d['src'] = f 275 | df_flows = pd.concat([d, df_flows]) 276 | 277 | print(f, "processed in ", time.time() - start_time, "seconds.") 278 | 279 | # uncomment following line to stop after the first file during debug 280 | # break 281 | 282 | print("%d flows processed" % nb_flows) 283 | # Finish processing the data, create the train/tests split and save as pickle files 284 | df_flows = df_flows.fillna(0) 285 | 286 | self.classes = list(self.classes) 287 | self._hotencode_class(df_flows) 288 | 289 | filename = self.filename_prefix + "_" + str(n) + ".pickle" 290 | self._generate_data_folds(df_flows, filename) 291 | 292 | ######################################## 293 | # Data Analysis 294 | ######################################## 295 | # def __show_actual_and_predicted(self, X, y, y_pred, _class): 296 | # print(self.classes) 297 | # for _i in itertools.product(NB_PACKETS, self.filenames): 298 | # i = (_i[0], _i[1], 0) 299 | # print(i) 300 | # df = X[i].copy() 301 | # df['type'] = y[i] 302 | # df['type_pred'] = y_pred[i] 303 | # print(df.columns) 304 | # a4_dims = (23.4, 16.54) 305 | # fig, ax = plt.subplots(figsize = a4_dims) 306 | # sns.lmplot( 307 | # x = 'sum_iat', 308 | # y = 'sum_length', 309 | # data = df[df['type'] == _class], 310 | # hue = 'type', 311 | # fit_reg = False, 312 | # height = 4, aspect = 5, 313 | # # color = 'green', 314 | # # scatter_kws = {'alpha': 0.3}, 315 | # # ax = ax, 316 | # legend = False, 317 | # palette = 'viridis' 318 | # ) 319 | # #ax.set(xlabel='time_delta', ylabel='packet_size') 320 | # ax.set(xlabel = 'duration', ylabel = 'sum_packet_size') 321 | # plt.legend(title = 'Class', labels 
=self.classes) 322 | # plt.savefig(self.filename_prefix + "_" + self.classes[_class] + "_"+ str(i[0]) + "_" + i[1] + ".png", format = 'png') 323 | # fig, ax2 = plt.subplots(figsize = a4_dims) 324 | # sns.lmplot( 325 | # x = 'sum_iat', 326 | # y = 'sum_length', 327 | # data = df[df['type_pred'] == _class], 328 | # hue = 'type', 329 | # fit_reg = False, 330 | # height = 4, aspect = 5, 331 | # # color = 'orange', 332 | # # scatter_kws = {'alpha': 0.3}, 333 | # legend = False, 334 | # palette = 'viridis', 335 | # # ax = ax2 336 | # ) 337 | # ax2.set(xlabel = 'duration', ylabel = 'sum_packet_size') 338 | # plt.legend(title = 'Class', labels =self.classes) 339 | # plt.savefig(self.filename_prefix + "_" + self.classes[_class] + "_pred_"+ str(i[0]) + "_" + i[1]+".png", format = 'png') 340 | 341 | # ######################################## 342 | # # Akem's methods 343 | # ######################################## 344 | # # Feature Importance 345 | # """ 346 | # Function to Fit model based on optimal values of depth and number of estimators and use it 347 | # to compute feature importance for all the features. 348 | # """ 349 | # def get_feature_importance(depth, n_tree, max_leaf, X_train, y_train): 350 | # from sklearn.ensemble import RandomForestClassifier 351 | 352 | # # rf_opt = RandomForestClassifier(max_depth = depth, n_estimators = n_tree, random_state=42, bootstrap=False) 353 | # rf_opt = RandomForestClassifier(max_depth = depth, n_estimators = n_tree, max_leaf_nodes=max_leaf, random_state=42, bootstrap=False) 354 | # rf_opt.fit(X_train, y_train) 355 | # feature_importance = pd.DataFrame(rf_opt.feature_importances_) 356 | # feature_importance.index = X_train.columns 357 | # feature_importance = feature_importance.sort_values(by=list(feature_importance.columns),axis=0,ascending=False) 358 | 359 | # return feature_importance 360 | 361 | 362 | # """ 363 | # Function to Fit model based on optimal values of depth and number of estimators and feature importance 364 | # to find the fewest possible features to exceed the previously attained score with all selected features 365 | # """ 366 | # def get_fewest_features(depth, n_tree, max_leaf, importance): 367 | # sorted_feature_names = importance.index 368 | # # print('sorted_feature_names: ', sorted_feature_names) 369 | # features = [] 370 | # for f in range(1,len(sorted_feature_names)+1): 371 | # features.append(sorted_feature_names[0:f]) 372 | # # print('features:', features) 373 | # return features 374 | 375 | 376 | # def get_result_scores(classes, cl_report): 377 | # precision=[] 378 | # recall=[] 379 | # f1_score=[] 380 | # supports=[] 381 | # for a_class in classes: 382 | # precision.append(cl_report[a_class]['precision']) 383 | # recall.append(cl_report[a_class]['recall']) 384 | # f1_score.append(cl_report[a_class]['f1-score']) 385 | # supports.append(cl_report[a_class]['support']) 386 | # return precision, recall, f1_score, supports 387 | 388 | 389 | # def get_scores(classes, depth, n_tree, feats, max_leaf, X_train, y_train, X_test, y_test): 390 | # from sklearn.ensemble import RandomForestClassifier 391 | # model = RandomForestClassifier(max_depth=depth, n_estimators = n_tree, max_leaf_nodes=max_leaf, n_jobs=4, 392 | # random_state=42, bootstrap=False) 393 | 394 | # model.fit(X_train[feats], y_train) 395 | # y_pred = model.predict(X_test[feats]) 396 | 397 | # class_report = classification_report(y_test, y_pred, target_names=classes, output_dict = True) 398 | 399 | # accurac = model.score(X_test[feats], y_test) 400 | # macro_score = 
class_report['macro avg']['f1-score'] 401 | # weighted_score = class_report['weighted avg']['f1-score'] 402 | 403 | # return model, class_report, macro_score, weighted_score, y_pred, accurac 404 | 405 | 406 | # def get_x_y(Dataset, classes, feats): 407 | # Dataset = Dataset[Dataset["Label"].isin(classes)] 408 | # X = Dataset[feats] 409 | # y = Dataset['Label'].replace(classes, range(len(classes))) 410 | # # y = Dataset.columns[-1].replace(classes, range(len(classes))) 411 | 412 | # return X, y 413 | 414 | # def analyze_models(classes, model_type, depths, n_trees, X_train, y_train, X_test, y_test, max_leaf, filename_out): 415 | 416 | # with open(filename_out, "w") as res_file: 417 | # print('depth;tree;n_feat;Macro_F1;Weighted_F1;Accuracy;feats;c_report', file=res_file) 418 | # if model_type == 'RF': 419 | # # FOR EACH (depth, n_tree, feat) 420 | # for depth in depths: 421 | # for n_tree in n_trees: 422 | # # get feature orders to use 423 | # importance = get_feature_importance(depth, n_tree, max_leaf, X_train, y_train) 424 | 425 | # m_feats = get_fewest_features(depth, n_tree, max_leaf, importance) 426 | # for feats in m_feats: 427 | # # Get the scores with the given (depth, n_tree, feat) 428 | # model, c_report, macro_f1, weight_f1, y_pred, accuracs = get_scores(classes, depth, n_tree, feats, max_leaf, X_train, y_train, X_test, y_test) 429 | 430 | # print(str(depth)+';'+str(n_tree)+';'+str(len(feats))+';'+str(macro_f1)+';'+str(weight_f1)+';'+str(accuracs)+';'+str(list(feats))+';'+str(c_report), file=res_file) 431 | # print("Analysis Complete. Check output file.") 432 | # return [] 433 | 434 | # # N = number of packets in flows, feats = array of feature names to use, feat_name = string to add to output file name 435 | # def analyze_models_for_npkts(self, N, feats, feat_name): 436 | # i = (N, self.filenames[0], 0) 437 | # print("Number of packets per flow: ", N) 438 | 439 | # X_trains, y_trains = X_train[i][feats], y_train[i] 440 | # X_tests, y_tests = X_test[i][feats], y_test[i] 441 | 442 | # results_file = "Models_" + feat_name + "_" + str(N) + "_pkts_.csv" 443 | # analyze_models(self.classes, "RF", range(7, 20, 1), range(1, 8, 2), X_trains, y_trains, X_tests, y_tests, 500, results_file) 444 | 445 | # results = pd.read_csv(results_file, sep=';') 446 | # results = results.sort_values(by=['Weighted_F1','Macro_F1'],ascending=False) 447 | # print(results.head(10)) 448 | # print("******") 449 | # print(results.head(1)['c_report'].values) 450 | 451 | ######################################## 452 | # GBoost 453 | ######################################## 454 | def GBoost_predict(self, feats): 455 | print("GBoost_predict") 456 | from sklearn.ensemble import GradientBoostingClassifier 457 | gb_model = {} 458 | 459 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 460 | gb_model[i] = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state = 42) 461 | 462 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 463 | print("==",i,"==") 464 | try: 465 | gb_model[i].fit(X_train[i][feats], y_train[i]) 466 | except ValueError as e: 467 | print(e) 468 | pass 469 | 470 | gb_y_train_predicted = {} 471 | gb_y_test_predicted = {} 472 | gb_train_score = {} 473 | gb_test_score = {} 474 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 475 | print("==",i,"==") 476 | gb_y_train_predicted[i] = gb_model[i].predict(X_train[i][feats]) 477 | gb_y_test_predicted[i] = gb_model[i].predict(X_test[i][feats]) 478 | gb_train_score[i] = 
gb_model[i].score(X_train[i][feats], y_train[i]) 479 | gb_test_score[i] = gb_model[i].score(X_test[i][feats], y_test[i]) 480 | 481 | self._get_scores_from_models(gb_model, y_test, gb_y_test_predicted, feats) 482 | 483 | gb_cm_dict = {} 484 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 485 | print("==",i,"==") 486 | gb_cm_dict[i] = confusion_matrix(y_test[i], gb_y_test_predicted[i].astype(int)) 487 | print(gb_cm_dict[i]) 488 | 489 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 490 | pkt, _ = i 491 | classification_results.loc[classification_results['nb_packets'] == pkt, 'gb_train_score'] = gb_train_score[i] 492 | classification_results.loc[classification_results['nb_packets'] == pkt, 'gb_test_score'] = gb_test_score[i] 493 | 494 | return gb_model, gb_y_train_predicted, gb_y_test_predicted 495 | 496 | 497 | ######################################## 498 | # Entry point 499 | ######################################## 500 | if __name__ == "__main__": 501 | parser = argparse.ArgumentParser( 502 | prog='netflow_quic_traffic_classifier', 503 | description='Classify packets or flows from NetFlow QUIC dataset', 504 | epilog='' 505 | ) 506 | parser.add_argument('-p', '--nb_packets', action = 'append', type = int, required = True) #, default = [4, 8]) 507 | parser.add_argument('-c', '--classifier', action = 'append', type = str) #, default = ['rf']) 508 | parser.add_argument('-f', '--nb_folds', action = 'store', default = 12, type = int) 509 | parser.add_argument('-v', '--visualization', action = 'store_true', required = False, default = False) 510 | parser.add_argument('-r', '--report', action = 'store_true', required = False, default = False) 511 | parser.add_argument('-F', '--force_rf_classification', action = 'store_true', required = False, default = False) 512 | args = parser.parse_args(sys.argv[1:]) 513 | 514 | VISUALIZATION_ENABLED = False 515 | if args.visualization == True: 516 | VISUALIZATION_ENABLED = True 517 | 518 | RF_ENABLED = False 519 | GB_ENABLED = False 520 | XG_ENABLED = False 521 | for c in args.classifier: 522 | c = c.lower() 523 | if c == "rf": 524 | RF_ENABLED = True 525 | elif c == "gb": 526 | GB_ENABLED = True 527 | elif c == "xg": 528 | XG_ENABLED = True 529 | else: 530 | print("Unknown classifier", c) 531 | 532 | classifier = NetFlowQUICClassifier( 533 | nb_folds = args.nb_folds, 534 | nb_packets_per_flow = args.nb_packets 535 | ) 536 | 537 | if args.force_rf_classification == True: 538 | classifier.force_rf_classification = True 539 | 540 | classifier.all_classes = [ 541 | "youtube", 542 | "Google_Play_Music", 543 | "GoogleHangout_VoIP", 544 | "GoogleHangout_Chat", 545 | "FileTransfer", 546 | ] 547 | 548 | non_needed_features = [ 549 | 'flow_id', 550 | 'class', 551 | 'source', 552 | 'dest', 553 | 'sport', 554 | 'dport', 555 | 'protocol', 556 | 'timestamp', 557 | # 'nb_packets', 558 | 'src', 559 | 'iat', 560 | 'direction', 561 | 'length', 562 | 'packet_id' 563 | ] 564 | 565 | all_features_flows = [ 566 | 'min_iat', 567 | 'max_iat', 568 | 'sum_iat', 569 | 'mean_iat', 570 | 'median_iat', 571 | 'std_iat', 572 | '1stQ_iat', 573 | '3rdQ_iat', 574 | 'skew_iat', 575 | 'kurt_iat', 576 | 'min_length', 577 | 'max_length', 578 | 'sum_length', 579 | 'median_length', 580 | 'mean_length', 581 | 'std_length', 582 | '1stQ_length', 583 | '3rdQ_length', 584 | 'skew_length', 585 | 'kurt_length', 586 | 'nb_packets', 587 | # 'sport', 588 | # 'dport', 589 | # 'protocol', 590 | # 'direction' 591 | ] 592 | # best_features = [ 593 | # 'max_iat', 594 | # 'sum_iat', 595 | 
# 'mean_iat', 596 | # 'median_iat', 597 | # 'std_iat', 598 | # '1stQ_iat', 599 | # '3rdQ_iat', 600 | # 'skew_iat', 601 | # 'kurt_iat', 602 | # 'min_length', 603 | # 'max_length', 604 | # 'sum_length', 605 | # 'median_length', 606 | # 'mean_length', 607 | # 'std_length', 608 | # '1stQ_length', 609 | # '3rdQ_length', 610 | # 'skew_length', 611 | # 'kurt_length' 612 | # ] 613 | best_features = ['3rdQ_iat', 'std_iat', 'std_length', 'skew_iat', 'max_iat', 'sum_iat', 'mean_length', '1stQ_length', 'max_length', 'mean_iat', 'min_length', 'sum_length', 'median_length', '1stQ_iat', 'median_iat', '3rdQ_length', 'kurt_iat', 'kurt_length', 'nb_packets'] 614 | online_features=[ 615 | 'sum_iat', 616 | 'sum_length', 617 | 'max_length', 618 | 'mean_iat', 619 | 'max_iat', 620 | 'mean_length', 621 | 'min_length', 622 | 'min_iat' 623 | ] 624 | feats_flows = all_features_flows 625 | 626 | # Preprocessing 627 | if not classifier.data_prepared(): 628 | classifier.data_preparation() 629 | classifier.load_flows() 630 | else: 631 | classifier.load_flows() 632 | #_c = classifier.y_train_flows[(classifier.nb_packets_per_flow[0], 0)].unique() 633 | classifier.classes = [-1 for _ in classifier.all_classes] 634 | _Xy = classifier.X_train_flows[(classifier.nb_packets_per_flow[0], 0)].copy() 635 | _Xy['y'] = classifier.y_train_flows[(classifier.nb_packets_per_flow[0], 0)] 636 | for index, row in _Xy.iterrows(): 637 | for _i in range(len(classifier.all_classes)): 638 | if classifier.all_classes[_i] in row['src']: 639 | classifier.classes[row['y']] = classifier.all_classes[_i] 640 | break 641 | #classifier.classes[i] .append(classifier.all_classes[i]) 642 | if -1 not in classifier.classes: 643 | break 644 | print("classes =",classifier.classes) 645 | 646 | #classifier.classes = [] 647 | #for i in _c: 648 | # classifier.classes.append(classifier.all_classes[i]) 649 | 650 | classifier.cleanup_data(classifier.X_train_flows, 651 | classifier.y_train_flows, 652 | classifier.X_test_flows, 653 | classifier.y_test_flows, 654 | classifier.flow_ids, 655 | non_needed_features) 656 | 657 | # scaling during processing make results worse ! 
658 | # classifier.X_train_flows_fitted, classifier.X_test_flows_fitted = classifier.preprocessing( 659 | # classifier.X_train_flows, 660 | # classifier.y_train_flows, 661 | # classifier.X_test_flows, 662 | # classifier.y_test_flows, 663 | # classifier.flow_ids, 664 | # feats_flows 665 | # ) 666 | classifier.X_train_flows_fitted = classifier.X_train_flows 667 | classifier.X_test_flows_fitted = classifier.X_test_flows 668 | # for i in EncryptedTrafficClassifierIterator(classifier.flow_ids): 669 | # # print(classifier.X_train_flows[i].columns) 670 | # # print(classifier.X_train_flows[i].index) 671 | # # print(classifier.y_train_flows[i].index) 672 | # classifier.X_train_flows[i] = classifier.X_train_flows[i][classifier.X_train_flows[i]['sum_iat'] < 120] 673 | # classifier.y_train_flows[i] = classifier.y_train_flows[i][classifier.X_train_flows[i].index] 674 | # classifier.X_test_flows[i] = classifier.X_test_flows[i][classifier.X_test_flows[i]['sum_iat'] < 120] 675 | # classifier.y_test_flows[i] = classifier.y_test_flows[i][classifier.X_test_flows[i].index] 676 | 677 | # __correlation() 678 | # analyze_models_for_npkts(10, all_features, "all_feats") 679 | if args.report == True: 680 | __class_names = { 681 | "youtube": "YouTube", 682 | "Google_Play_Music": "Music", 683 | "GoogleHangout_VoIP": "VoIP", 684 | "GoogleHangout_Chat": "Chat", 685 | "FileTransfer": "FileTransfer", 686 | } 687 | for _c in range(len(classifier.classes)): 688 | classifier.classes[_c] = __class_names[classifier.classes[_c]] 689 | classifier._viz(distribution = 0, class_distribution = -1, nb_packets = -1, min_iat = -1, max_iat = -1) 690 | for n in classifier.nb_packets_per_flow: 691 | if n == 4: 692 | classifier._viz(distribution = -1, class_distribution = 10, nb_packets = 0, min_iat = -1, max_iat = -1) 693 | elif n == 8: 694 | classifier._viz(distribution = -1, class_distribution = 10, nb_packets = 0, min_iat = -1, max_iat = -1) 695 | elif n == 600000: 696 | classifier._viz(distribution = -1, class_distribution = 11, nb_packets = 1, min_iat = 1, max_iat = -1) 697 | sys.exit(1) 698 | if VISUALIZATION_ENABLED: 699 | # classifier.classes= ["Chat", "Music", "YouTube", "VoIP", "FileTransfer"] 700 | # classifier._viz(distribution = 0, class_distribution = 10, nb_packets = -1, min_iat = 1, max_iat = -1) 701 | # sys.exit(1) 702 | 703 | # f = classifier.filename_prefix + '_datasetflows_distribution.pickle' 704 | # if isfile(classifier.processed_data_output_dir + f): 705 | # print("Loading dataset from pickle file", f) 706 | # _df = classifier._load_pickle(f) 707 | # else: 708 | # print("Creating dataset") 709 | # _df = classifier._get_flows_with_all_packets() 710 | # classifier._pickle_dump(_df, f) 711 | # print("Dataset saved in file", f) 712 | # classifier._class_distribution(_df, classifier.filename_prefix + '_flows_distribution') 713 | # classifier._nb_packets_distribution(_df, classifier.filename_prefix + "_flows_nb_packets_distribution" ) 714 | # classifier._distribution(_df, classifier.filename_prefix + "_flows_class_split" ) 715 | 716 | __class_names = { 717 | "youtube": "YouTube", 718 | "Google_Play_Music": "Music", 719 | "GoogleHangout_VoIP": "VoIP", 720 | "GoogleHangout_Chat": "Chat", 721 | "FileTransfer": "FileTransfer", 722 | } 723 | for _c in range(len(classifier.classes)): 724 | classifier.classes[_c] = __class_names[classifier.classes[_c]] 725 | pkt = classifier.nb_packets_per_flow[0] 726 | fold = 0 727 | _i = pkt, fold 728 | _df1 = classifier.X_train_flows[_i].copy() 729 | _df1['type'] = 
classifier.y_train_flows[_i].values 730 | _df2 = classifier.X_test_flows[_i].copy() 731 | _df2['type'] = classifier.y_test_flows[_i].values 732 | _df = pd.concat([_df1, _df2]) 733 | _df.reset_index() 734 | # _df = _df[_df['sum_iat'] < 120] 735 | # _df.to_csv("netflow_datasets_pkts_"+str(pkt)+".csv", index = False) 736 | # sys.exit(1) 737 | print(_df.shape) 738 | classifier._distribution(_df, classifier.filename_prefix + "_flows_class_split_" + str(pkt) + '_pkt') 739 | sys.exit(1) 740 | # classifier._class_distribution(_df, classifier.filename_prefix + '_flows_distribution_' + str(pkt) + '_pkt') 741 | # classifier._nb_packets_distribution(_df, classifier.filename_prefix + "_flows_nb_packets_distribution_" + str(pkt) + '_pkt') 742 | # classifier._min_iat_distribution(_df, classifier.filename_prefix + "_flows_min_iat_distribution_" + str(pkt) + '_pkt') 743 | 744 | if RF_ENABLED: 745 | print("==== RandomForest =====") 746 | __class_names = { 747 | "youtube": "YouTube", 748 | "Google_Play_Music": "Music", 749 | "GoogleHangout_VoIP": "VoIP", 750 | "GoogleHangout_Chat": "Chat", 751 | "FileTransfer": "FileTransfer", 752 | } 753 | for _c in range(len(classifier.classes)): 754 | classifier.classes[_c] = __class_names[classifier.classes[_c]] 755 | 756 | rf_regr_flows, rf_y_train_flows_predicted, rf_y_test_flows_predicted = classifier.RF_predict( 757 | classifier.X_train_flows_fitted, 758 | classifier.y_train_flows, 759 | classifier.X_test_flows_fitted, 760 | classifier.y_test_flows 761 | ) 762 | 763 | # __show_actual_and_predicted(X_test, y_test, rf_y_test_predicted, 1) 764 | rf_cm_dict_flows, output = classifier.confusion_matrix(rf_regr_flows, 765 | classifier.y_test_flows, 766 | rf_y_test_flows_predicted, 767 | classifier.flow_ids, 768 | "rf" 769 | ) 770 | print(output) 771 | rf_f1_scores_flows, output = classifier.get_F1_score(rf_cm_dict_flows, 772 | classifier.y_test_flows, 773 | rf_y_test_flows_predicted, 774 | classifier.flow_ids, 775 | "rf_flows") 776 | print(output) 777 | avg_scores, output = classifier.avg_f1_scores(rf_f1_scores_flows, classifier.flow_ids) #_without_folds) 778 | print(output) 779 | 780 | if GB_ENABLED: 781 | print("==== GradientBoosting =====") 782 | gb_regr, gb_y_train_predicted, gb_y_test_predicted = classifier.GBoost_predict(feats_flows, classification_results) 783 | gb_cm_dict = classifier.confusion_matrix(gb_regr, classifier.y_test_flows, gb_y_test_predicted, classifier.flow_ids, "gb") 784 | gb_f1_scores = classifier.get_F1_score(gb_cm_dict, y_test, gb_y_test_predicted, "gb", False) 785 | classifier.avg_f1_scores(gb_f1_scores_flows, classifier.flow_ids_without_folds) 786 | # classifier.avg_f1_scores(gb_f1_scores) 787 | 788 | if XG_ENABLED: 789 | print("==== XGBoost =====") 790 | xg_regr, xg_y_train_predicted, xg_y_test_flows_predicted = classifier.XGBoost_predict( 791 | classifier.X_train_flows_fitted, 792 | classifier.y_train_flows, 793 | classifier.X_test_flows_fitted, 794 | classifier.y_test_flows 795 | ) 796 | 797 | xg_cm_dict_flows, output = classifier.confusion_matrix(xg_regr, 798 | classifier.y_test_flows, 799 | xg_y_test_flows_predicted, 800 | classifier.flow_ids, 801 | "xg" 802 | ) 803 | print(output) 804 | 805 | xg_f1_scores_flows, output = classifier.get_F1_score( 806 | xg_cm_dict_flows, 807 | classifier.y_test_flows, 808 | xg_y_test_flows_predicted, 809 | classifier.flow_ids, 810 | "xg_flows") 811 | print(output) 812 | # xg_cm_dict, classifier.y_test_flows, xg_y_test_predicted, "xg", False) 813 | avg_scores, output = 
classifier.avg_f1_scores(xg_f1_scores_flows, classifier.flow_ids) 814 | print(output) 815 | # print(xg_f1_scores) 816 | 817 | print(classifier.classification_results) 818 | if RF_ENABLED or GB_ENABLED or XG_ENABLED: 819 | classifier.save_results() -------------------------------------------------------------------------------- /Offline_ETC/noms2023_instant_messaging_traffic_classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import itertools 6 | from os import listdir 7 | from os.path import isfile, join 8 | import sys 9 | import time 10 | 11 | import pandas as pd 12 | 13 | import numpy as np 14 | 15 | from scipy.stats import kurtosis, skew 16 | 17 | from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures 18 | from sklearn.compose import ColumnTransformer 19 | from sklearn.pipeline import Pipeline 20 | from sklearn.metrics import classification_report, f1_score, confusion_matrix, ConfusionMatrixDisplay 21 | 22 | import seaborn as sns 23 | import matplotlib.pyplot as plt 24 | 25 | from encrypted_traffic_classification import EncryptedTrafficClassifier, EncryptedTrafficClassifierIterator 26 | 27 | ######################################## 28 | # Data preparation: convert RAW data 29 | ######################################## 30 | class NOMS2023InstantMessagingClassifier(EncryptedTrafficClassifier): 31 | def __init__(self, nb_folds, nb_packets_per_flow): 32 | super().__init__( 33 | nb_folds= nb_folds, 34 | nb_packets_per_flow = nb_packets_per_flow, 35 | filename_prefix = "noms2023_im", 36 | processed_data_output_dir = "noms2023_im_output/", 37 | data_dir = "data/noms2023_im/" 38 | ) 39 | 40 | pools = [tuple(pool) for pool in [self.nb_packets_per_flow, range(self.nb_folds)]] 41 | result = [[]] 42 | for pool in pools: 43 | result = [x+[y] for x in result for y in pool] 44 | self.flow_ids = result 45 | 46 | pools = [tuple(pool) for pool in [self.nb_packets_per_flow]] 47 | result = [[]] 48 | for pool in pools: 49 | result = [x+[y] for x in result for y in pool] 50 | self.flow_ids_without_folds = result 51 | 52 | pools = [tuple(pool) for pool in [range(self.nb_folds)]] 53 | result = [[]] 54 | for pool in pools: 55 | result = [x+[y] for x in result for y in pool] 56 | self.packet_ids = result 57 | 58 | ######################################## 59 | # Preprocessing 60 | ######################################## 61 | def data_preparation(self): 62 | print("data_preparation") 63 | import warnings 64 | warnings.filterwarnings("ignore") 65 | 66 | df_flows = {} 67 | files = [f for f in listdir(self.data_dir) if isfile(join(self.data_dir, f))] 68 | for _i in range(len(files)): 69 | files[_i] = self.data_dir + "/" + files[_i] 70 | 71 | # print(files) 72 | for i in self.nb_packets_per_flow: 73 | self.__generate_pickle_for_n_packets(i, files) 74 | 75 | def _get_flows_with_all_packets(self): 76 | print("_get_flows_with_all_packets") 77 | 78 | self.classes = set() 79 | start_time = time.time() 80 | nb_flows = 0 81 | df_flows = pd.DataFrame() 82 | files = [f for f in listdir(self.data_dir) if isfile(join(self.data_dir, f))] 83 | for _i in range(len(files)): 84 | f = self.data_dir + "/" + files[_i] 85 | # print("f=", f) 86 | df_new = pd.read_csv(f, 87 | names = [ 88 | 'flow_id', 89 | 'timestamp', 90 | 'iat', 91 | 'source', 92 | 'sport', 93 | 'dest', 94 | 'dport', 95 | 'protocol', 96 | 'length' 97 | ], 98 | header = 0 99 | ) 100 | print(f, df_new.shape) 101 | 102 
| # drop DNS traffic 103 | df_new = df_new.drop(df_new[df_new['sport'] == 53].index) 104 | df_new = df_new.drop(df_new[df_new['dport'] == 53].index) 105 | 106 | found = False 107 | for _c in self.all_classes: 108 | if _c in f: 109 | found = True 110 | df_new['class'] = _c 111 | self.classes.add(_c) 112 | break 113 | if found == False: 114 | print("class not identified for", f) 115 | dfs = [] 116 | # extract flow and add statistical features 117 | for flow_id in df_new['flow_id'].unique(): 118 | nb_flows += 1 119 | d = df_new[df_new['flow_id'] == flow_id].head(n = 1) 120 | d['nb_packets'] = len(df_new[df_new['flow_id'] == flow_id]) 121 | c = d['class'].tolist() 122 | dport = d.dport.tolist() 123 | sport = d.sport.tolist() 124 | #print(d) 125 | _df = df_new.loc[df_new['flow_id'] == flow_id, 'iat'] 126 | d['sum_iat'] = np.sum(_df) 127 | 128 | _df = df_new.loc[df_new['flow_id'] == flow_id, 'length'] 129 | d['sum_length'] = np.sum(_df) 130 | dfs.append(d) 131 | _d = pd.concat(dfs) 132 | df_flows = pd.concat([df_flows, _d]) 133 | # uncomment for debugging 134 | # break 135 | 136 | print(f, "processed in ", time.time() - start_time, "seconds.") 137 | print("%d flows processed" % nb_flows) 138 | # Finish processing the data, create the train/tests split and save as pickle files 139 | df_flows = df_flows.fillna(0) 140 | 141 | self.classes = list(self.classes) 142 | self._hotencode_class(df_flows) 143 | return df_flows 144 | 145 | def __statistical_features(self, df, n, df_flows, f, nb_flows): 146 | nb_flows[0] += 1 147 | # d = df.head(n = 1) 148 | d = df 149 | _df_new = df.head(n = n) 150 | d['nb_packets'] = len(_df_new) #df_new[df_new['flow_id'] == flow_id]) 151 | c = d['class'].tolist() 152 | dport = d.dport.tolist() 153 | sport = d.sport.tolist() 154 | #print(d) 155 | _df = _df_new['iat'] 156 | d['min_iat'] = np.min(df[df['iat'] > 0]['iat']) # probably useless as most probably always 0 for the first packet 157 | d['max_iat'] = np.max(_df) 158 | d['sum_iat'] = np.sum(_df) 159 | d['mean_iat'] = np.mean(_df) 160 | d['median_iat'] = np.median(_df) 161 | d['std_iat'] = np.std(_df) 162 | d['1stQ_iat'] = np.quantile(_df, 0.25) 163 | d['3rdQ_iat'] = np.quantile(_df, 0.75) 164 | _a = list(_df) 165 | d['skew_iat'] = skew(_a) 166 | d['kurt_iat'] = kurtosis(_a) 167 | 168 | _df = _df_new['length'] 169 | d['min_length'] = np.min(_df) 170 | d['max_length'] = np.max(_df) 171 | d['sum_length'] = np.sum(_df) 172 | d['median_length'] = np.median(_df) 173 | d['mean_length'] = np.mean(_df) 174 | d['std_length'] = np.std(_df) 175 | d['1stQ_length'] = np.quantile(_df, 0.25) 176 | d['3rdQ_length'] = np.quantile(_df, 0.75) 177 | _a = list(_df) 178 | d['skew_length'] = skew(_a) 179 | d['kurt_length'] = kurtosis(_a) 180 | 181 | d['src'] = f 182 | # dfs.append(d) 183 | df_flows = pd.concat([d, df_flows]) 184 | if nb_flows[0] % 1000 == 0: 185 | # print(self.classes) 186 | print("nb flows processed is %d" % nb_flows[0]) 187 | print("df_flows.shape", df_flows.shape) 188 | print(d.columns) 189 | print(df_flows.columns) 190 | return d 191 | 192 | def __generate_pickle_for_n_packets(self, n, files): 193 | print("__generate_pickle_for_n_packets n =", n) 194 | 195 | for fold in range(self.nb_folds): 196 | if self._test_data_prepared((n, fold)): 197 | print("pickle files detected for ", n, "packets") 198 | return 199 | nb_flows = [0] 200 | df_flows = pd.DataFrame() 201 | dfs = [] 202 | self.classes = set() 203 | start_time = time.time() 204 | for f in files: 205 | # print("f=", f) 206 | df_new = pd.read_csv(f, 207 | names = [ 208 | 
'packet_id', 209 | 'timestamp', 210 | 'iat', 211 | 'source', 212 | 'sport', 213 | 'dest', 214 | 'dport', 215 | 'protocol', 216 | 'length', 217 | 'flow_id', 218 | ], 219 | header = 0, 220 | index_col = False 221 | ) 222 | print(n, f, df_new.shape) 223 | #print(df_new) 224 | # drop DNS traffic 225 | df_new = df_new.drop(df_new[df_new['sport'] == 53].index) 226 | df_new = df_new.drop(df_new[df_new['dport'] == 53].index) 227 | 228 | found = False 229 | for _c in self.all_classes: 230 | if _c in f: 231 | found = True 232 | df_new['class'] = _c 233 | self.classes.add(_c) 234 | break 235 | if found == False: 236 | print("class not identified for", f) 237 | 238 | print("nb flows = ", len(df_new['flow_id'].unique())) 239 | #df_new.groupby(by = 'flow_id', group_keys = False).apply(self.__statistical_features, n, df_flows, f, nb_flows) 240 | # extract flow and add statistical features 241 | for flow_id in df_new['flow_id'].unique(): 242 | nb_flows[0] += 1 243 | d = df_new[df_new['flow_id'] == flow_id].head(n = 1) 244 | _df_new = df_new[df_new['flow_id'] == flow_id].head(n = n) 245 | d['nb_packets'] = len(_df_new) #df_new[df_new['flow_id'] == flow_id]) 246 | if n != 600000 and d['nb_packets'].iloc[0] != n: 247 | print("Flow #", flow_id," has only", d['nb_packets'].iloc[0]," packets, skipping...") 248 | continue 249 | c = d['class'].tolist() 250 | dport = d.dport.tolist() 251 | sport = d.sport.tolist() 252 | #print(d) 253 | _df = _df_new['iat'] 254 | d['sum_iat'] = np.sum(_df) 255 | if d['sum_iat'].iloc[0] == 0: 256 | print("Total duration is 0 for flow #", flow_id, ", skipping...") 257 | continue 258 | d['min_iat'] = np.min(df_new[df_new['iat'] > 0]['iat']) # probably useless as most probably always 0 for the first packet 259 | d['max_iat'] = np.max(_df) 260 | d['mean_iat'] = np.mean(_df) 261 | d['median_iat'] = np.median(_df) 262 | d['std_iat'] = np.std(_df) 263 | try: 264 | d['1stQ_iat'] = np.quantile(_df, 0.25) 265 | except Exception as e: 266 | d['1stQ_iat'] = 0 267 | 268 | try: 269 | d['3rdQ_iat'] = np.quantile(_df, 0.75) 270 | except Exception as e: 271 | d['3rdQ_iat'] = 0 272 | _a = list(_df) 273 | try: 274 | d['skew_iat'] = skew(_a) 275 | except Exception as e: 276 | d['skew_iat'] = 0 277 | try: 278 | d['kurt_iat'] = kurtosis(_a) 279 | except Exception as e: 280 | d['kurt_iat'] = 0 281 | 282 | _df = _df_new['length'] 283 | d['min_length'] = np.min(_df) 284 | d['max_length'] = np.max(_df) 285 | d['sum_length'] = np.sum(_df) 286 | d['median_length'] = np.median(_df) 287 | d['mean_length'] = np.mean(_df) 288 | d['std_length'] = np.std(_df) 289 | try: 290 | d['1stQ_length'] = np.quantile(_df, 0.25) 291 | except Exception as e: 292 | d['1stQ_length'] = 0 293 | try: 294 | d['3rdQ_length'] = np.quantile(_df, 0.75) 295 | except Exception as e: 296 | d['3rdQ_length'] = 0 297 | _a = list(_df) 298 | try: 299 | d['skew_length'] = skew(_a) 300 | except Exception as e: 301 | d['skew_length'] = 0 302 | try: 303 | d['kurt_length'] = kurtosis(_a) 304 | except Exception as e: 305 | d['kurt_length'] = 0 306 | 307 | d['src'] = f 308 | dfs.append(d) 309 | # df_flows = pd.concat([d, df_flows]) 310 | 311 | df_flows = pd.concat(dfs) 312 | 313 | print(f, "processed in ", time.time() - start_time, "seconds.") 314 | print("%d flows processed" % nb_flows[0]) 315 | # Finish processing the data, create the train/tests split and save as pickle files 316 | df_flows = df_flows.fillna(0) 317 | 318 | self.classes = list(self.classes) 319 | self._hotencode_class(df_flows) 320 | 321 | filename = self.filename_prefix + "_" + 
str(n) + ".pickle" 322 | self._generate_data_folds(df_flows, filename) 323 | 324 | ######################################## 325 | # Data Analysis 326 | ######################################## 327 | def __show_actual_and_predicted(self, X, y, y_pred, _class): 328 | print(self.classes) 329 | for _i in itertools.product(NB_PACKETS, self.filenames): 330 | i = (_i[0], _i[1], 0) 331 | print(i) 332 | df = X[i].copy() 333 | df['type'] = y[i] 334 | df['type_pred'] = y_pred[i] 335 | print(df.columns) 336 | a4_dims = (23.4, 16.54) 337 | fig, ax = plt.subplots(figsize = a4_dims) 338 | sns.lmplot( 339 | x = 'sum_iat', 340 | y = 'sum_length', 341 | data = df[df['type'] == _class], 342 | hue = 'type', 343 | fit_reg = False, 344 | height = 4, aspect = 5, 345 | # color = 'green', 346 | # scatter_kws = {'alpha': 0.3}, 347 | # ax = ax, 348 | legend = False, 349 | palette = 'viridis' 350 | ) 351 | #ax.set(xlabel='time_delta', ylabel='packet_size') 352 | ax.set(xlabel = 'duration', ylabel = 'sum_packet_size') 353 | plt.legend(title = 'Class', labels =self.classes) 354 | plt.savefig(self.filename_prefix + "_" + self.classes[_class] + "_"+ str(i[0]) + "_" + i[1] + ".png", format = 'png') 355 | fig, ax2 = plt.subplots(figsize = a4_dims) 356 | sns.lmplot( 357 | x = 'sum_iat', 358 | y = 'sum_length', 359 | data = df[df['type_pred'] == _class], 360 | hue = 'type', 361 | fit_reg = False, 362 | height = 4, aspect = 5, 363 | # color = 'orange', 364 | # scatter_kws = {'alpha': 0.3}, 365 | legend = False, 366 | palette = 'viridis', 367 | # ax = ax2 368 | ) 369 | ax2.set(xlabel = 'duration', ylabel = 'sum_packet_size') 370 | plt.legend(title = 'Class', labels =self.classes) 371 | plt.savefig(self.filename_prefix + "_" + self.classes[_class] + "_pred_"+ str(i[0]) + "_" + i[1]+".png", format = 'png') 372 | 373 | ######################################## 374 | # Prediction 375 | ######################################## 376 | 377 | 378 | ######################################## 379 | # Akem's methods 380 | ######################################## 381 | # Feature Importance 382 | """ 383 | Function to Fit model based on optimal values of depth and number of estimators and use it 384 | to compute feature importance for all the features. 
385 | """ 386 | def get_feature_importance(depth, n_tree, max_leaf, X_train, y_train): 387 | from sklearn.ensemble import RandomForestClassifier 388 | 389 | # rf_opt = RandomForestClassifier(max_depth = depth, n_estimators = n_tree, random_state=42, bootstrap=False) 390 | rf_opt = RandomForestClassifier(max_depth = depth, n_estimators = n_tree, max_leaf_nodes=max_leaf, random_state=42, bootstrap=False) 391 | rf_opt.fit(X_train, y_train) 392 | feature_importance = pd.DataFrame(rf_opt.feature_importances_) 393 | feature_importance.index = X_train.columns 394 | feature_importance = feature_importance.sort_values(by=list(feature_importance.columns),axis=0,ascending=False) 395 | 396 | return feature_importance 397 | 398 | 399 | """ 400 | Function to Fit model based on optimal values of depth and number of estimators and feature importance 401 | to find the fewest possible features to exceed the previously attained score with all selected features 402 | """ 403 | def get_fewest_features(depth, n_tree, max_leaf, importance): 404 | sorted_feature_names = importance.index 405 | # print('sorted_feature_names: ', sorted_feature_names) 406 | features = [] 407 | for f in range(1,len(sorted_feature_names)+1): 408 | features.append(sorted_feature_names[0:f]) 409 | # print('features:', features) 410 | return features 411 | 412 | 413 | def get_result_scores(classes, cl_report): 414 | precision=[] 415 | recall=[] 416 | f1_score=[] 417 | supports=[] 418 | for a_class in classes: 419 | precision.append(cl_report[a_class]['precision']) 420 | recall.append(cl_report[a_class]['recall']) 421 | f1_score.append(cl_report[a_class]['f1-score']) 422 | supports.append(cl_report[a_class]['support']) 423 | return precision, recall, f1_score, supports 424 | 425 | 426 | def get_scores(classes, depth, n_tree, feats, max_leaf, X_train, y_train, X_test, y_test): 427 | from sklearn.ensemble import RandomForestClassifier 428 | model = RandomForestClassifier(max_depth=depth, n_estimators = n_tree, max_leaf_nodes=max_leaf, n_jobs=4, 429 | random_state=42, bootstrap=False) 430 | 431 | model.fit(X_train[feats], y_train) 432 | y_pred = model.predict(X_test[feats]) 433 | 434 | class_report = classification_report(y_test, y_pred, target_names=classes, output_dict = True) 435 | 436 | accurac = model.score(X_test[feats], y_test) 437 | macro_score = class_report['macro avg']['f1-score'] 438 | weighted_score = class_report['weighted avg']['f1-score'] 439 | 440 | return model, class_report, macro_score, weighted_score, y_pred, accurac 441 | 442 | 443 | def get_x_y(Dataset, classes, feats): 444 | Dataset = Dataset[Dataset["Label"].isin(classes)] 445 | X = Dataset[feats] 446 | y = Dataset['Label'].replace(classes, range(len(classes))) 447 | # y = Dataset.columns[-1].replace(classes, range(len(classes))) 448 | 449 | return X, y 450 | 451 | def analyze_models(classes, model_type, depths, n_trees, X_train, y_train, X_test, y_test, max_leaf, filename_out): 452 | 453 | with open(filename_out, "w") as res_file: 454 | print('depth;tree;n_feat;Macro_F1;Weighted_F1;Accuracy;feats;c_report', file=res_file) 455 | if model_type == 'RF': 456 | # FOR EACH (depth, n_tree, feat) 457 | for depth in depths: 458 | for n_tree in n_trees: 459 | # get feature orders to use 460 | importance = get_feature_importance(depth, n_tree, max_leaf, X_train, y_train) 461 | 462 | m_feats = get_fewest_features(depth, n_tree, max_leaf, importance) 463 | for feats in m_feats: 464 | # Get the scores with the given (depth, n_tree, feat) 465 | model, c_report, macro_f1, weight_f1, 
y_pred, accuracs = get_scores(classes, depth, n_tree, feats, max_leaf, X_train, y_train, X_test, y_test) 466 | 467 | print(str(depth)+';'+str(n_tree)+';'+str(len(feats))+';'+str(macro_f1)+';'+str(weight_f1)+';'+str(accuracs)+';'+str(list(feats))+';'+str(c_report), file=res_file) 468 | print("Analysis Complete. Check output file.") 469 | return [] 470 | 471 | # N = number of packets in flows, feats = array of feature names to use, feat_name = string to add to output file name 472 | def analyze_models_for_npkts(self, N, feats, feat_name): 473 | i = (N, self.filenames[0], 0) 474 | print("Number of packets per flow: ", N) 475 | 476 | X_trains, y_trains = X_train[i][feats], y_train[i] 477 | X_tests, y_tests = X_test[i][feats], y_test[i] 478 | 479 | results_file = "Models_" + feat_name + "_" + str(N) + "_pkts_.csv" 480 | analyze_models(self.classes, "RF", range(7, 20, 1), range(1, 8, 2), X_trains, y_trains, X_tests, y_tests, 500, results_file) 481 | 482 | results = pd.read_csv(results_file, sep=';') 483 | results = results.sort_values(by=['Weighted_F1','Macro_F1'],ascending=False) 484 | print(results.head(10)) 485 | print("******") 486 | print(results.head(1)['c_report'].values) 487 | 488 | ######################################## 489 | # GBoost 490 | ######################################## 491 | def GBoost_predict(self, feats): 492 | print("GBoost_predict") 493 | from sklearn.ensemble import GradientBoostingClassifier 494 | gb_model = {} 495 | 496 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 497 | gb_model[i] = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state = 42) 498 | 499 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 500 | print("==",i,"==") 501 | try: 502 | gb_model[i].fit(X_train[i][feats], y_train[i]) 503 | except ValueError as e: 504 | print(e) 505 | pass 506 | 507 | gb_y_train_predicted = {} 508 | gb_y_test_predicted = {} 509 | gb_train_score = {} 510 | gb_test_score = {} 511 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 512 | print("==",i,"==") 513 | gb_y_train_predicted[i] = gb_model[i].predict(X_train[i][feats]) 514 | gb_y_test_predicted[i] = gb_model[i].predict(X_test[i][feats]) 515 | gb_train_score[i] = gb_model[i].score(X_train[i][feats], y_train[i]) 516 | gb_test_score[i] = gb_model[i].score(X_test[i][feats], y_test[i]) 517 | 518 | self._get_scores_from_models(gb_model, y_test, gb_y_test_predicted, feats) 519 | 520 | gb_cm_dict = {} 521 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 522 | print("==",i,"==") 523 | gb_cm_dict[i] = confusion_matrix(y_test[i], gb_y_test_predicted[i].astype(int)) 524 | print(gb_cm_dict[i]) 525 | 526 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 527 | pkt, _ = i 528 | classification_results.loc[classification_results['nb_packets'] == pkt, 'gb_train_score'] = gb_train_score[i] 529 | classification_results.loc[classification_results['nb_packets'] == pkt, 'gb_test_score'] = gb_test_score[i] 530 | 531 | return gb_model, gb_y_train_predicted, gb_y_test_predicted 532 | 533 | 534 | ######################################## 535 | # Entry point 536 | ######################################## 537 | if __name__ == "__main__": 538 | parser = argparse.ArgumentParser( 539 | prog='noms2023_instant_messaging_traffic_classifier', 540 | description='Classify packets or flows from NOMS 2023 Encrypted Mobile Instant Messaging', 541 | epilog='' 542 | ) 543 | parser.add_argument('-p', '--nb_packets', action = 'append', type = int, required = True) #, 
default = [4, 8]) 544 | parser.add_argument('-c', '--classifier', action = 'append', type = str) #, default = ['rf']) 545 | parser.add_argument('-f', '--nb_folds', action = 'store', default = 12, type = int) 546 | parser.add_argument('-v', '--visualization', action = 'store_true', required = False, default = False) 547 | parser.add_argument('-r', '--report', action = 'store_true', required = False, default = False) 548 | parser.add_argument('-F', '--force_rf_classification', action = 'store_true', required = False, default = False) 549 | args = parser.parse_args(sys.argv[1:]) 550 | 551 | # NB_PACKETS = [2, 3, 4, 5, 6, 7, 8, 9, 10, 600000] 552 | VISUALIZATION_ENABLED = False 553 | if args.visualization == True: 554 | VISUALIZATION_ENABLED = True 555 | 556 | RF_ENABLED = False 557 | GB_ENABLED = False 558 | XG_ENABLED = False 559 | for c in args.classifier: 560 | c = c.lower() 561 | if c == "rf": 562 | RF_ENABLED = True 563 | elif c == "gb": 564 | GB_ENABLED = True 565 | elif c == "xg": 566 | XG_ENABLED = True 567 | else: 568 | print("Unknown classifier", c) 569 | 570 | classifier = NOMS2023InstantMessagingClassifier( 571 | nb_folds = args.nb_folds, 572 | nb_packets_per_flow = args.nb_packets 573 | ) 574 | 575 | if args.force_rf_classification == True: 576 | classifier.force_rf_classification = True 577 | 578 | classifier.all_classes = [ 579 | "discord", 580 | "messenger", 581 | "signal", 582 | "teams", 583 | "telegram", 584 | "whatsapp", 585 | # Non Instant Messenging 586 | #"all_background", 587 | #"gmail", 588 | #"browsing", 589 | #"youtube", 590 | ] 591 | 592 | non_needed_features = [ 593 | 'packet_id', 594 | 'flow_id', 595 | 'class', 596 | 'source', 597 | 'dest', 598 | 'sport', 599 | 'dport', 600 | 'protocol', 601 | 'timestamp', 602 | # 'nb_packets', 603 | 'src', 604 | 'iat', 605 | 'direction', 606 | 'length' 607 | ] 608 | 609 | all_features_flows = [ 610 | 'min_iat', 611 | 'max_iat', 612 | 'sum_iat', 613 | 'mean_iat', 614 | 'median_iat', 615 | 'std_iat', 616 | '1stQ_iat', 617 | '3rdQ_iat', 618 | 'skew_iat', 619 | 'kurt_iat', 620 | 'min_length', 621 | 'max_length', 622 | 'sum_length', 623 | 'median_length', 624 | 'mean_length', 625 | 'std_length', 626 | '1stQ_length', 627 | '3rdQ_length', 628 | 'skew_length', 629 | 'kurt_length', 630 | 'nb_packets', 631 | # 'sport', 632 | # 'dport', 633 | # 'protocol', 634 | # 'direction' 635 | ] 636 | # best_features = [ 637 | # 'max_iat', 638 | # 'sum_iat', 639 | # 'mean_iat', 640 | # 'median_iat', 641 | # 'std_iat', 642 | # '1stQ_iat', 643 | # '3rdQ_iat', 644 | # 'skew_iat', 645 | # 'kurt_iat', 646 | # 'min_length', 647 | # 'max_length', 648 | # 'sum_length', 649 | # 'median_length', 650 | # 'mean_length', 651 | # 'std_length', 652 | # '1stQ_length', 653 | # '3rdQ_length', 654 | # 'skew_length', 655 | # 'kurt_length' 656 | # ] 657 | best_features = ['3rdQ_iat', 'std_iat', 'std_length', 'skew_iat', 'max_iat', 'sum_iat', 'mean_length', '1stQ_length', 'max_length', 'mean_iat', 'min_length', 'sum_length', 'median_length', '1stQ_iat', 'median_iat', '3rdQ_length', 'kurt_iat', 'kurt_length', 'nb_packets'] 658 | online_features=[ 659 | 'sum_iat', 660 | 'sum_length', 661 | 'max_length', 662 | 'mean_iat', 663 | 'max_iat', 664 | 'mean_length', 665 | 'min_length', 666 | 'min_iat' 667 | ] 668 | feats_flows = all_features_flows 669 | 670 | # Preprocessing 671 | if not classifier.data_prepared(): 672 | classifier.data_preparation() 673 | classifier.load_flows() 674 | else: 675 | classifier.load_flows() 676 | _c = 
classifier.y_train_flows[(classifier.nb_packets_per_flow[0], 0)].unique() 677 | #_df_tmp = classifier.y_train_flows[(classifier.nb_packets_per_flow[0], 0)] 678 | #_df2_tmp = classifier.y_test_flows[(classifier.nb_packets_per_flow[0], 0)] 679 | #print(_df_tmp.value_counts()) 680 | #print(_df2_tmp.value_counts()) 681 | #sys.exit(1) 682 | classifier.classes = [-1 for _ in range(len(classifier.all_classes) + 4)] 683 | _Xy = classifier.X_train_flows[(classifier.nb_packets_per_flow[0], 0)].copy() 684 | _Xy['type'] = classifier.y_train_flows[(classifier.nb_packets_per_flow[0], 0)] 685 | for index, row in _Xy.iterrows(): 686 | for _i in range(len(classifier.all_classes)): 687 | if classifier.all_classes[_i] in row['src']: 688 | classifier.classes[row['type']] = classifier.all_classes[_i] 689 | break 690 | #classifier.classes[i] .append(classifier.all_classes[i]) 691 | if -1 not in classifier.classes: 692 | break 693 | _n = 0 694 | for _i in range(len(classifier.classes)): 695 | if classifier.classes[_i] == -1: 696 | _Xy = _Xy.drop(_Xy[_Xy['type'] == _i].index) 697 | print("dropping", _i) 698 | _n += 1 699 | if _i < len(classifier.classes) - 1: 700 | if _n > 0: 701 | _Xy.loc[_Xy['type'] == (_i + 1),'type'] = _i - _n +1 702 | print(_i+1,"->", _i - _n + 1) 703 | #for i in _c: 704 | # classifier.classes.append(classifier.all_classes[i]) 705 | keep = True 706 | while keep: 707 | try: 708 | classifier.classes.remove(-1) 709 | except ValueError: 710 | keep = False 711 | classes_dict = {} 712 | for _i in range(len(classifier.classes)): 713 | classes_dict[_i] = classifier.classes[_i] 714 | _Xy['class'] = _Xy['type'].map(classes_dict) 715 | print("classes =",classifier.classes) 716 | pkt = classifier.nb_packets_per_flow[0] 717 | # classifier._distribution(_Xy, classifier.filename_prefix + "_flows_class_split_" + str(pkt) + '_pkt_6_IMA') 718 | # sys.exit(1) 719 | 720 | classifier.cleanup_data(classifier.X_train_flows, 721 | classifier.y_train_flows, 722 | classifier.X_test_flows, 723 | classifier.y_test_flows, 724 | classifier.flow_ids, 725 | non_needed_features) 726 | 727 | # scaling during processing make results wore ! 
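# Illustrative sketch (not part of the original script): the preprocessing call
# commented out just below was disabled because, as noted above, scaling the flow
# features made the tree-based results worse. If scaling were ever re-enabled, a
# minimal per-(nb_packets, fold) version could look like the lines below. It
# assumes EncryptedTrafficClassifierIterator yields the same (nb_packets, fold)
# keys used to index the flow dictionaries (as in GBoost_predict) and reuses the
# StandardScaler already imported at the top of this file; uncomment to try it.
#
# classifier.X_train_flows_fitted, classifier.X_test_flows_fitted = {}, {}
# for _key in EncryptedTrafficClassifierIterator(classifier.flow_ids):
#     _scaler = StandardScaler().fit(classifier.X_train_flows[_key][feats_flows])
#     classifier.X_train_flows_fitted[_key] = pd.DataFrame(
#         _scaler.transform(classifier.X_train_flows[_key][feats_flows]),
#         columns = feats_flows, index = classifier.X_train_flows[_key].index)
#     classifier.X_test_flows_fitted[_key] = pd.DataFrame(
#         _scaler.transform(classifier.X_test_flows[_key][feats_flows]),
#         columns = feats_flows, index = classifier.X_test_flows[_key].index)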
728 | # classifier.X_train_flows_fitted, classifier.X_test_flows_fitted = classifier.preprocessing( 729 | # classifier.X_train_flows, 730 | # classifier.y_train_flows, 731 | # classifier.X_test_flows, 732 | # classifier.y_test_flows, 733 | # classifier.flow_ids, 734 | # feats_flows 735 | # ) 736 | classifier.X_train_flows_fitted = classifier.X_train_flows 737 | classifier.X_test_flows_fitted = classifier.X_test_flows 738 | 739 | # __correlation() 740 | # analyze_models_for_npkts(10, all_features, "all_feats") 741 | if args.report == True: 742 | classifier._viz(distribution = 0, class_distribution = -1, nb_packets = -1, min_iat = -1, max_iat = -1) 743 | for n in classifier.nb_packets_per_flow: 744 | if n == 4: 745 | classifier._viz(distribution = -1, class_distribution = 11, nb_packets = 0, min_iat = -1, max_iat = -1) 746 | elif n == 8: 747 | classifier._viz(distribution = -1, class_distribution = 11, nb_packets = 0, min_iat = -1, max_iat = -1) 748 | elif n == 600000: 749 | classifier._viz(distribution = -1, class_distribution = 11, nb_packets = 1, min_iat = 1, max_iat = -1) 750 | sys.exit(1) 751 | if VISUALIZATION_ENABLED: 752 | # f = classifier.filename_prefix + '_datasetflows_distribution.pickle' 753 | # if isfile(classifier.processed_data_output_dir + f): 754 | # print("Loading dataset from pickle file", f) 755 | # _df = classifier._load_pickle(f) 756 | # else: 757 | # print("Creating dataset") 758 | # _df = classifier._get_flows_with_all_packets() 759 | # classifier._pickle_dump(_df, f) 760 | # print("Dataset saved in file", f) 761 | # classifier._class_distribution(_df, classifier.filename_prefix + '_flows_distribution') 762 | # classifier._nb_packets_distribution(_df, classifier.filename_prefix + "_flows_nb_packets_distribution" ) 763 | # classifier._distribution(_df, classifier.filename_prefix + "_flows_class_split" ) 764 | classifier._viz(distribution = 0, class_distribution = 11, nb_packets = -1, min_iat = -1, max_iat = -1) 765 | # pkt = classifier.nb_packets_per_flow[0] 766 | # fold = 0 767 | # _i = pkt, fold 768 | # _df1 = classifier.X_train_flows[_i].copy() 769 | # _df1['type'] = classifier.y_train_flows[_i] 770 | # _df2 = classifier.X_test_flows[_i].copy() 771 | # _df2['type'] = classifier.y_test_flows[_i] 772 | # _df = pd.concat([_df1, _df2]) 773 | # _df.reset_index() 774 | # print(_df.shape) 775 | # classifier._distribution(_df, classifier.filename_prefix + "_flows_class_split_" + str(pkt) + '_pkt') 776 | # classifier._class_distribution(_df, classifier.filename_prefix + '_flows_distribution_' + str(pkt) + '_pkt') 777 | # # classifier._nb_packets_distribution(_df, classifier.filename_prefix + "_flows_nb_packets_distribution_" + str(pkt) + '_pkt') 778 | # classifier._min_iat_distribution(_df, classifier.filename_prefix + "_flows_min_iat_distribution_" + str(pkt) + '_pkt') 779 | # classifier._max_iat_distribution(_df, classifier.filename_prefix + "_flows_max_iat_distribution_" + str(pkt) + '_pkt') 780 | sys.exit(1) 781 | 782 | if RF_ENABLED: 783 | print("==== RandomForest =====") 784 | """ 785 | classifier.X_train_flows_fitted, classifier.X_test_flows_fitted = classifier.preprocessing( 786 | classifier.X_train_flows, 787 | classifier.y_train_flows, 788 | classifier.X_test_flows, 789 | classifier.y_test_flows, 790 | classifier.flow_ids, 791 | feats_flows 792 | ) 793 | """ 794 | # classifier.X_train_flows_fitted = classifier.X_train_flows 795 | # classifier.X_test_flows_fitted = classifier.X_test_flows 796 | 797 | rf_regr_flows, rf_y_train_flows_predicted, 
rf_y_test_flows_predicted = classifier.RF_predict( 798 | classifier.X_train_flows_fitted, 799 | classifier.y_train_flows, 800 | classifier.X_test_flows_fitted, 801 | classifier.y_test_flows 802 | ) 803 | 804 | # __show_actual_and_predicted(X_test, y_test, rf_y_test_predicted, 1) 805 | rf_cm_dict_flows, output = classifier.confusion_matrix(rf_regr_flows, 806 | classifier.y_test_flows, 807 | rf_y_test_flows_predicted, 808 | classifier.flow_ids, 809 | "rf" 810 | ) 811 | print(output) 812 | rf_f1_scores_flows, output = classifier.get_F1_score(rf_cm_dict_flows, 813 | classifier.y_test_flows, 814 | rf_y_test_flows_predicted, 815 | classifier.flow_ids, 816 | "rf_flows") 817 | print(output) 818 | avg_scores, output = classifier.avg_f1_scores(rf_f1_scores_flows, classifier.flow_ids) #_without_folds) 819 | print(output) 820 | # rf_cm_dict = classifier.confusion_matrix(rf_regr, rf_y_test_predicted, False) 821 | # rf_f1_scores = classifier.get_F1_score(classification_results, rf_cm_dict, y_test, rf_y_test_predicted, "rf", False) 822 | # classifier.avg_f1_scores(rf_f1_scores) 823 | 824 | if GB_ENABLED: 825 | print("==== GradientBoosting =====") 826 | gb_regr, gb_y_train_predicted, gb_y_test_predicted = classifier.GBoost_predict(feats_flows, classification_results) 827 | gb_cm_dict = classifier.confusion_matrix(gb_regr, classifier.y_test_flows, gb_y_test_predicted, classifier.flow_ids, "gb") 828 | gb_f1_scores = classifier.get_F1_score(gb_cm_dict, y_test, gb_y_test_predicted, "gb", False) 829 | classifier.avg_f1_scores(gb_f1_scores_flows, classifier.flow_ids_without_folds) 830 | classifier.avg_f1_scores(gb_f1_scores) 831 | 832 | if XG_ENABLED: 833 | print("==== XGBoost =====") 834 | xg_regr, xg_y_train_predicted, xg_y_test_flows_predicted = classifier.XGBoost_predict( 835 | classifier.X_train_flows_fitted, 836 | classifier.y_train_flows, 837 | classifier.X_test_flows_fitted, 838 | classifier.y_test_flows 839 | ) 840 | 841 | # feats_flows, classification_results) 842 | # xg_cm_dict = classifier.confusion_matrix(xg_regr, xg_y_test_predicted, False) 843 | xg_cm_dict_flows, output = classifier.confusion_matrix(xg_regr, 844 | classifier.y_test_flows, 845 | xg_y_test_flows_predicted, 846 | classifier.flow_ids, 847 | "xg" 848 | ) 849 | print(output) 850 | 851 | xg_f1_scores_flows, output = classifier.get_F1_score( 852 | xg_cm_dict_flows, 853 | classifier.y_test_flows, 854 | xg_y_test_flows_predicted, 855 | classifier.flow_ids, 856 | "xg_flows") 857 | print(output) 858 | # xg_cm_dict, classifier.y_test_flows, xg_y_test_predicted, "xg", False) 859 | avg_scores, output = classifier.avg_f1_scores(xg_f1_scores_flows, classifier.flow_ids) 860 | print(output) 861 | 862 | print(classifier.classification_results) 863 | if RF_ENABLED or GB_ENABLED or XG_ENABLED: 864 | classifier.save_results() 865 | -------------------------------------------------------------------------------- /Offline_ETC/ucdavis_quic_classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | import os 7 | from os.path import isfile, join 8 | import sys 9 | import time 10 | 11 | import numpy as np 12 | 13 | import pandas as pd 14 | 15 | from scipy.stats import kurtosis, skew 16 | 17 | from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures 18 | from sklearn.pipeline import Pipeline 19 | from sklearn.compose import ColumnTransformer 20 | 21 | from sklearn.experimental import enable_iterative_imputer 22 
| from sklearn.impute import SimpleImputer, IterativeImputer 23 | 24 | from encrypted_traffic_classification import EncryptedTrafficClassifier, EncryptedTrafficClassifierIterator 25 | 26 | REGENERATE_FLOWS_DATA = False 27 | 28 | TEST_FLOWS = True 29 | TEST_PACKETS = False 30 | 31 | ######################################## 32 | # Data preparation: convert RAW data 33 | ######################################## 34 | class UCDavisQuicClassifier(EncryptedTrafficClassifier): 35 | def __init__(self, nb_folds, nb_packets_per_flow): 36 | super().__init__( 37 | nb_folds= nb_folds, 38 | nb_packets_per_flow = nb_packets_per_flow, 39 | filename_prefix = "ucdavis_quic", 40 | # processed_data_output_dir = "ucdavis_quic_output/", 41 | processed_data_output_dir = "ucdavis_quic_output_addendum/", 42 | data_dir = "data/ucdavis_quic_pretraining/" 43 | ) 44 | 45 | pools = [tuple(pool) for pool in [self.nb_packets_per_flow, range(self.nb_folds)]] 46 | result = [[]] 47 | for pool in pools: 48 | result = [x + [y] for x in result for y in pool] 49 | self.flow_ids = result 50 | 51 | pools = [tuple(pool) for pool in [range(self.nb_folds)]] 52 | result = [[]] 53 | for pool in pools: 54 | result = [x + [y] for x in result for y in pool] 55 | self.packet_ids = result 56 | 57 | def data_preparation(self): 58 | print("data_prepation") 59 | # limit = 100000 60 | 61 | start_time = time.time() 62 | traffic_type = 0 63 | subdirs = os.listdir(self.data_dir) 64 | dfs = [] 65 | dfs_memory_usage = 0 66 | df = pd.DataFrame() 67 | for d in subdirs: 68 | self.classes[traffic_type] = d 69 | # print(d) 70 | files = os.listdir(self.data_dir + d) 71 | # i = 0 72 | for filename in files: 73 | f = self.data_dir + d + "/" + filename 74 | # print(filename) 75 | file_df = pd.read_csv(f, 76 | delimiter = '\t', 77 | names = ['timestamp', 'time_delta', 'packet_size', 'direction'] 78 | ) 79 | file_df['type'] = traffic_type 80 | file_df['src'] = filename 81 | dfs.append(file_df) 82 | # short hack: there is a trade-off between memory usage and speed 83 | # as much as possible DataFrames are insert in the dfs numpy array 84 | # which is much faster than pandas.concat, but if the dfs array grows 85 | # too big the process will be killed by oom_killer on Linux, so 86 | # once memory is above 2GB concat what we have in dfs 87 | dfs_memory_usage += file_df.memory_usage(deep = True).sum() 88 | if dfs_memory_usage > 2 * 1024 * 1024 * 1024: 89 | df = pd.concat([df, *dfs]) 90 | del dfs 91 | dfs = [] 92 | dfs_memory_usage = 0 93 | 94 | # i += 1 95 | # if i >= limit: 96 | # break 97 | 98 | traffic_type += 1 99 | 100 | df = pd.concat([df, *dfs]) 101 | df = df.fillna(0) 102 | 103 | del dfs 104 | print(f" flows data loaded in {time.time() - start_time} seconds") 105 | filename = self.filename_prefix + ".pickle" 106 | self._generate_data_folds(df, filename) 107 | 108 | print(df.columns) 109 | print(df.describe()) 110 | print(df.info) 111 | print(df.shape) 112 | 113 | def __get_flow_df(self, flow_df, traffic_type): 114 | # filter by direction 115 | #file_df = file_df[file_df['direction'] == 1] 116 | _df = flow_df['packet_size'] 117 | packet_size = sum(_df) 118 | min_packet_size = np.min(_df) 119 | max_packet_size = np.max(_df) 120 | mean_packet_size = np.mean(_df) 121 | median_packet_size = np.median(_df) 122 | std_packet_size = np.std(_df) 123 | Q1_packet_size = np.quantile(_df, 0.25) 124 | Q3_packet_size = np.quantile(_df, 0.75) 125 | _a = list(_df) 126 | skew_packet_size = skew(_a) 127 | kurt_packet_size = kurtosis(_a) 128 | 129 | min_time_delta = 
np.min(flow_df[flow_df['time_delta'] > 0]['time_delta']) 130 | _df = flow_df['time_delta'] 131 | time_delta = sum(_df) 132 | max_time_delta = np.max(_df) 133 | mean_time_delta = np.mean(_df) 134 | median_time_delta = np.median(_df) 135 | std_time_delta = np.std(_df) 136 | Q1_iat = np.quantile(_df, 0.25) 137 | Q3_iat = np.quantile(_df, 0.75) 138 | _a = list(_df) 139 | skew_iat = skew(_a) 140 | kurt_iat = kurtosis(_a) 141 | data = { 142 | 'sum_iat': [time_delta], 143 | 'sum_length': [packet_size], 144 | 'min_length': [min_packet_size], 145 | 'max_length': [max_packet_size], 146 | 'mean_length': [mean_packet_size], 147 | 'median_length': [median_packet_size], 148 | 'std_length': [std_packet_size], 149 | '1stQ_length': [Q1_packet_size], 150 | '3stQ_length': [Q3_packet_size], 151 | 'skew_length': [skew_packet_size], 152 | 'kurt_length': [kurt_packet_size], 153 | 'min_iat': [min_time_delta], 154 | 'max_iat': [max_time_delta], 155 | 'mean_iat': [mean_time_delta], 156 | 'median_iat': [median_time_delta], 157 | 'std_iat': [std_time_delta], 158 | '1stQ_iat': [Q1_iat], 159 | '3stQ_iat': [Q3_iat], 160 | 'skew_iat': [skew_iat], 161 | 'kurt_iat': [kurt_iat], 162 | 'nb_packets': [len(flow_df)], 163 | 'type': [traffic_type], 164 | #'direction': [flow_df['direction']] 165 | } 166 | _df = pd.DataFrame(data = data) 167 | _df.fillna(0) 168 | return _df 169 | 170 | def packets2flows_nofold(self): 171 | print("packets2flows_nofold") 172 | traffic_type = 0 173 | subdirs = os.listdir(self.data_dir) 174 | dfs = {} 175 | flow_id = 0 176 | for n in self.nb_packets_per_flow: 177 | dfs[n] = [] 178 | for d in subdirs: 179 | self.classes[traffic_type] = d 180 | files = os.listdir(self.data_dir + d) 181 | for filename in files: 182 | f = self.data_dir + d + "/" + filename 183 | # print(filename) 184 | file_df = pd.read_csv(f, 185 | delimiter = '\t', 186 | names = ['timestamp', 'time_delta', 'packet_size', 'direction'] 187 | ) 188 | file_df['type'] = traffic_type 189 | file_df['src'] = filename 190 | file_df['flow_id'] = flow_id 191 | flow_id += 1 192 | 193 | dfs[n].append(file_df.head(n = n)) 194 | 195 | traffic_type += 1 196 | for n in self.nb_packets_per_flow: 197 | df = pd.concat(dfs[n]) 198 | self._pickle_dump(df, "for_signatures_" + str(n) + "_flows_" + self.pickle_filename_suffix) 199 | 200 | def _get_flows_with_all_packets(self): 201 | print("_get_flows_with_all_packets") 202 | traffic_type = 0 203 | subdirs = os.listdir(self.data_dir) 204 | _df = [] 205 | start_time = time.time() 206 | for d in subdirs: 207 | self.classes[traffic_type] = d 208 | files = os.listdir(self.data_dir + d) 209 | for filename in files: 210 | f = self.data_dir + d + "/" + filename 211 | file_df = pd.read_csv( 212 | f, 213 | delimiter = '\t', 214 | names = ['timestamp', 'time_delta', 'packet_size', 'direction'] 215 | ) 216 | file_df['type'] = traffic_type 217 | file_df['src'] = filename 218 | _flow_df = self.__get_flow_df(file_df, traffic_type) 219 | 220 | _df.append(_flow_df) 221 | # uncomment for debugging behavior with a single file 222 | # break 223 | traffic_type += 1 224 | print(" processing took ", time.time() - start_time, "seconds.") 225 | _df = pd.concat(_df) 226 | _df =_df.fillna(0) 227 | return _df 228 | 229 | def packets2flows(self): 230 | print("packets2flows") 231 | traffic_type = 0 232 | subdirs = sorted(os.listdir(self.data_dir)) 233 | dfs = {} 234 | for n in self.nb_packets_per_flow: 235 | dfs[n] = [] 236 | idx_d = 0 237 | for d in subdirs: 238 | start_time = time.time() 239 | print("Processing directory #%d/%d: %s" % 
(idx_d, len(subdirs), d)) 240 | idx_d += 1 241 | self.classes[traffic_type] = d 242 | files = sorted(os.listdir(self.data_dir + d)) 243 | for filename in files: 244 | f = self.data_dir + d + "/" + filename 245 | # print(filename) 246 | file_df = pd.read_csv(f, 247 | delimiter = '\t', 248 | names = ['timestamp', 'time_delta', 'packet_size', 'direction'] 249 | ) 250 | file_df['type'] = traffic_type 251 | file_df['src'] = filename 252 | for n in self.nb_packets_per_flow: 253 | _flow_df = self.__get_flow_df(file_df.head(n = n), traffic_type) 254 | 255 | dfs[n].append(_flow_df) 256 | print(" ", d, "processed in ", time.time() - start_time, "seconds.") 257 | 258 | traffic_type += 1 259 | 260 | for n in self.nb_packets_per_flow: 261 | df = pd.concat(dfs[n]) 262 | df = df.fillna(0) 263 | seed = 42 264 | filename = self.filename_prefix + "_" + str(n) + ".pickle" 265 | # filename = self.filename_prefix + "_flows_" + str(n) + ".pickle" 266 | self._generate_data_folds(df, filename) 267 | 268 | def load_flows_nofold(self): 269 | from sklearn.model_selection import StratifiedKFold 270 | skf = StratifiedKFold(n_splits = self.nb_folds, shuffle = True, random_state = self.random_seed) 271 | for n in [4]: 272 | df = self._load_pickle("for_signatures_" + str(n) + "_flows_" + self.pickle_filename_suffix) 273 | X = df.drop('type', axis = 1) 274 | y = df['type'] 275 | 276 | for _i, (train_index, test_index) in enumerate(skf.split(X, y)): 277 | i = _i, n 278 | self.X_train_flows[i] = X.iloc[train_index].fillna(0) 279 | self.y_train_flows[i] = y.iloc[train_index].fillna(0) 280 | self.X_test_flows[i] = X.iloc[test_index].fillna(0) 281 | self.y_test_flows[i] = y.iloc[test_index].fillna(0) 282 | 283 | def load_packets(self, suffix): 284 | print("load_packets", suffix) 285 | start_time = time.time() 286 | 287 | for fold in EncryptedTrafficClassifierIterator(self.packet_ids): 288 | name = str(fold) + "_X_train_" + suffix 289 | self.X_train_packets[fold] = self._load_pickle(name) 290 | 291 | name = str(fold) + "_y_train_" + suffix 292 | self.y_train_packets[fold] = self._load_pickle(name) 293 | 294 | name = str(fold) + "_X_test_" + suffix 295 | self.X_test_packets[fold] = self._load_pickle(name) 296 | 297 | name = str(fold) + "_y_test_" + suffix 298 | self.y_test_packets[fold] = self._load_pickle(name) 299 | print(f" packets data loaded in {time.time() - start_time} seconds") 300 | 301 | def LogReg_predict(X_train, y_train, X_test, y_test, ): 302 | lr = LogisticRegression(penalty='none', solver='newton-cg') 303 | lr.fit(X, y) 304 | metrics.plot_roc_curve(lr, X, y) 305 | plt.plot([0, 1], [0, 1], "-") 306 | plt.show() 307 | 308 | 309 | display(metrics.roc_auc_score(y, lr.predict_proba(X)[:, 1])) 310 | display(metrics.confusion_matrix(y, lr.predict_proba(X)[:, 1]>0.5)) 311 | 312 | print("train score = %f" % (lr.score(X_train, y_train))) 313 | print("test score = %f" % (lr.score(X_test, y_test))) 314 | 315 | ######################################## 316 | # Entry point 317 | ######################################## 318 | if __name__ == "__main__": 319 | parser = argparse.ArgumentParser( 320 | prog='ucdavis_quic_traffic_classifier', 321 | description='Classify packets or flows from UCDavis QUIC dataset', 322 | epilog='' 323 | ) 324 | parser.add_argument('-p', '--nb_packets', action = 'append', type = int, required = True) 325 | parser.add_argument('-c', '--classifier', action = 'append', type = str) 326 | parser.add_argument('-f', '--nb_folds', action = 'store', default = 12, type = int) 327 | parser.add_argument('-v', 
'--visualization', action = 'store_true', required = False, default = False) 328 | parser.add_argument('-r', '--report', action = 'store_true', required = False, default = False) 329 | parser.add_argument('-F', '--force_rf_classification', action = 'store_true', required = False, default = False) 330 | args = parser.parse_args(sys.argv[1:]) 331 | 332 | VISUALIZATION_ENABLED = False 333 | if args.visualization == True: 334 | VISUALIZATION_ENABLED = True 335 | 336 | RF_ENABLED = False 337 | GB_ENABLED = False 338 | XG_ENABLED = False 339 | 340 | for c in args.classifier: 341 | print(c) 342 | if c == "rf": 343 | RF_ENABLED = True 344 | elif c == "gb": 345 | GB_ENABLED = True 346 | elif c == "xg": 347 | XG_ENABLED = True 348 | else: 349 | print("Unknown classifier", c) 350 | 351 | classifier = UCDavisQuicClassifier( 352 | nb_folds = args.nb_folds, 353 | nb_packets_per_flow = args.nb_packets 354 | ) 355 | FORCE_RF_CLASSIFICATION = False 356 | if args.force_rf_classification == True: 357 | classifier.force_rf_classification = True 358 | 359 | classifier.all_classes = [ 360 | "Google Doc", 361 | "Google Drive", 362 | "Google Music", 363 | "Google Search", 364 | "Youtube", 365 | ] 366 | if REGENERATE_DATA_FOR_SIGNATURES: 367 | classifier.packets2flows_nofold() 368 | sys.exit(1) 369 | 370 | if REGENERATE_FLOWS_DATA: 371 | classifier.load_packets(classifier.pickle_filename_suffix) 372 | classifier.packets2flows() 373 | sys.exit(1) 374 | 375 | # data preparation, convert raw data to pickle file, split in StratifiedKFold X_train, y_train, X_test, y_test 376 | if not classifier.data_prepared(): 377 | # classifier.data_preparation() 378 | classifier.packets2flows() 379 | else: 380 | subdirs = os.listdir(classifier.data_dir) 381 | traffic_type = 0 382 | for d in subdirs: 383 | classifier.classes[traffic_type] = d 384 | traffic_type += 1 385 | 386 | non_needed_features = [ 387 | 'timestamp', 388 | 'direction', 389 | # "nb_packets" 390 | ] 391 | # non_needed_features += [ 392 | # 'min_length', 'max_length', 393 | # 'mean_length', 'std_length', 394 | # '1stQ_length', 395 | # '3stQ_length', 396 | # 'skew_length', 397 | # 'kurt_length', 398 | # 'min_iat', 399 | # 'max_iat', 'mean_iat', 'std_iat', 400 | # '1stQ_iat', 401 | # '3stQ_iat', 402 | # 'skew_iat', 403 | # 'kurt_iat', 404 | # ] 405 | 406 | if TEST_PACKETS: 407 | classifier.load_packets(classifier.pickle_filename_suffix) 408 | classifier.cleanup_data(classifier.X_train_packets, 409 | classifier.y_train_packets, 410 | classifier.X_test_packets, 411 | classifier.y_test_packets, 412 | classifier.packet_ids, 413 | non_needed_features) 414 | 415 | if TEST_FLOWS: 416 | classifier.load_flows() 417 | _c = classifier.y_train_flows[(classifier.nb_packets_per_flow[0], 0)].unique() 418 | classifier.classes = [] 419 | for i in _c: 420 | classifier.classes.append(classifier.all_classes[i]) 421 | classifier.cleanup_data(classifier.X_train_flows, 422 | classifier.y_train_flows, 423 | classifier.X_test_flows, 424 | classifier.y_test_flows, 425 | classifier.flow_ids, 426 | non_needed_features) 427 | 428 | # __correlation() 429 | if args.report == True: 430 | classifier._viz(distribution = 0, class_distribution = -1, nb_packets = -1, min_iat = -1, max_iat = -1) 431 | for n in classifier.nb_packets_per_flow: 432 | if n == 4: 433 | classifier._viz(distribution = -1, class_distribution = 11, nb_packets = 0, min_iat = -1, max_iat = -1) 434 | elif n == 8: 435 | classifier._viz(distribution = -1, class_distribution = 11, nb_packets = 0, min_iat = -1, max_iat = -1) 436 | elif n 
== 600000: 437 | classifier._viz(distribution = -1, class_distribution = 0, nb_packets = 0, min_iat = 1, max_iat = -1) 438 | sys.exit(1) 439 | if VISUALIZATION_ENABLED: 440 | pkt = classifier.nb_packets_per_flow[0] 441 | fold = 0 442 | _i = pkt, fold 443 | _df1 = classifier.X_train_flows[_i].copy() 444 | _df1['type'] = classifier.y_train_flows[_i] 445 | _df2 = classifier.X_test_flows[_i].copy() 446 | _df2['type'] = classifier.y_test_flows[_i] 447 | _df = pd.concat([_df1, _df2]) 448 | _df.reset_index() 449 | print(_df.shape) 450 | print(_df['type'].value_counts()) 451 | classifier._distribution(_df, classifier.filename_prefix + "_flows_class_split_" + str(pkt) + '_pkt') 452 | # classifier._class_distribution(_df, classifier.filename_prefix + '_flows_distribution_' + str(pkt) + '_pkt') 453 | # classifier._nb_packets_distribution(_df, classifier.filename_prefix + "_flows_nb_packets_distribution_" + str(pkt) + '_pkt') 454 | classifier._min_iat_distribution(_df, classifier.filename_prefix + "_flows_min_iat_distribution_" + str(pkt) + '_pkt') 455 | 456 | 457 | all_features_flows = ['sum_iat', 'sum_length', 'min_length', 'max_length', 458 | 'mean_length', 'median_length', 'std_length', 459 | '1stQ_length', 460 | '3stQ_length', 461 | 'skew_length', 462 | 'kurt_length', 463 | 'min_iat', 464 | 'max_iat', 'mean_iat', 'median_iat', 'std_iat', 465 | '1stQ_iat', 466 | '3stQ_iat', 467 | 'skew_iat', 468 | 'kurt_iat', 469 | 'nb_packets' 470 | ] 471 | 472 | basic_features_flows = ['sum_iat', 'sum_length', 'min_length', 'max_length', 473 | 'mean_length', 'std_length', 474 | 'min_iat', 475 | 'max_iat', 'mean_iat', 'std_iat', 'nb_packets' 476 | ] 477 | 478 | all_features_packets = ['sum_iat', 'sum_length'] 479 | # feats_flows = all_features_flows 480 | feats_flows = basic_features_flows 481 | feats_packets = all_features_packets 482 | 483 | # classification based on flows 484 | if TEST_FLOWS: 485 | # classifier.X_train_flows_fitted, classifier.X_test_flows_fitted = classifier.preprocessing( 486 | # classifier.X_train_flows, 487 | # classifier.y_train_flows, 488 | # classifier.X_test_flows, 489 | # classifier.y_test_flows, 490 | # classifier.flow_ids, 491 | # feats_flows 492 | # ) 493 | classifier.X_train_flows_fitted = classifier.X_train_flows 494 | classifier.X_test_flows_fitted = classifier.X_test_flows 495 | if RF_ENABLED: 496 | rf_regr_flows, rf_y_train_flows_predicted, rf_y_test_flows_predicted, rf_y_test_flows_isolated_predicted = classifier.RF_predict( 497 | classifier.X_train_flows_fitted, 498 | classifier.y_train_flows, 499 | classifier.X_test_flows_fitted, 500 | classifier.y_test_flows 501 | ) 502 | rf_cm_dict_flows, output = classifier.confusion_matrix(rf_regr_flows, 503 | classifier.y_test_flows, 504 | rf_y_test_flows_predicted, 505 | classifier.flow_ids, 506 | "rf" 507 | ) 508 | print(output) 509 | rf_f1_scores_flows, output = classifier.get_F1_score(rf_cm_dict_flows, 510 | classifier.y_test_flows, 511 | rf_y_test_flows_predicted, 512 | classifier.flow_ids, 513 | "rf_flows") 514 | print(output) 515 | avg_scores, output = classifier.avg_f1_scores(rf_f1_scores_flows, classifier.flow_ids) 516 | print(output) 517 | 518 | ###### 519 | 520 | cm_dict = {} 521 | cm_dict_normalized = {} 522 | rf_F1 = {} 523 | skl_F1 = {} 524 | 525 | print("== isolated ==\n") 526 | from sklearn.metrics import f1_score, confusion_matrix 527 | print("classifier.y_test_isolated_flows =", classifier.y_test_isolated_flows.shape) 528 | print("rf_y_test_flows_isolated_predicted =", rf_y_test_flows_isolated_predicted) 529 | for i 
in EncryptedTrafficClassifierIterator(classifier.flow_ids): 530 | output = ("== %s ==\n" % str(i)) 531 | cm_dict = confusion_matrix(classifier.y_test_isolated_flows, rf_y_test_flows_isolated_predicted[i]) 532 | output += str(cm_dict) + '\n' 533 | cm_dict_normalized = confusion_matrix(classifier.y_test_isolated_flows, rf_y_test_flows_isolated_predicted[i], normalize = 'true') 534 | output += str(cm_dict_normalized) + '\n' 535 | print(output) 536 | 537 | output = "" 538 | 539 | cm = cm_dict 540 | FP = cm.sum(axis=0) - np.diag(cm) 541 | FN = cm.sum(axis=1) - np.diag(cm) 542 | TP = np.diag(cm) 543 | TN = cm.sum() - (FP + FN + TP) 544 | rf_F1[i] = 2 * (TP) / (2 * TP + FP + FN) * 100 545 | output += ("FP = %s\n" % str(FP)) 546 | output += ("FN = %s\n" % str(FN)) 547 | output += ("TP = %s\n" % str(TP)) 548 | output += ("TN = %s\n" % str(TN)) 549 | if len(classifier.y_test_isolated_flows) > 0: 550 | skl_F1 = f1_score(classifier.y_test_isolated_flows, rf_y_test_flows_isolated_predicted[i], average = 'micro') 551 | output += ("skl_F1 = %s\n" % str(skl_F1)) 552 | output += "\n" 553 | for j in range(len(classifier.classes)): 554 | t = classifier.classes[j] 555 | try: 556 | output += ("for type %s \t\t F1 = %.2f\n" % (t, rf_F1[j])) 557 | except IndexError as e: 558 | pass 559 | except KeyError as e: 560 | pass 561 | output += "\n" 562 | print(output) 563 | 564 | output ="" 565 | f1 = {} 566 | for i in EncryptedTrafficClassifierIterator(classifier.flow_ids): 567 | pkt, _ = i 568 | f1[pkt] = [0 for _ in range(len(classifier.classes))] 569 | for i in EncryptedTrafficClassifierIterator(classifier.flow_ids): 570 | pkt, _ = i 571 | for j in range(len(classifier.classes)): 572 | try: 573 | f1[pkt][j] += rf_F1[i][j] 574 | except KeyError as e: 575 | continue 576 | except IndexError as e: 577 | continue 578 | 579 | avg_scores = {} 580 | output = "" 581 | for pkt in classifier.nb_packets_per_flow: 582 | output += f"for {pkt} packets\n" 583 | for j in range(len(classifier.classes)): 584 | t = classifier.classes[j] 585 | avg_scores[(pkt, t)] = f1[pkt][j] / classifier.nb_folds 586 | output += "average for type %s [%d] \t\t F1 = %.2f\n" % (t, j, avg_scores[(pkt, t)]) 587 | output += "\n" 588 | print(output) 589 | #### 590 | 591 | if XG_ENABLED: 592 | print("==== XGBoost =====") 593 | xg_regr, xg_y_train_predicted, xg_y_test_flows_predicted = classifier.XGBoost_predict( 594 | classifier.X_train_flows_fitted, 595 | classifier.y_train_flows, 596 | classifier.X_test_flows_fitted, 597 | classifier.y_test_flows 598 | ) 599 | 600 | # feats_flows, classification_results) 601 | # xg_cm_dict = classifier.confusion_matrix(xg_regr, xg_y_test_predicted, False) 602 | xg_cm_dict_flows, output = classifier.confusion_matrix(xg_regr, 603 | classifier.y_test_flows, 604 | xg_y_test_flows_predicted, 605 | classifier.flow_ids, 606 | "xg" 607 | ) 608 | print(output) 609 | 610 | xg_f1_scores_flows, output = classifier.get_F1_score( 611 | xg_cm_dict_flows, 612 | classifier.y_test_flows, 613 | xg_y_test_flows_predicted, 614 | classifier.flow_ids, 615 | "xg_flows") 616 | print(output) 617 | avg_scores, output = classifier.avg_f1_scores(xg_f1_scores_flows, classifier.flow_ids) 618 | print(output) 619 | # classification based on Packets 620 | if TEST_PACKETS: 621 | classifier.X_train_packets_fitted, classifier.X_test_packets_fitted = classifier.preprocessing( 622 | classifier.X_train_packets, 623 | classifier.y_train_packets, 624 | classifier.X_test_packets, 625 | classifier.y_test_packets, 626 | classifier.packet_ids, 627 | feats_packets 628 | 
) 629 | if RF_ENABLED: 630 | rf_regr_packets, rf_y_train_packets_predicted, rf_y_test_packets_predicted = classifier.RF_predict( 631 | classifier.X_train_packets_fitted, 632 | classifier.y_train_packets, 633 | classifier.X_test_packets_fitted, 634 | classifier.y_test_packets, 635 | classifier.packet_ids, 636 | ) 637 | for i in EncryptedTrafficClassifierIterator(classifier.packet_ids): 638 | print("Feature ranking:") 639 | importances = rf_regr_flows[i].best_estimator_.named_steps["rf"].feature_importances_ 640 | std = np.std([tree.feature_importances_ for tree in rf_regr_flows[i].best_estimator_.named_steps["rf"].estimators_], 641 | axis=0) 642 | indices = np.argsort(importances)[::-1] 643 | for f in range(classifier.X_train_flows[i].shape[1]): 644 | print("%d. feature %s (%f)" % (f + 1, classifier.X_train_flows[i].columns[indices[f]], importances[indices[f]])) 645 | # __show_actual_and_predicted(X_test, y_test, rf_y_test_predicted, 1) 646 | # __analyze_CHAT(X_train, y_train, rf_y_train_predicted) 647 | # __analyze_CHAT(X_test, y_test, rf_y_test_predicted) 648 | rf_cm_dict, output = classifier.confusion_matrix(rf_regr_packets, 649 | classifier.y_test_packets, 650 | rf_y_test_packets_predicted, 651 | classifier.packet_ids) 652 | print(output) 653 | rf_f1_scores, output = classifier.get_F1_score(rf_cm_dict, 654 | classifier.y_test_packets, 655 | rf_y_test_packets_predicted, 656 | classifier.packet_ids, 657 | "rf_packets") 658 | print(output) 659 | classifier.avg_f1_scores(rf_f1_scores, classifier.packet_ids) 660 | else: 661 | print("CLASSIFICATION BASED ON PACKETS NOT ENABLED") 662 | 663 | print(classifier.classification_results) 664 | if RF_ENABLED or GB_ENABLED or XG_ENABLED: 665 | classifier.save_results() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Encrypted Traffic Classification at Line Rate in Programmable Switches with Machine Learning 2 | 3 | This repository contains the source code for our work on Encrypted Traffic Classification (ETC) in programmable switches with P4 and Machine Learning, appearing in the Proceedings of IEEE/IFIP NOMS 2024, 6–10 May 2024, Seoul, South Korea. 4 | 5 | ## Overview of the ETC framework 6 | ETC Overview 7 | 8 | This work leverages recent advances in data plane programmability to achieve real-time ETC in programmable switches at line rate, with high throughput and low latency. The proposed solution comprises (i) an ETC-aware Random Forest (RF) modelling process where only features based on packet size and packet arrival times are used, and (ii) an encoding of the trained RF model into production-grade P4-programmable switches. 9 | 10 | For full details, please consult [our paper](https://dspace.networks.imdea.org/bitstream/handle/20.500.12761/1791/etc_noms24_postprint.pdf?sequence=1&isAllowed=y). 11 | 12 | An extended version is currently in submission as an invited paper to a journal. 13 | 14 | ## Organization of the repository 15 | There are two folders: 16 | 17 | - _In_switch_ETC_ : the python and P4 code for the training and encoding of the in-switch RF models for RF. 18 | - _Offline_ETC_ : the python code for the offline data analysis and ETC modelling process. 19 | 20 | ## Use cases 21 | The use cases considered in the paper are: 22 | - QUIC traffic classification based on the publicly available Netflow QUIC dataset. The challenge is classifying traffic into one of 5 classes. 
23 | - Encrypted instant messaging application fingerprinting with 6 classes, based on the Encrypted Instant Messaging Dataset made available by the NIMS Lab. 24 | - VPN traffic classification, distinguishing 7 classes, based on the ISCX-VPN-NonVPN-2016 Dataset. 25 | 26 | We provide the Python and P4 code for the Encrypted Instant Messaging App classification use case with 6 classes.
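For reference, the offline classifier for this use case can be launched from the _Offline_ETC_ folder as in the example below (a sample invocation only: the `-p`, `-c` and `-f` flags come from the script's argparse definition, and the raw per-packet CSV files are expected under `data/noms2023_im/`):
```
python3 noms2023_instant_messaging_traffic_classifier.py -p 4 -p 8 -c rf -f 12
```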
The same approach for feature/model selection and encoding to P4 applies to all the use cases. 27 | 28 | ## Citation 29 | If you make use of this code, kindly cite our paper: 30 | ``` 31 | @inproceedings{etc-noms-2024, 32 | author={Akem, Aristide Tanyi-Jong and Fraysse, Guillaume and Fiore, Marco}, 33 | booktitle={NOMS 2024-2024 IEEE Network Operations and Management Symposium}, 34 | title={Encrypted Traffic Classification at Line Rate in Programmable Switches with Machine Learning}, 35 | year={2024}, 36 | volume={}, 37 | number={}, 38 | pages={1-9}, 39 | doi={10.1109/NOMS59830.2024.10575394}} 40 | 41 | ``` 42 | 43 | If you need any additional information, send us an email at _aristide.akem_ at _imdea.org_. 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /etc_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nds-group/ETC_NOMS_2024/1ad4bbd4e1c097e87625b185e6d0ba14fa201bbb/etc_framework.png --------------------------------------------------------------------------------