├── In_switch_ETC ├── Offline_model_preparation │ ├── NIMS_IMA_sample_data.csv │ ├── Nims2023_Data_Analysis.ipynb │ ├── generate_table_entries.py │ ├── noms2024_20_5.pkl │ ├── pre-processing │ │ ├── extract_flows.sh │ │ ├── extract_flows_from_txt.py │ │ ├── extract_pkts.sh │ │ └── readme.md │ ├── readme.md │ └── test_data_nimsIMA.csv ├── README.md └── Switch │ ├── controller_digest_noms.py │ ├── noms_20_5_4.p4 │ ├── readme.md │ └── table_entries.py ├── Offline_ETC ├── README.md ├── __init__.py ├── cstnet-tls13_traffic_classifier.py ├── data_preparation │ ├── pcap2csv.sh │ └── pkts2flows.py ├── encrypted_traffic_classification.py ├── iscxvpn2016-vpn-classifier.py ├── netflow_quic_traffic_classifier.py ├── noms2023_ima_only_traffic_classifier.py ├── noms2023_instant_messaging_traffic_classifier.py └── ucdavis_quic_classifier.py ├── README.md └── etc_framework.png /In_switch_ETC/Offline_model_preparation/generate_table_entries.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pickle as pickle 4 | import numpy as np 5 | import pandas as pd 6 | pd.options.mode.chained_assignment = None # default='warn' 7 | from sklearn import tree 8 | import re 9 | from netaddr import IPAddress 10 | from statistics import mode 11 | import random 12 | import ipaddress 13 | 14 | np.random.seed(42) 15 | 16 | ## import and get entries from trained models ## 17 | clf = pd.read_pickle('noms2024_20_5.pkl') 18 | 19 | ## list the feature names 20 | feature_names = ['Flow IAT Min', 'Max Packet Length', 'Flow IAT Max', 'Packet Length Total'] 21 | 22 | print(feature_names) 23 | 24 | ## definition of useful functions 25 | ## gets all splits and conditions 26 | def get_splits(forest, feature_names): 27 | data = [] 28 | #generate dataframe with all thresholds and features 29 | for t in range(len(forest.estimators_)): 30 | clf = forest[t] 31 | n_nodes = clf.tree_.node_count 32 | features = [feature_names[i] for i in clf.tree_.feature] 33 | for i in range(0, n_nodes): 34 | node_id = i 35 | left_child_id = clf.tree_.children_left[i] 36 | right_child_id = clf.tree_.children_right[i] 37 | threshold = clf.tree_.threshold[i] 38 | feature = features[i] 39 | if threshold != -2.0: 40 | data.append([t, node_id, left_child_id, 41 | right_child_id, threshold, feature]) 42 | data = pd.DataFrame(data) 43 | data.columns = ["Tree","NodeID","LeftID","RightID","Threshold","Feature"] 44 | return data 45 | 46 | ## gets the feature table of each feature from the splits 47 | def get_feature_table(splits_data, feature_name): 48 | feature_data = splits_data[splits_data["Feature"]==feature_name] 49 | feature_data = feature_data.sort_values(by="Threshold") 50 | feature_data = feature_data.reset_index(drop=True) 51 | ## 52 | # feature_data["Threshold"] = (feature_data["Threshold"]).astype(int) 53 | feature_data["Threshold"] = feature_data["Threshold"].astype(int) 54 | ## 55 | code_table = pd.DataFrame() 56 | code_table["Threshold"] = feature_data["Threshold"] 57 | #print(feature_data) 58 | #create a column for each split in each tree 59 | for tree_id, node in zip(list(feature_data["Tree"]), list(feature_data["NodeID"])): 60 | colname = "s"+str(tree_id)+"_"+str(node) 61 | code_table[colname] = np.where((code_table["Threshold"] <= 62 | feature_data[(feature_data["NodeID"]== node) & 63 | (feature_data["Tree"]==tree_id)]["Threshold"].values[0]), 0, 1) 64 | #add a row to represent the values above the largest threshold 65 | temp = [max(code_table["Threshold"])+1] 66 | 
temp.extend(list([1]*(len(code_table.columns)-1))) 67 | code_table.loc[len(code_table)] = temp 68 | code_table = code_table.drop_duplicates(subset=['Threshold']) 69 | code_table = code_table.reset_index(drop=True) 70 | return code_table 71 | 72 | ## get feature tables with ranges and codes only 73 | def get_feature_codes_with_ranges(feature_table, num_of_trees): 74 | Codes = pd.DataFrame() 75 | for tree_id in range(num_of_trees): 76 | colname = "code"+str(tree_id) 77 | Codes[colname] = feature_table[feature_table[[col for col in feature_table.columns if ('s'+str(tree_id)+'_') in col]].columns[0:]].apply(lambda x: ''.join(x.dropna().astype(str)),axis=1) 78 | Codes[colname] = ["0b" + x for x in Codes[colname]] 79 | feature_table["Range"] = [0]*len(feature_table) 80 | feature_table["Range"].loc[0] = "0,"+str(feature_table["Threshold"].loc[0]) 81 | for i in range(1, len(feature_table)): 82 | if (i==(len(feature_table))-1): 83 | feature_table["Range"].loc[i] = str(feature_table["Threshold"].loc[i])+","+str(feature_table["Threshold"].loc[i]) 84 | else: 85 | feature_table["Range"].loc[i] = str(feature_table["Threshold"].loc[i-1]+1) + ","+str(feature_table["Threshold"].loc[i]) 86 | Ranges = feature_table["Range"] 87 | return Ranges, Codes 88 | 89 | ## get list of splits crossed to get to leaves 90 | def retrieve_branches(estimator): 91 | number_nodes = estimator.tree_.node_count 92 | children_left_list = estimator.tree_.children_left 93 | children_right_list = estimator.tree_.children_right 94 | feature = estimator.tree_.feature 95 | threshold = estimator.tree_.threshold 96 | # Calculate if a node is a leaf 97 | is_leaves_list = [(False if cl != cr else True) for cl, cr in zip(children_left_list, children_right_list)] 98 | # Store the branches paths 99 | paths = [] 100 | for i in range(number_nodes): 101 | if is_leaves_list[i]: 102 | # Search leaf node in previous paths 103 | end_node = [path[-1] for path in paths] 104 | # If it is a leave node yield the path 105 | if i in end_node: 106 | output = paths.pop(np.argwhere(i == np.array(end_node))[0][0]) 107 | yield output 108 | else: 109 | # Origin and end nodes 110 | origin, end_l, end_r = i, children_left_list[i], children_right_list[i] 111 | # Iterate over previous paths to add nodes 112 | for index, path in enumerate(paths): 113 | if origin == path[-1]: 114 | paths[index] = path + [end_l] 115 | paths.append(path + [end_r]) 116 | # Initialize path in first iteration 117 | if i == 0: 118 | paths.append([i, children_left_list[i]]) 119 | paths.append([i, children_right_list[i]]) 120 | 121 | ## get classes and certainties 122 | def get_classes(clf): 123 | leaves = [] 124 | classes = [] 125 | certainties = [] 126 | for branch in list(retrieve_branches(clf)): 127 | leaves.append(branch[-1]) 128 | for leaf in leaves: 129 | if clf.tree_.n_outputs == 1: 130 | value = clf.tree_.value[leaf][0] 131 | else: 132 | value = clf.tree_.value[leaf].T[0] 133 | class_name = np.argmax(value) 134 | certainty = int(round(max(value)/sum(value),2)*100) 135 | classes.append(class_name) 136 | certainties.append(certainty) 137 | return classes, certainties 138 | 139 | ## get the codes corresponging to the branches followed 140 | def get_leaf_paths(clf): 141 | depth = clf.max_depth 142 | branch_codes = [] 143 | for branch in list(retrieve_branches(clf)): 144 | code = [0]*len(branch) 145 | for i in range(1, len(branch)): 146 | if (branch[i]==clf.tree_.children_left[branch[i-1]]): 147 | code[i] = 0 148 | elif (branch[i]==clf.tree_.children_right[branch[i-1]]): 149 | code[i] = 1 
150 | branch_codes.append(list(code[1:])) 151 | return branch_codes 152 | 153 | ## get the order of the splits to enable code generation 154 | def get_order_of_splits(data, feature_names): 155 | splits_order = [] 156 | for feature_name in feature_names: 157 | feature_data = data[data.iloc[:,4]==feature_name] 158 | feature_data = feature_data.sort_values(by="Threshold") 159 | for node in list(feature_data.iloc[:,0]): 160 | splits_order.append(node) 161 | return splits_order 162 | 163 | def get_splits_per_tree(clf, feature_names): 164 | data = [] 165 | n_nodes = clf.tree_.node_count 166 | #set feature names 167 | features = [feature_names[i] for i in clf.tree_.feature] 168 | #generate dataframe with all thresholds and features 169 | for i in range(0,n_nodes): 170 | node_id = i 171 | left_child_id = clf.tree_.children_left[i] 172 | right_child_id = clf.tree_.children_right[i] 173 | threshold = clf.tree_.threshold[i] 174 | feature = features[i] 175 | if threshold != -2.0: 176 | data.append([node_id, left_child_id, 177 | right_child_id, threshold, feature]) 178 | data = pd.DataFrame(data) 179 | data.columns = ["NodeID","LeftID","RightID","Threshold","Feature"] 180 | return data 181 | 182 | ## Get codes and masks 183 | def get_codes_and_masks(clf, feature_names): 184 | splits = get_order_of_splits(get_splits_per_tree(clf, feature_names), feature_names) 185 | depth = clf.max_depth 186 | codes = [] 187 | masks = [] 188 | for branch, coded in zip(list(retrieve_branches(clf)), get_leaf_paths(clf)): 189 | code = [0]*len(splits) 190 | mask = [0]*len(splits) 191 | for index, split in enumerate(splits): 192 | if split in branch: 193 | mask[index] = 1 194 | masks.append(mask) 195 | codes.append(code) 196 | masks = pd.DataFrame(masks) 197 | masks['Mask'] = masks[masks.columns[0:]].apply(lambda x: ''.join(x.dropna().astype(str)),axis=1) 198 | masks = ["0b" + x for x in masks['Mask']] 199 | indices = range(0,len(splits)) 200 | temp = pd.DataFrame(columns=["split", "index"],dtype=object) 201 | temp["split"] = splits 202 | temp["index"] = indices 203 | final_codes = [] 204 | for branch, code, coded in zip(list(retrieve_branches(clf)), codes, get_leaf_paths(clf)): 205 | indices_to_use = temp[temp["split"].isin(branch)].sort_values(by="split")["index"] 206 | for i, j in zip(range(0,len(coded)), list(indices_to_use)): 207 | code[j] = coded[i] 208 | final_codes.append(code) 209 | final_codes = pd.DataFrame(final_codes) 210 | final_codes["Code"] = final_codes[final_codes.columns[0:]].apply(lambda x: ''.join(x.dropna().astype(str)),axis=1) 211 | final_codes = ["0b" + x for x in final_codes["Code"]] 212 | return final_codes, masks 213 | ## End of model manipulation ## 214 | 215 | 216 | # Get table entries and generate file with table entries 217 | with open("table_entries.py", "w") as entries_file: 218 | 219 | print("from netaddr import IPAddress\n", file=entries_file) 220 | 221 | print("p4 = bfrt.noms_20_5_4.pipe\n", file=entries_file) 222 | 223 | clear_tables = """ 224 | def clear_all(verbose=True, batching=True): 225 | global p4 226 | global bfrt 227 | for table_types in (['MATCH_DIRECT', 'MATCH_INDIRECT_SELECTOR'], 228 | ['SELECTOR'], 229 | ['ACTION_PROFILE']): 230 | for table in p4.info(return_info=True, print_info=False): 231 | if table['type'] in table_types: 232 | if verbose: 233 | print("Clearing table {:<40} ... ". 
234 | format(table['full_name']), end='', flush=True) 235 | table['node'].clear(batch=batching) 236 | if verbose: 237 | print('Done') 238 | """ 239 | 240 | port_setup = """ 241 | # This script configures QSFP ports automatically on the TOFINO Switch 242 | # Adapted from ICA-1131 Intel Connectivity Academy Course 243 | for qsfp_cage in [1, 5]: 244 | for lane in range(0, 1): 245 | dp = bfrt.port.port_hdl_info.get(CONN_ID = qsfp_cage, CHNL_ID = lane, print_ents = False).data[b'$DEV_PORT'] 246 | bfrt.port.port.add(DEV_PORT= dp, SPEED = "BF_SPEED_100G", FEC = "BF_FEC_TYP_NONE", AUTO_NEGOTIATION = "PM_AN_FORCE_DISABLE", PORT_ENABLE = True) 247 | """ 248 | print(port_setup, file=entries_file) 249 | 250 | print(clear_tables, file=entries_file) 251 | 252 | print("clear_all(verbose=True)\n", file=entries_file) 253 | print("voting_table = p4.Ingress.voting_table", file=entries_file) 254 | print("target_flows_table = p4.Ingress.target_flows_table", file=entries_file) 255 | 256 | for num_feat in range(len(feature_names)): 257 | print("table_feature"+str(num_feat)+" = p4.Ingress.table_feature"+str(num_feat), file=entries_file) 258 | print('', file=entries_file) 259 | 260 | for num_tree in range(len(clf.estimators_)): 261 | print("code_table"+str(num_tree)+" = p4.Ingress.code_table"+str(num_tree), file=entries_file) 262 | print('', file=entries_file) 263 | 264 | # Get entries for feature tables 265 | tree_code0 = [] 266 | tree_code1 = [] 267 | tree_code2 = [] 268 | tree_code3 = [] 269 | tree_code4 = [] 270 | 271 | for fea in range(0,len(feature_names)): 272 | Ranges, Codes = get_feature_codes_with_ranges(get_feature_table(get_splits(clf, feature_names), feature_names[fea]), len(clf.estimators_)) 273 | for ran, cods0, cods1, cods2, cods3, cods4 in zip(Ranges, Codes.iloc[:,0], Codes.iloc[:,1], Codes.iloc[:,2], Codes.iloc[:,3], Codes.iloc[:,4]): 274 | if(ran == Ranges[len(Ranges)-1]): 275 | print("table_feature"+str(fea)+".add_with_SetCode"+str(fea)+"(feature"+str(fea)+"_start="+str(ran.split(",")[0])+ \ 276 | ", feature"+str(fea)+"_end="+str(65535)+", code0="+str(cods0) + ", code1=" + str(cods1) + ", code2=" + str(cods2) + \ 277 | ", code3=" + str(cods3) + ", code4=" + str(cods4) + ")", file = entries_file) 278 | # change 65535 to the maximum value of the feature in cases where the feature is not 16 bits 279 | else: 280 | print("table_feature"+str(fea)+".add_with_SetCode"+str(fea)+"(feature"+str(fea)+"_start="+str(ran.split(",")[0])+ \ 281 | ", feature"+str(fea)+"_end="+str(ran.split(",")[1])+", code0="+str(cods0)+", code1="+str(cods1)+", code2=" +str(cods2) + \ 282 | ", code3=" + str(cods3) + ", code4=" + str(cods4) + ")", file = entries_file) 283 | # change 65535 to the maximum value of the feature in cases where the feature is not 16 bits 284 | 285 | tree_code0.append(len(cods0)-2) 286 | tree_code1.append(len(cods1)-2) 287 | tree_code2.append(len(cods2)-2) 288 | tree_code3.append(len(cods3)-2) 289 | tree_code4.append(len(cods4)-2) 290 | 291 | print('', file=entries_file) 292 | 293 | tree_code_sizes = [tree_code0, tree_code1, tree_code2, tree_code3, tree_code4] 294 | 295 | print(tree_code_sizes) 296 | 297 | print('print("******************* ENTERED FEATURE TABLE RULES *****************")\n', file=entries_file) 298 | 299 | for tree_id in range(0, len(clf.estimators_)): 300 | Final_Codes, Final_Masks = get_codes_and_masks(clf.estimators_[tree_id], feature_names) 301 | Classe, Certain = get_classes(clf.estimators_[tree_id]) 302 | for cod, mas, cla, cer in zip(Final_Codes, Final_Masks, Classe, Certain): 303 
| print("code_table"+str(tree_id)+".add_with_SetClass"+str(tree_id)+"(codeword"+str(tree_id)+"=", cod, ", codeword"+str(tree_id)+"_mask=", mas, ", classe=",cla+1,")", file=entries_file) 304 | print('', file=entries_file) 305 | 306 | # Get voting table entries 307 | for i in range(1, 7): 308 | for j in range(1, 7): 309 | for k in range(1, 7): 310 | for l in range(1, 7): 311 | for m in range(1, 7): 312 | try: 313 | choices = [i, j, k, l, m] 314 | mode_number = mode(choices) 315 | print("voting_table.add_with_set_final_class(" + "class0=" + str(i) + ", class1=" + str(j) + \ 316 | ", class2=" + str(k) + ", class3=" + str(l) + ", class4=" + str(m) + \ 317 | ", class_result=" + str(mode_number) + ")", file=entries_file) 318 | except: 319 | pass 320 | 321 | print(" ", file=entries_file) 322 | 323 | # Forwarding: 0 Inference: 1 324 | flow_id_info = pd.read_csv("test_data_nimsIMA.csv",usecols=['Flow ID','Label']) 325 | flow_id_info = flow_id_info.drop_duplicates(subset=['Flow ID']) 326 | for index, flow in flow_id_info.iterrows(): 327 | flow_id = flow['Flow ID'] 328 | id_values = flow_id.split(" ") 329 | # With all tuple elements 330 | try: 331 | print("target_flows_table.add_with_set_flow_class("+"src_addr="+str(int(ipaddress.ip_address(id_values[0])))+ \ 332 | ", dst_addr="+str(int(ipaddress.ip_address(id_values[1])))+ \ 333 | ", hdr_srcport="+str(id_values[2])+ \ 334 | ", hdr_dstport="+str(id_values[3])+ \ 335 | ", protocol="+str(id_values[4])+ \ 336 | ", f_class="+str(0)+")", file=entries_file) 337 | except: 338 | continue 339 | 340 | print("bfrt.complete_operations()", file=entries_file) 341 | 342 | print("** TABLE ENTRIES GENERATED AND STORED IN DESIGNATED FILE **") 343 | -------------------------------------------------------------------------------- /In_switch_ETC/Offline_model_preparation/noms2024_20_5.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nds-group/ETC_NOMS_2024/1ad4bbd4e1c097e87625b185e6d0ba14fa201bbb/In_switch_ETC/Offline_model_preparation/noms2024_20_5.pkl -------------------------------------------------------------------------------- /In_switch_ETC/Offline_model_preparation/pre-processing/extract_flows.sh: -------------------------------------------------------------------------------- 1 | for f in ./txt_files/*.txt 2 | do 3 | echo $f 4 | python3 extract_flows_from_txt.py $f $f.csv 8 5 | done 6 | -------------------------------------------------------------------------------- /In_switch_ETC/Offline_model_preparation/pre-processing/extract_flows_from_txt.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import sys 4 | 5 | filename_in = sys.argv[1] 6 | filename_out = sys.argv[2] 7 | npkts = int(sys.argv[3]) 8 | 9 | packet_data = pd.DataFrame() 10 | 11 | packet_data = pd.read_csv(filename_in, sep = '|', header=None) 12 | 13 | packet_data.columns = ['timestamp', 'ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport', 'ip.proto', 'ip.len','udp.srcport', 'udp.dstport'] 14 | 15 | packet_data = packet_data[(packet_data["ip.proto"] != "1,17") & (packet_data["ip.proto"] != "1,6")].reset_index(drop=True) 16 | packet_data = packet_data.dropna(subset=['ip.proto']) 17 | packet_data["ip.src"] = packet_data["ip.src"].astype(str) 18 | packet_data["ip.dst"] = packet_data["ip.dst"].astype(str) 19 | packet_data["ip.len"] = packet_data["ip.len"].astype("int") 20 | ## 21 | packet_data["tcp.srcport"] = packet_data["tcp.srcport"] 22 | 
packet_data["tcp.dstport"] = packet_data["tcp.dstport"] 23 | packet_data["udp.srcport"] = packet_data["udp.srcport"].astype('Int64') 24 | packet_data["udp.dstport"] = packet_data["udp.dstport"].astype('Int64') 25 | # 26 | packet_data["srcport"] = np.where(packet_data["ip.proto"] == "6", packet_data["tcp.srcport"], packet_data["udp.srcport"]) 27 | packet_data["dstport"] = np.where(packet_data["ip.proto"] == "6", packet_data["tcp.dstport"], packet_data["udp.dstport"]) 28 | # 29 | packet_data["srcport"] = np.where(packet_data["ip.proto"] == 6, packet_data["tcp.srcport"], packet_data["udp.srcport"]) 30 | packet_data["dstport"] = np.where(packet_data["ip.proto"] == 6, packet_data["tcp.dstport"], packet_data["udp.dstport"]) 31 | # 32 | packet_data["srcport"] = packet_data["srcport"].astype('Int64') 33 | packet_data["dstport"] = packet_data["dstport"].astype('Int64') 34 | 35 | #===============================CREATE THE FLOW IDs AND DROP UNWANTED COLUMNS =============================================# 36 | packet_data = packet_data.drop(["tcp.srcport","tcp.dstport","udp.srcport","udp.dstport"],axis=1) 37 | packet_data = packet_data.reset_index(drop=True) 38 | 39 | packet_data["flow.id"] = packet_data["ip.src"].astype(str) + " " + packet_data["ip.dst"].astype(str) + " " + packet_data["srcport"].astype(str) + " " + packet_data["dstport"].astype(str) + " " + packet_data["ip.proto"].astype(str) 40 | 41 | 42 | # Labeling 43 | filename_patterns = {"background" : "Background", 44 | "webbrowsing" : "WebBrowsing", 45 | "youtube" : "YouTube", 46 | "gmail" : "Gmail", 47 | "discord" : "Discord", 48 | "whatsapp" : "WhatsApp", 49 | "signal" : "Signal", 50 | "telegram" : "Telegram", 51 | "messenger" : "Messenger", 52 | "teams" : "Teams" 53 | } 54 | 55 | for pattern, labeld in filename_patterns.items(): 56 | if pattern in filename_in: 57 | label = labeld 58 | 59 | number_of_pkts_limit, min_number_of_packets = npkts, npkts 60 | #===============================Extract flows from packets and calculate features=============================================# 61 | main_packet_size = {} # dictionary to store list of packet sizes for each flow (Here key = flowID, value = list of packet sizes) 62 | flow_list = [] # contains the flowIDs (a combination of SIP,DIP,srcPort, dstPort, proto) 63 | main_inter_arrival_time = {} # dictionary to store list of IATs for each flow (Here key = flowID, value = list of IATs) 64 | last_time = {} # for each flow we store timestamp of the last packet arrival 65 | 66 | avg_pkt_sizes = {} # contains the flowID and their calculated average packet sizes 67 | string = {} # For each flow, we have a string of feature values (just for printing purpose, on screen) 68 | 69 | labels = {} # contains the flowID and their labels 70 | packet_count = {} # contains flowID as key and number of packets as valu 71 | 72 | # ==============================================================================================================================# 73 | print("NOW: COLLECTING PACKETS INTO FLOWS...") 74 | for row in packet_data.itertuples(index=True, name='Pandas'): 75 | time = float(row[1]) # timestamp of the packet 76 | srcip = row[2] #src ip 77 | dstip = row[3] #dst ip 78 | pktsize = row[5] #packet size 79 | proto = row[4] #protocol 80 | srcport = row[6] #source port 81 | dstport = row[7] #destination port 82 | key = row[8] #key which is a concatenation of the 5-tuple to identify the flow 83 | 84 | if key in flow_list: # check if the packet belongs to already existing flow ? 
85 | if (len(main_packet_size[key]) < number_of_pkts_limit ): 86 | packet_count[key] = packet_count[key] + 1 # increment packet count 87 | main_packet_size[key].append(pktsize) # append its packet size to the packet size list for this flow 88 | lasttime = last_time[key] 89 | diff = round(float(time) - float(lasttime), 9) # calculate inter-arrival time (seconds) 90 | main_inter_arrival_time[key].append(diff) # append IAT 91 | ## 92 | labels[key] = label 93 | ## 94 | last_time[key] = time # update last time for the flow, to the timestamp of this packet 95 | 96 | 97 | else: # if this packet is the first one in this NEW flow 98 | flow_list.append(key) # make its entry in the existing flow List 99 | packet_count[key] = 1 # first packet arrived for this flow, set count =1 100 | main_packet_size[key] = [pktsize] # make its entry in the packet size dictionary 101 | ## 102 | labels[key] = label 103 | ## 104 | main_inter_arrival_time[key] = [] # create a blank list in this dictionary, as it is the first packet 105 | 106 | last_time[key] = time 107 | 108 | # ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 109 | print("NOW: COMPUTING AND WRITING FLOW FEATURES INTO CSV...") 110 | header = "Flow ID,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Total,Packet Count,Current Packet Length,Flow IAT Min,Flow IAT Max,Flow IAT Mean,Flow Duration,Label" 111 | 112 | with open(filename_out, "w") as text_file: 113 | text_file.write(header) 114 | text_file.write("\n") 115 | # ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- 116 | # Calculate features related to packet size 117 | for key in flow_list: 118 | packet_list = main_packet_size[key] # packet_list contains the list of packet sizes for the flow in consideration 119 | length = len(packet_list) # number of packets 120 | avg_pkt_sizes[key] = sum(packet_list) / length # calculate avg packet size, and store 121 | min_pkt_size = min(packet_list) 122 | max_pkt_size = max(packet_list) 123 | 124 | string[key] = key + "," + str(min_pkt_size) + "," + str(max_pkt_size) + "," + str(avg_pkt_sizes[key]) + "," + str(sum(packet_list)) + "," + str(len(packet_list)) + "," + str(packet_list[len(packet_list)-1]) # concatenate features in string format 125 | # ------------------- --------------------------------------------------------------------------------------------------------------------------------------------------- 126 | # Now calculate IAT-related features 127 | inter_arrival_time_list = main_inter_arrival_time[key] # a list containing IATs for the flow 128 | length = len(inter_arrival_time_list) 129 | if length == 0: 130 | min_IAT = 0 131 | max_IAT = 0 132 | else: 133 | min_IAT = min(inter_arrival_time_list) 134 | min_IAT_ms = round(1000000000*min_IAT, 9) # convert in nanoseconds 135 | max_IAT = max(inter_arrival_time_list) 136 | max_IAT_ms = round(1000000000*max_IAT, 9) # convert in nanoseconds 137 | 138 | if length > 0: 139 | flow_duration = sum(inter_arrival_time_list) # flow duration seconds 140 | flow_duration_ms = round(1000000000*flow_duration, 9) # convert in nanoseconds 141 | avg_iat = flow_duration / length # Average IAT 142 | avg_iat_in_ms = round(1000000000*avg_iat, 9) # convert in nanoseconds 143 | 144 | if(len(main_packet_size[key]) >= min_number_of_packets): 145 | string[key] = string[key] + "," + 
str(min_IAT_ms) + "," + str(max_IAT_ms) + "," + str(avg_iat_in_ms) + "," + str(flow_duration_ms) 146 | string[key] = string[key] + "," + str(labels[key]) 147 | text_file.write(string[key]) 148 | text_file.write("\n") -------------------------------------------------------------------------------- /In_switch_ETC/Offline_model_preparation/pre-processing/extract_pkts.sh: -------------------------------------------------------------------------------- 1 | for f in *.pcap 2 | do 3 | echo $f 4 | tshark -r $f -Y 'ip.proto == 6 or ip.proto == 17' -T fields -e frame.time_relative -e ip.src -e ip.dst -e tcp.srcport -e tcp.dstport -e ip.proto -e ip.len -e udp.srcport -e udp.dstport -E separator='|' > ./txt_files/$f.txt 5 | done 6 | -------------------------------------------------------------------------------- /In_switch_ETC/Offline_model_preparation/pre-processing/readme.md: -------------------------------------------------------------------------------- 1 | The scripts in this folder are useful for extracting the data from the pcap files. 2 | - run the _extract_pkts.sh_ script in the folder containing the downloaded pcap files to extract the packet features. 3 | - the packet files are saved in a _txt_files_ folder 4 | - run the _extract_flows.sh_ script in the folder containing the downloaded pcap files to aggregate the packet data in the _txt_files_ folder into flow data saved in .csv files. 5 | - this bash script makes use of the _extract_flows_from_txt.py_ script which takes as input the txt file, the csv file which is the output, and the number of packets to consider in each flow. 6 | - merge the generated flow files into a single csv -------------------------------------------------------------------------------- /In_switch_ETC/Offline_model_preparation/readme.md: -------------------------------------------------------------------------------- 1 | - use the _Nims2023_Data_Analysis.ipynb_ file to analyze, train and save models. 2 | - use the _generate_table_entries.py_ file to convert the trained and saved model into table entries for the switch. 3 | - the _NIMS_IMA_sample_data.csv_ contains one day of data (8 November 2022) used for the analysis to shorten duration of in-switch experiments. 4 | - the _test_data_nimsIMA.csv_ contains the test data needed to create table entries for the flow table in the switch. 5 | - the noms2024_20_5.pkl is a sample trained and saved RF model with trees of maximum depth 20, 5 trees and 4 features. 6 | -------------------------------------------------------------------------------- /In_switch_ETC/README.md: -------------------------------------------------------------------------------- 1 | ## Organization of the folder 2 | There are two folders: 3 | 4 | - _Switch_ : the P4 code for the Tofino switch, the M/A table entries, and the runtime controller code. 5 | - _Offline_ : the jupyter notebooks for training the machine learning models and for offline evaluation, and the scripts for generating the M/A table entries from trained models. 
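For reference, a minimal sketch of the offline step that produces the pickled model consumed by _generate_table_entries.py_ is given below. This is only an illustration under stated assumptions, not the actual training pipeline (which lives in _Nims2023_Data_Analysis.ipynb_): it assumes a merged flow CSV with the four feature columns used in this work and a 'Label' column, and it mirrors the sample model _noms2024_20_5.pkl_ (5 trees of maximum depth 20).

```python
import pickle
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# The four features used by the in-switch model (same names as the flow CSV header)
feature_names = ['Flow IAT Min', 'Max Packet Length', 'Flow IAT Max', 'Packet Length Total']

# Assumption: a merged flow CSV that contains these feature columns and a 'Label' column
flows = pd.read_csv('NIMS_IMA_sample_data.csv')
X, y = flows[feature_names], flows['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 5 trees of maximum depth 20, matching the sample model noms2024_20_5.pkl
rf = RandomForestClassifier(n_estimators=5, max_depth=20, random_state=42)
rf.fit(X_train, y_train)
print('Held-out accuracy:', rf.score(X_test, y_test))

# Save the forest so that generate_table_entries.py can reload it
with open('noms2024_20_5.pkl', 'wb') as f:
    pickle.dump(rf, f)
```

_generate_table_entries.py_ later reloads this file with `pd.read_pickle('noms2024_20_5.pkl')` and walks `clf.estimators_` to derive the feature-table and code-table entries for the switch.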
-------------------------------------------------------------------------------- /In_switch_ETC/Switch/controller_digest_noms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | from __future__ import print_function 4 | 5 | import os 6 | import sys 7 | import pdb 8 | 9 | SDE_INSTALL = os.environ['SDE_INSTALL'] 10 | SDE_PYTHON2 = os.path.join(SDE_INSTALL, 'lib', 'python2.7', 'site-packages') 11 | sys.path.append(SDE_PYTHON2) 12 | sys.path.append(os.path.join(SDE_PYTHON2, 'tofino')) 13 | 14 | PYTHON3_VER = '{}.{}'.format( 15 | sys.version_info.major, 16 | sys.version_info.minor) 17 | SDE_PYTHON3 = os.path.join(SDE_INSTALL, 'lib', 'python' + PYTHON3_VER, 'site-packages') 18 | sys.path.append(SDE_PYTHON3) 19 | sys.path.append(os.path.join(SDE_PYTHON3, 'tofino')) 20 | sys.path.append(os.path.join(SDE_PYTHON3, 'tofino', 'bfrt_grpc')) 21 | 22 | import grpc 23 | import bfrt_grpc.bfruntime_pb2 as bfruntime_pb2 24 | import bfrt_grpc.client as bfrt_client 25 | import pandas as pd 26 | import time 27 | import socket, struct 28 | 29 | filename_out = sys.argv[1] #output csv file with classification results 30 | 31 | # Connect to the BF Runtime Server 32 | interface = bfrt_client.ClientInterface( 33 | grpc_addr = 'localhost:50052', 34 | client_id = 1, 35 | device_id = 0) 36 | print('Connected to BF Runtime Server') 37 | 38 | 39 | # Get the information about the running program 40 | bfrt_info = interface.bfrt_info_get() 41 | print('The target runs the program ', bfrt_info.p4_name_get()) 42 | 43 | # Establish that we are using this program on the given connection 44 | interface.bind_pipeline_config(bfrt_info.p4_name_get()) 45 | 46 | # Get digest 47 | learn_filter = bfrt_info.learn_get("digest") 48 | 49 | # List of registers in P4 program 50 | registers = ['Ingress.reg_flow_ID','Ingress.reg_status','Ingress.reg_pkt_count', 'Ingress.reg_classified_flag', 'Ingress.reg_flow_iat_max', 'Ingress.reg_flow_iat_min', 'Ingress.reg_pkt_len_max', 'Ingress.reg_pkt_len_total','Ingress.reg_time_last_pkt'] 51 | 52 | # Getting info about the flow table 53 | flow_act_tbl = bfrt_info.table_get('Ingress.target_flows_table') 54 | print('Table max packet length info:', flow_act_tbl) 55 | 56 | target = bfrt_client.Target(device_id=0, pipe_id=0xffff) 57 | 58 | header = 'source_addr,destin_addr,source_port,destin_port,protocol,pkt_count,flow_packet_class' 59 | 60 | count = 0 61 | 62 | with open(filename_out, "w") as text_file: 63 | text_file.write(header) 64 | text_file.write("\n") 65 | 66 | flow_counter = 0 67 | while True: 68 | try: 69 | digest = interface.digest_get(timeout=400) 70 | except: 71 | f = open("x.txt", "a") 72 | f.write('---- \n') 73 | f.close() 74 | break 75 | 76 | recv_target = digest.target 77 | 78 | digest_type = 1 79 | data_list = learn_filter.make_data_list(digest) 80 | 81 | if digest_type == 1: 82 | count = count + 1 83 | keys_reg = {'Ingress.reg_flow_ID': [],'Ingress.reg_status': [], 84 | 'Ingress.reg_pkt_count': [], 'Ingress.reg_classified_flag': [], 85 | 'Ingress.reg_flow_iat_min': [], 'Ingress.reg_pkt_len_max': [], 86 | 'Ingress.reg_flow_iat_max': [], 87 | 'Ingress.reg_pkt_len_total': [], 'Ingress.reg_time_last_pkt': []} 88 | datas_reg = {'Ingress.reg_flow_ID': [],'Ingress.reg_status': [], 89 | 'Ingress.reg_pkt_count': [], 'Ingress.reg_classified_flag': [], 90 | 'Ingress.reg_flow_iat_min': [], 'Ingress.reg_pkt_len_max': [], 91 | 'Ingress.reg_flow_iat_max': [], 92 | 'Ingress.reg_pkt_len_total': [], 'Ingress.reg_time_last_pkt': []} 93 | 
keys_table = [] 94 | datas_table = [] 95 | for dd in data_list: 96 | data_dict = dd.to_dict() 97 | # convert ip address into normal format 98 | source_addr = socket.inet_ntoa(struct.pack('!L', data_dict['source_addr'])) 99 | destin_addr = socket.inet_ntoa(struct.pack('!L', data_dict['destin_addr'])) 100 | source_port = str(data_dict['source_port']) 101 | destin_port = str(data_dict['destin_port']) 102 | protocol = str(data_dict['protocol']) 103 | flow_packet_class = data_dict['class_value'] 104 | pkt_count = str(data_dict['packet_num']) 105 | register_index = data_dict['register_index'] 106 | # 107 | FlowID = source_addr + ' ' + destin_addr + ' ' + source_port + ' ' + destin_port + ' ' + protocol 108 | # 109 | if (pkt_count == '0'): 110 | csv_row = source_addr + ',' + destin_addr + ',' + source_port + ',' + destin_port + ',' + protocol + ',' + pkt_count + ',' + str(-1) 111 | else: 112 | csv_row = source_addr + ',' + destin_addr + ',' + source_port + ',' + destin_port + ',' + protocol + ',' + pkt_count + ',' + str(flow_packet_class) 113 | 114 | with open(filename_out, "a") as text_file: 115 | text_file.write(csv_row) 116 | text_file.write("\n") 117 | 118 | if (data_dict['packet_num'] == 8): 119 | keys_table.append(flow_act_tbl.make_key( 120 | [bfrt_client.KeyTuple('hdr.ipv4.src_addr', data_dict['source_addr']), bfrt_client.KeyTuple('hdr.ipv4.dst_addr', data_dict['destin_addr']), 121 | bfrt_client.KeyTuple('meta.hdr_dstport', data_dict['destin_port']), bfrt_client.KeyTuple('meta.hdr_srcport', data_dict['source_port']), 122 | bfrt_client.KeyTuple('hdr.ipv4.protocol', data_dict['protocol'])])) 123 | 124 | datas_table.append(flow_act_tbl.make_data([ 125 | bfrt_client.DataTuple('f_class', flow_packet_class) 126 | ], 'Ingress.set_flow_class')) 127 | 128 | for reg_name in registers: 129 | reg_tbl = bfrt_info.table_get(reg_name) 130 | keys_reg[reg_name].append(reg_tbl.make_key([bfrt_client.KeyTuple('$REGISTER_INDEX', register_index)])) 131 | datas_reg[reg_name].append(reg_tbl.make_data([bfrt_client.DataTuple(reg_name+'.f1', 0)])) 132 | 133 | try: 134 | flow_act_tbl.entry_mod(target, keys_table, datas_table, p4_name=bfrt_info.p4_name_get()) 135 | print("Flow table entry modified") 136 | except: 137 | print("Error in flow_act_tbl.entry_mod") 138 | for reg_name in registers: 139 | reg_tbl = bfrt_info.table_get(reg_name) 140 | reg_tbl.entry_mod(target, key_list=keys_reg[reg_name], data_list=datas_reg[reg_name], flags={"from_hw":True}, p4_name=bfrt_info.p4_name_get()) 141 | print("Register table entry modified") 142 | -------------------------------------------------------------------------------- /In_switch_ETC/Switch/noms_20_5_4.p4: -------------------------------------------------------------------------------- 1 | /* -*- P4_16 -*- */ 2 | #include 3 | #include 4 | /************************************************************************* 5 | ************* C O N S T A N T S A N D T Y P E S ******************* 6 | **************************************************************************/ 7 | typedef bit<48> mac_addr_t; 8 | typedef bit<32> ipv4_addr_t; 9 | typedef bit<16> ether_type_t; 10 | const bit<16> TYPE_IPV4 = 0x800; 11 | const bit<16> TYPE_RECIRC = 0x88B5; 12 | const bit<8> TYPE_TCP = 6; 13 | const bit<8> TYPE_UDP = 17; 14 | const bit<32> MAX_REGISTER_ENTRIES = 2048; 15 | #define INDEX_WIDTH 11 16 | /************************************************************************* 17 | *********************** H E A D E R S ********************************* 18 | 
*************************************************************************/ 19 | /* Standard ethernet header */ 20 | header ethernet_h { 21 | mac_addr_t dst_addr; 22 | mac_addr_t src_addr; 23 | ether_type_t ether_type; 24 | } 25 | /* IPV4 header */ 26 | header ipv4_h { 27 | bit<4> version; 28 | bit<4> ihl; 29 | bit<8> diffserv; 30 | bit<16> total_len; 31 | bit<16> identification; 32 | bit<3> flags; 33 | bit<13> frag_offset; 34 | bit<8> ttl; 35 | bit<8> protocol; 36 | bit<16> hdr_checksum; 37 | ipv4_addr_t src_addr; 38 | ipv4_addr_t dst_addr; 39 | } 40 | /* TCP header */ 41 | header tcp_h { 42 | bit<16> src_port; 43 | bit<16> dst_port; 44 | bit<32> seq_no; 45 | bit<32> ack_no; 46 | bit<4> data_offset; 47 | bit<4> res; 48 | bit<1> cwr; 49 | bit<1> ece; 50 | bit<1> urg; 51 | bit<1> ack; 52 | bit<1> psh; 53 | bit<1> rst; 54 | bit<1> syn; 55 | bit<1> fin; 56 | bit<16> window; 57 | bit<16> checksum; 58 | bit<16> urgent_ptr; 59 | } 60 | /* UDP header */ 61 | header udp_h { 62 | bit<16> src_port; 63 | bit<16> dst_port; 64 | bit<16> udp_total_len; 65 | bit<16> checksum; 66 | } 67 | 68 | /*Custom header for recirculation*/ 69 | header recirc_h { 70 | bit<8> class_result; 71 | } 72 | 73 | /*********************** H E A D E R S ************************/ 74 | struct my_ingress_headers_t { 75 | ethernet_h ethernet; 76 | recirc_h recirc; 77 | ipv4_h ipv4; 78 | tcp_h tcp; 79 | udp_h udp; 80 | } 81 | 82 | /****** G L O B A L I N G R E S S M E T A D A T A *********/ 83 | struct my_ingress_metadata_t { 84 | bit<1> is_first; 85 | bit<8> classified_flag; 86 | bit<1> is_hash_collision; 87 | 88 | bit<1> reg_status; 89 | bit<32> flow_ID; 90 | bit<(INDEX_WIDTH)> register_index; 91 | 92 | bit<16> hdr_srcport; 93 | bit<16> hdr_dstport; 94 | 95 | bit<8> pkt_count; 96 | bit<32> time_last_pkt; 97 | 98 | bit<32> iat; 99 | bit<16> pkt_len_max; 100 | bit<16> pkt_len_total; 101 | 102 | bit<32> flow_iat_max; 103 | bit<32> flow_iat_min; 104 | 105 | bit<8> class0; 106 | bit<8> class1; 107 | bit<8> class2; 108 | bit<8> class3; 109 | bit<8> class4; 110 | 111 | bit<8> final_class; 112 | 113 | bit<202> codeword0; 114 | bit<220> codeword1; 115 | bit<205> codeword2; 116 | bit<221> codeword3; 117 | bit<204> codeword4; 118 | } 119 | 120 | struct flow_class_digest { // maximum size allowed is 47 bytes 121 | 122 | ipv4_addr_t source_addr; // 32 bits 123 | ipv4_addr_t destin_addr; // 32 bits 124 | bit<16> source_port; 125 | bit<16> destin_port; 126 | bit<8> protocol; 127 | bit<8> class_value; 128 | bit<8> packet_num; 129 | bit<(INDEX_WIDTH)> register_index; // To send info to the controller 130 | } 131 | 132 | /************************************************************************* 133 | *********************** P A R S E R *********************************** 134 | *************************************************************************/ 135 | parser TofinoIngressParser( 136 | packet_in pkt, 137 | out ingress_intrinsic_metadata_t ig_intr_md) { 138 | state start { 139 | pkt.extract(ig_intr_md); 140 | transition select(ig_intr_md.resubmit_flag) { 141 | 1 : parse_resubmit; 142 | 0 : parse_port_metadata; 143 | } 144 | } 145 | state parse_resubmit { 146 | // Parse resubmitted packet here. 
147 | transition reject; 148 | } 149 | state parse_port_metadata { 150 | pkt.advance(PORT_METADATA_SIZE); 151 | transition accept; 152 | } 153 | } 154 | 155 | parser IngressParser(packet_in pkt, 156 | /* User */ 157 | out my_ingress_headers_t hdr, 158 | out my_ingress_metadata_t meta, 159 | /* Intrinsic */ 160 | out ingress_intrinsic_metadata_t ig_intr_md) 161 | { 162 | /* This is a mandatory state, required by Tofino Architecture */ 163 | TofinoIngressParser() tofino_parser; 164 | 165 | state start { 166 | tofino_parser.apply(pkt, ig_intr_md); 167 | transition parse_ethernet; 168 | } 169 | 170 | state parse_ethernet { 171 | pkt.extract(hdr.ethernet); 172 | transition select(hdr.ethernet.ether_type) { 173 | TYPE_RECIRC : parse_recirc; 174 | TYPE_IPV4: parse_ipv4; 175 | default: accept; 176 | } 177 | } 178 | 179 | state parse_recirc { 180 | pkt.extract(hdr.recirc); 181 | transition parse_ipv4; 182 | } 183 | 184 | state parse_ipv4 { 185 | pkt.extract(hdr.ipv4); 186 | meta.final_class=10; 187 | transition select(hdr.ipv4.protocol) { 188 | TYPE_TCP: parse_tcp; 189 | TYPE_UDP: parse_udp; 190 | default: accept; 191 | } 192 | } 193 | 194 | state parse_tcp { 195 | pkt.extract(hdr.tcp); 196 | meta.hdr_dstport = hdr.tcp.dst_port; 197 | meta.hdr_srcport = hdr.tcp.src_port; 198 | transition accept; 199 | } 200 | 201 | state parse_udp { 202 | pkt.extract(hdr.udp); 203 | meta.hdr_dstport = hdr.udp.dst_port; 204 | meta.hdr_srcport = hdr.udp.src_port; 205 | transition accept; 206 | } 207 | } 208 | 209 | /************************************************************************* 210 | ************** I N G R E S S P R O C E S S I N G ******************* 211 | *************************************************************************/ 212 | /***************** M A T C H - A C T I O N *********************/ 213 | control Ingress( 214 | /* User */ 215 | inout my_ingress_headers_t hdr, 216 | inout my_ingress_metadata_t meta, 217 | /* Intrinsic */ 218 | in ingress_intrinsic_metadata_t ig_intr_md, 219 | in ingress_intrinsic_metadata_from_parser_t ig_prsr_md, 220 | inout ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md, 221 | inout ingress_intrinsic_metadata_for_tm_t ig_tm_md) 222 | { 223 | action drop() { 224 | ig_dprsr_md.drop_ctl = 1; 225 | } 226 | 227 | /* Registers for flow management */ 228 | Register,bit<(INDEX_WIDTH)>>(MAX_REGISTER_ENTRIES) reg_classified_flag; 229 | /* Register read action */ 230 | RegisterAction,bit<(INDEX_WIDTH)>,bit<8>>(reg_classified_flag) 231 | update_classified_flag = { 232 | void apply(inout bit<8> classified_flag, out bit<8> output) { 233 | if (hdr.recirc.isValid()){ 234 | classified_flag = hdr.ipv4.ttl; 235 | } 236 | output = classified_flag; 237 | } 238 | }; 239 | 240 | Register,bit<(INDEX_WIDTH)>>(MAX_REGISTER_ENTRIES) reg_status; 241 | /* Register read action */ 242 | RegisterAction,bit<(INDEX_WIDTH)>,bit<1>>(reg_status) 243 | read_reg_status = { 244 | void apply(inout bit<1> status, out bit<1> output) { 245 | output = status; 246 | status = 1; 247 | } 248 | }; 249 | 250 | Register,bit<(INDEX_WIDTH)>>(MAX_REGISTER_ENTRIES) reg_flow_ID; 251 | /* Register read action */ 252 | RegisterAction,bit<(INDEX_WIDTH)>,bit<32>>(reg_flow_ID) 253 | update_flow_ID = { 254 | void apply(inout bit<32> flow_ID) { 255 | flow_ID = meta.flow_ID; 256 | } 257 | }; 258 | /* Register read action */ 259 | RegisterAction,bit<(INDEX_WIDTH)>,bit<32>>(reg_flow_ID) 260 | read_only_flow_ID = { 261 | void apply(inout bit<32> flow_ID, out bit<32> output) { 262 | output = flow_ID; 263 | } 264 | }; 265 | 266 | 
Register,bit<(INDEX_WIDTH)>>(MAX_REGISTER_ENTRIES) reg_time_last_pkt; 267 | /* Register read action */ 268 | RegisterAction,bit<(INDEX_WIDTH)>,bit<32>>(reg_time_last_pkt) 269 | read_time_last_pkt = { 270 | void apply(inout bit<32> time_last_pkt, out bit<32> output) { 271 | output = time_last_pkt; 272 | time_last_pkt = ig_prsr_md.global_tstamp[31:0]; 273 | } 274 | }; 275 | 276 | //registers for ML inference - features 277 | Register,bit<(INDEX_WIDTH)>>(MAX_REGISTER_ENTRIES) reg_pkt_count; 278 | /* Register read action */ 279 | RegisterAction,bit<(INDEX_WIDTH)>,bit<8>>(reg_pkt_count) 280 | read_pkt_count = { 281 | void apply(inout bit<8> pkt_count, out bit<8> output) { 282 | pkt_count = pkt_count + 1; 283 | output = pkt_count; 284 | } 285 | }; 286 | 287 | Register,bit<(INDEX_WIDTH)>>(MAX_REGISTER_ENTRIES) reg_pkt_len_max; 288 | /* Register read action */ 289 | RegisterAction,bit<(INDEX_WIDTH)>,bit<16>>(reg_pkt_len_max) 290 | read_pkt_len_max = { 291 | void apply(inout bit<16> pkt_len_max, out bit<16> output) { 292 | if (meta.is_first == 1){ 293 | pkt_len_max = hdr.ipv4.total_len; 294 | } 295 | else if (hdr.ipv4.total_len > pkt_len_max){ 296 | pkt_len_max = hdr.ipv4.total_len; 297 | } 298 | output = pkt_len_max; 299 | } 300 | }; 301 | 302 | Register,bit<(INDEX_WIDTH)>>(MAX_REGISTER_ENTRIES) reg_pkt_len_total; 303 | /* Register read action */ 304 | RegisterAction,bit<(INDEX_WIDTH)>,bit<16>>(reg_pkt_len_total) 305 | read_pkt_len_total = { 306 | void apply(inout bit<16> pkt_len_total, out bit<16> output) { 307 | if (meta.is_first == 1){ 308 | pkt_len_total = hdr.ipv4.total_len; 309 | } 310 | else{ 311 | pkt_len_total = pkt_len_total + hdr.ipv4.total_len; 312 | } 313 | output = pkt_len_total; 314 | } 315 | }; 316 | 317 | Register,bit<(INDEX_WIDTH)>>(MAX_REGISTER_ENTRIES) reg_flow_iat_max; 318 | /* Register read action */ 319 | RegisterAction,bit<(INDEX_WIDTH)>,bit<32>>(reg_flow_iat_max) 320 | read_flow_iat_max = { 321 | void apply(inout bit<32> flow_iat_max, out bit<32> output) { 322 | if (meta.is_first != 1){ 323 | if(meta.iat > flow_iat_max){ 324 | flow_iat_max = meta.iat; 325 | } 326 | } 327 | output = flow_iat_max; 328 | } 329 | }; 330 | 331 | Register,bit<(INDEX_WIDTH)>>(MAX_REGISTER_ENTRIES) reg_flow_iat_min; 332 | /* Register read action */ 333 | RegisterAction,bit<(INDEX_WIDTH)>,bit<32>>(reg_flow_iat_min) 334 | read_flow_iat_min = { 335 | void apply(inout bit<32> flow_iat_min, out bit<32> output) { 336 | if (meta.pkt_count <= 2){ 337 | flow_iat_min = meta.iat; 338 | } 339 | else if(meta.iat < flow_iat_min){ 340 | flow_iat_min = meta.iat; 341 | } 342 | output = flow_iat_min; 343 | } 344 | }; 345 | 346 | 347 | /* Declaration of the hashes*/ 348 | Hash>(HashAlgorithm_t.CRC32) flow_id_calc; 349 | Hash>(HashAlgorithm_t.CRC16) idx_calc; 350 | 351 | /* Calculate hash of the 5-tuple to represent the flow ID */ 352 | action get_flow_ID(bit<16> srcPort, bit<16> dstPort) { 353 | meta.flow_ID = flow_id_calc.get({hdr.ipv4.src_addr, 354 | hdr.ipv4.dst_addr,srcPort, dstPort, hdr.ipv4.protocol}); 355 | } 356 | /* Calculate hash of the 5-tuple to use as 1st register index */ 357 | action get_register_index(bit<16> srcPort, bit<16> dstPort) { 358 | meta.register_index = idx_calc.get({hdr.ipv4.src_addr, 359 | hdr.ipv4.dst_addr,srcPort, dstPort, hdr.ipv4.protocol}); 360 | } 361 | 362 | /* Assign class if at leaf node */ 363 | action SetClass0(bit<8> classe) { 364 | meta.class0 = classe; 365 | } 366 | action SetClass1(bit<8> classe) { 367 | meta.class1 = classe; 368 | } 369 | action SetClass2(bit<8> classe) { 
370 | meta.class2 = classe; 371 | } 372 | action SetClass3(bit<8> classe) { 373 | meta.class3 = classe; 374 | } 375 | action SetClass4(bit<8> classe) { 376 | meta.class4 = classe; 377 | } 378 | 379 | /* Compute packet interarrival time (IAT)*/ 380 | action get_iat_value(){ 381 | meta.iat = ig_prsr_md.global_tstamp[31:0] - meta.time_last_pkt; 382 | } 383 | 384 | /* Forward to a specific port upon classification */ 385 | action ipv4_forward(PortId_t port) { 386 | ig_tm_md.ucast_egress_port = port; 387 | } 388 | 389 | /* Custom Do Nothing Action */ 390 | action nop(){} 391 | 392 | /* Recirculate packet via loopback port 68 */ 393 | action recirculate(bit<7> recirc_port) { 394 | ig_tm_md.ucast_egress_port[8:7] = ig_intr_md.ingress_port[8:7]; 395 | ig_tm_md.ucast_egress_port[6:0] = recirc_port; 396 | hdr.recirc.setValid(); 397 | hdr.recirc.class_result = meta.final_class; 398 | hdr.ethernet.ether_type = TYPE_RECIRC; 399 | } 400 | 401 | /* Feature table actions */ 402 | action SetCode0(bit<29> code0, bit<30> code1, bit<23> code2, bit<31> code3, bit<28> code4) { 403 | meta.codeword0[201:173] = code0; 404 | meta.codeword1[219:190] = code1; 405 | meta.codeword2[204:182] = code2; 406 | meta.codeword3[220:190] = code3; 407 | meta.codeword4[203:176] = code4; 408 | } 409 | action SetCode1(bit<53> code0, bit<64> code1, bit<63> code2, bit<60> code3, bit<42> code4) { 410 | meta.codeword0[172:120] = code0; 411 | meta.codeword1[189:126] = code1; 412 | meta.codeword2[181:119] = code2; 413 | meta.codeword3[189:130] = code3; 414 | meta.codeword4[175:134] = code4; 415 | } 416 | action SetCode2(bit<57> code0, bit<48> code1, bit<60> code2, bit<54> code3, bit<52> code4) { 417 | meta.codeword0[119:63] = code0; 418 | meta.codeword1[125:78] = code1; 419 | meta.codeword2[118:59] = code2; 420 | meta.codeword3[129:76] = code3; 421 | meta.codeword4[133:82] = code4; 422 | } 423 | action SetCode3(bit<63> code0, bit<78> code1, bit<59> code2, bit<76> code3, bit<82> code4) { 424 | meta.codeword0[62:0] = code0; 425 | meta.codeword1[77:0] = code1; 426 | meta.codeword2[58:0] = code2; 427 | meta.codeword3[75:0] = code3; 428 | meta.codeword4[81:0] = code4; 429 | } 430 | 431 | /* Feature tables */ 432 | table table_feature0{ 433 | key = {meta.flow_iat_min[31:17]: range @name("feature0");} 434 | actions = {@defaultonly nop; SetCode0;} 435 | size = 64; 436 | const default_action = nop(); 437 | } 438 | table table_feature1{ 439 | key = {meta.pkt_len_max: range @name("feature1");} 440 | actions = {@defaultonly nop; SetCode1;} 441 | size = 160; 442 | const default_action = nop(); 443 | } 444 | table table_feature2{ 445 | key = {meta.flow_iat_max[31:24]: range @name("feature2");} 446 | actions = {@defaultonly nop; SetCode2;} 447 | size = 112; 448 | const default_action = nop(); 449 | } 450 | table table_feature3{ 451 | key = {meta.pkt_len_total: range @name("feature3");} 452 | actions = {@defaultonly nop; SetCode3;} 453 | size = 244; 454 | const default_action = nop(); 455 | } 456 | 457 | /* Code tables */ 458 | table code_table0{ 459 | key = {meta.codeword0: ternary;} 460 | actions = {@defaultonly nop; SetClass0;} 461 | size = 203; 462 | const default_action = nop(); 463 | } 464 | table code_table1{ 465 | key = {meta.codeword1: ternary;} 466 | actions = {@defaultonly nop; SetClass1;} 467 | size = 221; 468 | const default_action = nop(); 469 | } 470 | table code_table2{ 471 | key = {meta.codeword2: ternary;} 472 | actions = {@defaultonly nop; SetClass2;} 473 | size = 206; 474 | const default_action = nop(); 475 | } 476 | table 
code_table3{ 477 | key = {meta.codeword3: ternary;} 478 | actions = {@defaultonly nop; SetClass3;} 479 | size = 222; 480 | const default_action = nop(); 481 | } 482 | table code_table4{ 483 | key = {meta.codeword4: ternary;} 484 | actions = {@defaultonly nop; SetClass4;} 485 | size = 205; 486 | const default_action = nop(); 487 | } 488 | 489 | action set_default_result() { 490 | meta.final_class = meta.class0; 491 | ig_dprsr_md.digest_type = 1; 492 | recirculate(68); 493 | } 494 | 495 | action set_final_class(bit<8> class_result) { 496 | meta.final_class = class_result; 497 | ig_dprsr_md.digest_type = 1; 498 | recirculate(68); 499 | } 500 | 501 | table voting_table { 502 | key = { 503 | meta.class0: exact; 504 | meta.class1: exact; 505 | meta.class2: exact; 506 | meta.class3: exact; 507 | meta.class4: exact; 508 | } 509 | actions = {set_final_class; @defaultonly set_default_result;} 510 | size = 5256; 511 | const default_action = set_default_result(); 512 | } 513 | 514 | /* Forwarding-Inference Block Table */ 515 | action set_flow_class(bit<8> f_class) { 516 | meta.final_class = f_class; 517 | } 518 | table target_flows_table { 519 | key = { 520 | hdr.ipv4.src_addr: exact; 521 | hdr.ipv4.dst_addr: exact; 522 | meta.hdr_srcport: exact; 523 | meta.hdr_dstport: exact; 524 | hdr.ipv4.protocol: exact; 525 | } 526 | actions = {set_flow_class; @defaultonly drop;} 527 | size = 500; 528 | const default_action = drop(); 529 | } 530 | 531 | apply { 532 | // filter for background or already classified traffic 533 | target_flows_table.apply(); 534 | 535 | // get flow ID and register index 536 | bit<32> tmp_flow_ID; 537 | get_flow_ID(meta.hdr_srcport, meta.hdr_dstport); 538 | get_register_index(meta.hdr_srcport, meta.hdr_dstport); 539 | 540 | if(meta.final_class==0){ //flow not classified 541 | 542 | // check if register for emptiness 543 | meta.reg_status = read_reg_status.execute(meta.register_index); 544 | 545 | // check if register array is empty 546 | if (meta.reg_status == 0){ // we do not yet know this flow 547 | meta.is_first = 1; 548 | update_flow_ID.execute(meta.register_index); 549 | // modify timestamp register 550 | meta.time_last_pkt = read_time_last_pkt.execute(meta.register_index); 551 | meta.pkt_count = read_pkt_count.execute(meta.register_index); 552 | meta.pkt_len_max = read_pkt_len_max.execute(meta.register_index); 553 | meta.pkt_len_total = read_pkt_len_total.execute(meta.register_index); 554 | ipv4_forward(260); 555 | } 556 | else { // not the first packet - get flow_ID from register 557 | meta.is_first = 0; 558 | tmp_flow_ID = read_only_flow_ID.execute(meta.register_index); 559 | if(meta.flow_ID != tmp_flow_ID){ // hash collision 560 | meta.pkt_count = 0; //hash col 561 | // send digest to inform controller of the collision 562 | ig_dprsr_md.digest_type = 1; 563 | ipv4_forward(260); 564 | } 565 | else { // not first packet and not hash collision 566 | //read and update packet count 567 | meta.pkt_count = read_pkt_count.execute(meta.register_index); 568 | 569 | // read and update packet length features 570 | meta.pkt_len_max = read_pkt_len_max.execute(meta.register_index); 571 | meta.pkt_len_total = read_pkt_len_total.execute(meta.register_index); 572 | 573 | // modify timestamp register 574 | meta.time_last_pkt = read_time_last_pkt.execute(meta.register_index); 575 | 576 | // compute IAT value 577 | get_iat_value(); 578 | 579 | //read and update IAT features 580 | meta.flow_iat_max = read_flow_iat_max.execute(meta.register_index); 581 | meta.flow_iat_min = 
read_flow_iat_min.execute(meta.register_index); 582 | 583 | // check if # of packets requirement is met 584 | if(meta.pkt_count == 8){ 585 | 586 | // apply feature tables to assign codes 587 | table_feature0.apply(); 588 | table_feature1.apply(); 589 | table_feature2.apply(); 590 | table_feature3.apply(); 591 | 592 | // apply code tables to assign labels 593 | code_table0.apply(); 594 | code_table1.apply(); 595 | code_table2.apply(); 596 | code_table3.apply(); 597 | code_table4.apply(); 598 | 599 | // decide final class 600 | voting_table.apply(); 601 | } 602 | else{ // this happens to first packets and packet number 5 onwards 603 | meta.classified_flag = update_classified_flag.execute(meta.register_index); 604 | 605 | if (meta.classified_flag != 0) {//No need to check again - already classified 606 | hdr.recirc.setInvalid(); 607 | hdr.ethernet.ether_type = TYPE_IPV4; 608 | //set value of ttl to classification result (stats only) 609 | hdr.ipv4.ttl = meta.classified_flag; 610 | } 611 | ipv4_forward(260); 612 | } //END OF CHECK FOR PREVIOUS CLASSIFICATION 613 | } //END OF CHECK ON IF NO COLLISION 614 | } // END OF CHECK ON WHETHER FIRST CLASS 615 | } 616 | ipv4_forward(260); 617 | } //END OF APPLY 618 | } //END OF INGRESS CONTROL 619 | 620 | /************************************************************************* 621 | *********************** D E P A R S E R ******************************* 622 | *************************************************************************/ 623 | 624 | control IngressDeparser(packet_out pkt, 625 | /* User */ 626 | inout my_ingress_headers_t hdr, 627 | in my_ingress_metadata_t meta, 628 | /* Intrinsic */ 629 | in ingress_intrinsic_metadata_for_deparser_t ig_dprsr_md) 630 | { 631 | Digest() digest; 632 | 633 | apply { 634 | 635 | if (ig_dprsr_md.digest_type == 1) { 636 | // Pack digest and send to controller 637 | digest.pack({hdr.ipv4.src_addr, hdr.ipv4.dst_addr, meta.hdr_srcport, meta.hdr_dstport, hdr.ipv4.protocol, meta.final_class, meta.pkt_count, meta.register_index}); 638 | } 639 | 640 | /* we do not update checksum because we used ttl field for stats*/ 641 | pkt.emit(hdr); 642 | } 643 | } 644 | 645 | /************************************************************************* 646 | **************** E G R E S S P R O C E S S I N G ******************* 647 | *************************************************************************/ 648 | struct my_egress_headers_t { 649 | } 650 | 651 | /******** G L O B A L E G R E S S M E T A D A T A *********/ 652 | 653 | struct my_egress_metadata_t { 654 | } 655 | 656 | /*********************** P A R S E R **************************/ 657 | 658 | parser EgressParser(packet_in pkt, 659 | /* User */ 660 | out my_egress_headers_t hdr, 661 | out my_egress_metadata_t meta, 662 | /* Intrinsic */ 663 | out egress_intrinsic_metadata_t eg_intr_md) 664 | { 665 | /* This is a mandatory state, required by Tofino Architecture */ 666 | state start { 667 | pkt.extract(eg_intr_md); 668 | transition accept; 669 | } 670 | } 671 | 672 | /***************** M A T C H - A C T I O N *********************/ 673 | 674 | control Egress( 675 | /* User */ 676 | inout my_egress_headers_t hdr, 677 | inout my_egress_metadata_t meta, 678 | /* Intrinsic */ 679 | in egress_intrinsic_metadata_t eg_intr_md, 680 | in egress_intrinsic_metadata_from_parser_t eg_prsr_md, 681 | inout egress_intrinsic_metadata_for_deparser_t eg_dprsr_md, 682 | inout egress_intrinsic_metadata_for_output_port_t eg_oport_md) 683 | { 684 | apply { 685 | } 686 | } 687 | 688 | 
/********************* D E P A R S E R ************************/ 689 | 690 | control EgressDeparser(packet_out pkt, 691 | /* User */ 692 | inout my_egress_headers_t hdr, 693 | in my_egress_metadata_t meta, 694 | /* Intrinsic */ 695 | in egress_intrinsic_metadata_for_deparser_t eg_dprsr_md) 696 | { 697 | apply { 698 | pkt.emit(hdr); 699 | } 700 | } 701 | 702 | /************************************************************************* 703 | *********************** S W I T C H ******************************* 704 | *************************************************************************/ 705 | Pipeline( 706 | IngressParser(), 707 | Ingress(), 708 | IngressDeparser(), 709 | EgressParser(), 710 | Egress(), 711 | EgressDeparser() 712 | ) pipe; 713 | 714 | Switch(pipe) main; 715 | -------------------------------------------------------------------------------- /In_switch_ETC/Switch/readme.md: -------------------------------------------------------------------------------- 1 | To run the code: 2 | - check the code and change the forwarding port 260 to the right one in your setup 3 | - compile the P4 code 4 | - we used the Intel SDE version 9.7.0 5 | - load the code onto the switch using _bf_switchd_ 6 | - load the table entries in the _table_entries.py_ file using _bfrt_python_ 7 | - this also configures and brings up ports 1, 5 and 9 (56, 260, 292) which we use. 8 | - modify them according to your setup. 9 | - run the _controller_digest_noms.py_ script to enable the controller to collect packet digests with classification results, clean registers after flows are classified, and update the flow table. 10 | - give your output csv file as an argument to this script when running it. 11 | - send packets from the pcap files through the switch using tcpreplay 12 | - the current configuration has a filter table in the P4 program that will filter only the flows belonging to the test data for classification in order to easily compare them with the offline results. 13 | - use the function at the end of the _NIMS2023_Data_Analysis.ipynb_ notebook to analyze the csv obtained at the controller at the end of the experiment. 14 | -------------------------------------------------------------------------------- /Offline_ETC/README.md: -------------------------------------------------------------------------------- 1 | # ENCRYPTED TRAFFIC CLASSIFICATION 2 | 3 | This repo contains the scripts used for the data preparation and data engineering used in the paper [Encrypted Traffic Classification at Line Rate in Programmable Switches with Machine Learning](https://dspace.networks.imdea.org/handle/20.500.12761/1791) by Aristide Tanyi-Jong Akem, Guillaume Fraysse, Marco Fiore presented at IEEE/IFIP Network Operations and Management Symposium (NOMS) 2024. 4 | 5 | ## Datasets 6 | 5 different datasets containing Encrypted data were considered. Some are public some require a subscription to access them. The first three were kept in the paper: 7 | * The [ISCXVPN2016 Dataset](http://dx.doi.org/10.5220/0005740704070414) dataset (from Draper-Gil, G.; Lashkari, A.; Mamun, M. and A. Ghorbani, A. (2016). Characterization of Encrypted and VPN Traffic using Time-related Features. In Proceedings of the 2nd International Conference on Information Systems Security and Privacy - ICISSP; ISBN 978-989-758-167-0; ISSN 2184-4356, SciTePress, pages 407-414. DOI: 10.5220/0005740704070414) is a popular labeled dataset made available by the Canadian Institute of Cybersecurity (CIC) from University of New Brunswick (UNB). 
It comprises about 28GB of traffic data captured using tcpdump and Wireshark. A subset of this dataset is made of VPN data, generated using an external VPN service. For this work the dataset was processed 8 | from the raw PCAP files using the pipeline described in Section 2 to keep only the VPN subset and aggregate the packets into flows. This results in 4960 flows. This dataset includes 7 classes of encrypted traffic: Browsing, Email, 9 | Chat, Streaming, File Transfer, VoIP, and P2P. Figure 6a of the paper shows the distribution of the samples. 10 | * The **NOMS2023 Encrypted Mobile Instant Messaging Traffic Dataset** (by Zolboo Erdenebaatar, Riyad Alshammari, Nur Zincir-Heywood, Marwa Elsayed, Biswajit Nandy, Nabil Seddigh, January 23, 2023, "Encrypted Mobile Instant Messaging Traffic Dataset", IEEE Dataport) can be downloaded at [https://dx.doi.org/10.21227/aer2-kq52](https://dx.doi.org/10.21227/aer2-kq52). It is divided into 7 files in zip format. Six of these files contain data from traffic to commonly used Instant Messaging applications (Discord, Facebook Messenger, Signal, Microsoft Teams, Telegram and WhatsApp). The last file (non_ima_encrypted_traffic.zip) contains encrypted traffic that does not belong to any of these classes, i.e. traffic that is not from Instant Messaging applications. It contains four classes; the first three are other types of usage: Gmail, WebBrowsing and YouTube. The last class, Background, contains all background 11 | traffic, i.e. traffic recorded during the same period that does not belong to any of the classes identified by the other applications. For this work we considered only the data from the 6 Instant Messaging applications and the classification 12 | of traffic into these six classes. The subset of the dataset that is then considered contains 6 different classes: Discord, Facebook Messenger, Signal, Microsoft Teams, Telegram and WhatsApp. 13 | * The Netflow QUIC dataset from [V. Tong, H. A. Tran, S. Souihi and A. Mellouk, "A Novel QUIC Traffic Classifier Based on Convolutional Neural Networks," 2018 IEEE Global Communications Conference (GLOBECOM), Abu Dhabi, United Arab Emirates, 2018, pp. 1-6, doi: 10.1109/GLOCOM.2018.8647128.](https://ieeexplore.ieee.org/abstract/document/8647128) is a labeled dataset of QUIC traffic to Google services. This dataset is significantly larger than the others, with 365,000 flows and a total of 136 million packets. It contains traffic classified into 5 different classes of Google services: Chat, VoIP, FileTransfer, YouTube video streaming, and Google Play Music. 14 | * [UC Davis](https://doi.org/10.48550/arXiv.1812.09761): The UCDavis QUIC Dataset is a labeled dataset that can be downloaded 15 | at [https://drive.google.com/drive/folders/1Pvev0hJ82usPh6dWDlz7Lv8L6h3JpWhE](https://drive.google.com/drive/folders/1Pvev0hJ82usPh6dWDlz7Lv8L6h3JpWhE) (file pretraining.zip). Traffic on different services offered by Google was captured by the University of California, Davis (UC Davis) team. The data was collected using AutoIt and Selenium WebDriver scripts on different systems running various versions of Windows and Ubuntu Linux. Only the QUIC traffic was kept. This dataset contains 5 classes corresponding to 5 different Google services: Google Drive, YouTube, Google Doc, Google Search and Google Music. 16 | * The CSTNET TLS1.3 dataset (by [Lin, X., Xiong, G., Gou, G., Li, Z., Shi, J. and Yu, J., 2022, April.
Et-bert: A contextualized datagram representation with pre-training transformers for encrypted traffic classification. In Proceedings of the ACM Web Conference 2022 (pp. 633-642)](https://dl.acm.org/doi/abs/10.1145/3485447.3512217)) is a labeled dataset of encrypted traffic to a large number (120) of services. This number of classes is an order of magnitude larger than in the other 4 datasets. It is probably more realistic from the perspective of a network operator, whose customers generate traffic not only to a handful of services but to any service on the Internet. This dataset contains data from 120 classes, each of which is labeled by the domain name of an application (e.g. google.com, elsevier.com, ...). 17 | 18 | ## Data preparation 19 | Most datasets are in raw PCAP format. We have performed two steps: 20 | * convert PCAP to CSV 21 | * compute the flow for each packet. A flow is identified by a 5-tuple (IP src, port src, IP dst, port dst, protocol). Every packet with the same value for the tuple gets associated with a unique flow id. This step adds a new column with this flow id to the CSV file. 22 | 23 | ### PCAP to CSV 24 | After initially using a Python script based on Scapy, we moved to tshark for better performance on the larger datasets. 25 | We used the tshark command, cf. the script **data_preparation/pcap2csv.sh**. Once you have downloaded a dataset, you can run the script on each of the PCAP files and redirect the output to a CSV file: 26 | 27 | ```bash 28 | bash data_preparation/pcap2csv.sh datafile.pcap > datafile.csv 29 | ``` 30 | 31 | ### Add the flow id column 32 | 33 | To add the flow id information to the dataset, we have developed the Python script **data_preparation/pkts2flows.py**; its core flow-id computation is sketched below. 34 | To use it, simply change the placeholder values *inputdir* and *outputdir* in the script. 35 | * *inputdir* must point to the directory where the csv files of the dataset are stored. 36 | * *outputdir* must point to the folder where you want the new files to be written.
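The flow-id assignment in **pkts2flows.py** is a pandas `groupby` over the 5-tuple followed by `ngroup()`, which gives every distinct tuple a unique integer id. The snippet below is a minimal sketch of that step for a single CSV file (file names are placeholders; column names follow the output of **pcap2csv.sh**; the full script additionally renumbers flow ids so that they remain consecutive across all files of a dataset):

```python
import pandas as pd

# Read one CSV file produced by pcap2csv.sh
df = pd.read_csv(
    "datafile.csv",
    names=["packet_id", "timestamp", "iat", "src", "psrc",
           "dst", "pdst", "protocol", "length"],
    header=0,
)

# Packets sharing the same 5-tuple (src IP, src port, dst IP, dst port, protocol)
# belong to the same flow; ngroup() assigns each group a unique integer id.
df["flow_id"] = df.groupby(["src", "psrc", "dst", "pdst", "protocol"]).ngroup()

df.to_csv("datafile_with_flows.csv", index=False)
```

Then run the script on the whole input directory: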
37 | 38 | ```bash 39 | python data_preparation/pkts2flows.py 40 | ``` 41 | -------------------------------------------------------------------------------- /Offline_ETC/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nds-group/ETC_NOMS_2024/1ad4bbd4e1c097e87625b185e6d0ba14fa201bbb/Offline_ETC/__init__.py -------------------------------------------------------------------------------- /Offline_ETC/cstnet-tls13_traffic_classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | from os import listdir 6 | from os.path import isfile, join 7 | import sys 8 | import time 9 | 10 | import pandas as pd 11 | 12 | import numpy as np 13 | 14 | from scipy.stats import kurtosis, skew 15 | 16 | from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures 17 | from sklearn.compose import ColumnTransformer 18 | from sklearn.pipeline import Pipeline 19 | from sklearn.metrics import classification_report, f1_score, confusion_matrix, ConfusionMatrixDisplay 20 | 21 | import seaborn as sns 22 | import matplotlib.pyplot as plt 23 | 24 | from encrypted_traffic_classification import EncryptedTrafficClassifier, EncryptedTrafficClassifierIterator 25 | 26 | ######################################## 27 | # Data preparation: convert RAW data 28 | ######################################## 29 | class CstNetTls13Classifier(EncryptedTrafficClassifier): 30 | def __init__(self, nb_folds, nb_packets_per_flow): 31 | super().__init__( 32 | nb_folds= nb_folds, 33 | nb_packets_per_flow = nb_packets_per_flow, 34 | filename_prefix = "cstnet_tls13", 35 | processed_data_output_dir = "cstnet_tls13_output/", 36 | data_dir = "data/cstnet_tls13/" 37 | ) 38 | 39 | pools = [tuple(pool) for pool in [self.nb_packets_per_flow, range(self.nb_folds)]] 40 | result = [[]] 41 | for pool in pools: 42 | result = [x+[y] for x in result for y in pool] 43 | self.flow_ids = result 44 | 45 | pools = [tuple(pool) for pool in [self.nb_packets_per_flow]] 46 | result = [[]] 47 | for pool in pools: 48 | result = [x+[y] for x in result for y in pool] 49 | self.flow_ids_without_folds = result 50 | 51 | pools = [tuple(pool) for pool in [range(self.nb_folds)]] 52 | result = [[]] 53 | for pool in pools: 54 | result = [x+[y] for x in result for y in pool] 55 | self.packet_ids = result 56 | 57 | ######################################## 58 | # Preprocessing 59 | ######################################## 60 | def _get_flows_with_all_packets(self): 61 | print("_get_flows_with_all_packets") 62 | start_time = time.time() 63 | subdirs = sorted([f for f in listdir(self.data_dir)]) 64 | nb_flows = 0 65 | df_flows = pd.DataFrame() 66 | self.classes = set() 67 | for subdir in subdirs: 68 | # print("subdir", self.data_dir+subdir) 69 | _files = sorted([f for f in listdir(self.data_dir + subdir) if isfile(join(self.data_dir + subdir, f))]) 70 | # print(" files", _files) 71 | for _i in range(len(_files)): 72 | f = self.data_dir + subdir + "/" + _files[_i] 73 | 74 | df_new = pd.read_csv(f, 75 | names = [ 76 | 'flow_id', 77 | 'timestamp', 78 | 'iat', 79 | 'source', 80 | 'sport', 81 | 'dest', 82 | 'dport', 83 | 'protocol', 84 | 'length' 85 | ], 86 | header = 0 87 | ) 88 | print(f, df_new.shape) 89 | 90 | # drop DNS traffic 91 | df_new = df_new.drop(df_new[df_new['sport'] == 53].index) 92 | df_new = df_new.drop(df_new[df_new['dport'] == 53].index) 93 | 94 | found = 
False 95 | for _c in self.all_classes: 96 | if _c in f: 97 | found = True 98 | df_new['class'] = _c 99 | self.classes.add(_c) 100 | break 101 | if found == False: 102 | print("class not identified for", f) 103 | 104 | # extract flow and add statistical features 105 | dfs = [] 106 | for flow_id in df_new['flow_id'].unique(): 107 | nb_flows += 1 108 | d = df_new[df_new['flow_id'] == flow_id].head(n = 1) 109 | d['nb_packets'] = len(df_new[df_new['flow_id'] == flow_id]) 110 | c = d['class'].tolist() 111 | dport = d.dport.tolist() 112 | sport = d.sport.tolist() 113 | #print(d) 114 | _df = df_new.loc[df_new['flow_id'] == flow_id, 'iat'] 115 | d['sum_iat'] = np.sum(_df) 116 | 117 | _df = df_new.loc[df_new['flow_id'] == flow_id, 'length'] 118 | d['sum_length'] = np.sum(_df) 119 | d['src'] = f 120 | dfs.append(d) 121 | _d = pd.concat(dfs) 122 | df_flows = pd.concat([_d, df_flows]) 123 | # For debugging 124 | # break 125 | 126 | print(" processing took ", time.time() - start_time, "seconds.") 127 | print("%d flows processed" % nb_flows) 128 | # Finish processing the data, create the train/tests split and save as pickle files 129 | df_flows = df_flows.fillna(0) 130 | 131 | self.classes = list(self.classes) 132 | self._hotencode_class(df_flows) 133 | return df_flows 134 | 135 | def data_preparation(self): 136 | print("data_preparation") 137 | import warnings 138 | warnings.filterwarnings("ignore") 139 | 140 | df_flows = {} 141 | files = [] 142 | subdirs = [f for f in listdir(self.data_dir)] 143 | for subdir in subdirs: 144 | # print("subdir", self.data_dir+subdir) 145 | _files = [f for f in listdir(self.data_dir + subdir) if isfile(join(self.data_dir + subdir, f))] 146 | # print(" files", _files) 147 | for _i in range(len(_files)): 148 | _files[_i] = self.data_dir + subdir + "/" + _files[_i] 149 | files += _files 150 | 151 | # print(files) 152 | for i in self.nb_packets_per_flow: 153 | self.__generate_pickle_for_n_packets(i, files) 154 | 155 | def __generate_pickle_for_n_packets(self, n, files): 156 | print("__generate_pickle_for_n_packets n =", n) 157 | nb_flows = 0 158 | df_flows = pd.DataFrame() 159 | # dfs = [] 160 | self.classes = set() 161 | for f in files: 162 | # print("f=", f) 163 | df_new = pd.read_csv(f, 164 | names = [ 165 | 'flow_id', 166 | 'timestamp', 167 | 'iat', 168 | 'source', 169 | 'sport', 170 | 'dest', 171 | 'dport', 172 | 'protocol', 173 | 'length' 174 | ], 175 | header = 0 176 | ) 177 | print(n, f, df_new.shape) 178 | 179 | # drop DNS traffic 180 | df_new = df_new.drop(df_new[df_new['sport'] == 53].index) 181 | df_new = df_new.drop(df_new[df_new['dport'] == 53].index) 182 | 183 | found = False 184 | for _c in self.all_classes: 185 | if _c in f: 186 | found = True 187 | df_new['class'] = _c 188 | self.classes.add(_c) 189 | break 190 | if found == False: 191 | print("class not identified for", f) 192 | 193 | # extract flow and add statistical features 194 | for flow_id in df_new['flow_id'].unique(): 195 | nb_flows += 1 196 | _df_new = df_new[df_new['flow_id'] == flow_id].head(n = n) 197 | d = _df_new.head(n = 1) 198 | d['nb_packets'] = len(_df_new) #df_new[df_new['flow_id'] == flow_id]) 199 | c = d['class'].tolist() 200 | dport = d.dport.tolist() 201 | sport = d.sport.tolist() 202 | #print(d) 203 | _df = _df_new['iat'] 204 | d['min_iat'] = np.min(df_new[df_new['iat'] > 0]['iat']) # probably useless as most probably always 0 for the first packet 205 | d['max_iat'] = np.max(_df) 206 | d['sum_iat'] = np.sum(_df) 207 | d['mean_iat'] = np.mean(_df) 208 | d['median_iat'] = 
np.median(_df) 209 | d['std_iat'] = np.std(_df) 210 | d['1stQ_iat'] = np.quantile(_df, 0.25) 211 | d['3rdQ_iat'] = np.quantile(_df, 0.75) 212 | _a = list(_df) 213 | d['skew_iat'] = skew(_a) 214 | d['kurt_iat'] = kurtosis(_a) 215 | 216 | _df = _df_new['length'] 217 | d['min_length'] = np.min(_df) 218 | d['max_length'] = np.max(_df) 219 | d['sum_length'] = np.sum(_df) 220 | d['median_length'] = np.median(_df) 221 | d['mean_length'] = np.mean(_df) 222 | d['std_length'] = np.std(_df) 223 | d['1stQ_length'] = np.quantile(_df, 0.25) 224 | d['3rdQ_length'] = np.quantile(_df, 0.75) 225 | _a = list(_df) 226 | d['skew_length'] = skew(_a) 227 | # d['skew_length'] = skew(np.array(df_new.loc[df_new['flow_id'] == flow_id, 'length'])) 228 | d['kurt_length'] = kurtosis(_a) 229 | d['src'] = f 230 | # dfs.append(d) 231 | df_flows = pd.concat([d, df_flows]) 232 | # if nb_flows > 20: 233 | # break 234 | 235 | print("%d flows processed" % nb_flows) 236 | # Finish processing the data, create the train/tests split and save as pickle files 237 | df_flows = df_flows.fillna(0) 238 | 239 | self.classes = list(self.classes) 240 | self._hotencode_class(df_flows) 241 | 242 | filename = self.filename_prefix + "_" + str(n) + ".pickle" 243 | # filename = "cstnet_tls13_" + str(n) + ".pickle" 244 | self._generate_data_folds(df_flows, filename) 245 | 246 | ######################################## 247 | # Data Analysis 248 | ######################################## 249 | """ 250 | def __show_actual_and_predicted(self, X, y, y_pred, _class): 251 | print(self.classes) 252 | for _i in itertools.product(NB_PACKETS, self.filenames): 253 | i = (_i[0], _i[1], 0) 254 | print(i) 255 | df = X[i].copy() 256 | df['type'] = y[i] 257 | df['type_pred'] = y_pred[i] 258 | print(df.columns) 259 | a4_dims = (23.4, 16.54) 260 | fig, ax = plt.subplots(figsize = a4_dims) 261 | sns.lmplot( 262 | x = 'sum_iat', 263 | y = 'sum_length', 264 | data = df[df['type'] == _class], 265 | hue = 'type', 266 | fit_reg = False, 267 | height = 4, aspect = 5, 268 | # color = 'green', 269 | # scatter_kws = {'alpha': 0.3}, 270 | # ax = ax, 271 | legend = False, 272 | palette = 'viridis' 273 | ) 274 | #ax.set(xlabel='time_delta', ylabel='packet_size') 275 | ax.set(xlabel = 'duration', ylabel = 'sum_packet_size') 276 | plt.legend(title = 'Class', labels =self.classes) 277 | plt.savefig("cstnet_tls13_" + self.classes[_class] + "_"+ str(i[0]) + "_" + i[1]+".png", format = 'png') 278 | fig, ax2 = plt.subplots(figsize = a4_dims) 279 | sns.lmplot( 280 | x = 'sum_iat', 281 | y = 'sum_length', 282 | data = df[df['type_pred'] == _class], 283 | hue = 'type', 284 | fit_reg = False, 285 | height = 4, aspect = 5, 286 | # color = 'orange', 287 | # scatter_kws = {'alpha': 0.3}, 288 | legend = False, 289 | palette = 'viridis', 290 | # ax = ax2 291 | ) 292 | ax2.set(xlabel = 'duration', ylabel = 'sum_packet_size') 293 | plt.legend(title = 'Class', labels =self.classes) 294 | plt.savefig(self.filename_prefix + "_" + self.classes[_class] + "_pred_"+ str(i[0]) + "_" + i[1]+".png", format = 'png') 295 | """ 296 | ######################################## 297 | # Entry point 298 | ######################################## 299 | if __name__ == "__main__": 300 | parser = argparse.ArgumentParser( 301 | prog='cstnet_tls13_instant_messaging_traffic_classifier', 302 | description='Classify packets or flows from CTSNET TTLS1.3 dataset', 303 | epilog='' 304 | ) 305 | parser.add_argument('-p', '--nb_packets', action = 'append', type = int, required = True) #, default = [4, 8]) 306 | 
parser.add_argument('-c', '--classifier', action = 'append', type = str) #, default = ['rf']) 307 | parser.add_argument('-f', '--nb_folds', action = 'store', default = 12, type = int) 308 | parser.add_argument('-v', '--visualization', action = 'store_true', required = False, default = False) 309 | parser.add_argument('-r', '--report', action = 'store_true', required = False, default = False) 310 | parser.add_argument('-F', '--force_rf_classification', action = 'store_true', required = False, default = False) 311 | args = parser.parse_args(sys.argv[1:]) 312 | 313 | VISUALIZATION_ENABLED = False 314 | if args.visualization == True: 315 | VISUALIZATION_ENABLED = True 316 | 317 | RF_ENABLED = False 318 | GB_ENABLED = False 319 | XG_ENABLED = False 320 | for c in args.classifier: 321 | c = c.lower() 322 | if c == "rf": 323 | RF_ENABLED = True 324 | elif c == "gb": 325 | GB_ENABLED = True 326 | elif c == "xg": 327 | XG_ENABLED = True 328 | else: 329 | print("Unknown classifier", c) 330 | 331 | classifier = CstNetTls13Classifier( 332 | nb_folds = args.nb_folds, 333 | nb_packets_per_flow = args.nb_packets 334 | ) 335 | 336 | if args.force_rf_classification == True: 337 | classifier.force_rf_classification = True 338 | 339 | classifier.all_classes = [ 340 | "163.com", 341 | "chia.net", 342 | "github.com", 343 | "leetcode-cn.com", 344 | "qcloud.com", 345 | "toutiao.com", 346 | "51cto.com", 347 | "chinatax.gov.cn", 348 | "gitlab.com", 349 | "media.net", 350 | "qq.com", 351 | "twimg.com", 352 | "51.la", 353 | "cisco.com", 354 | "gmail.com", 355 | "mi.com", 356 | "researchgate.net", 357 | "twitter.com", 358 | "acm.org", 359 | "cloudflare.com", 360 | "goat.com", 361 | "microsoft.com", 362 | "runoob.com", 363 | "unity3d.com", 364 | "adobe.com", 365 | "cloudfront.net", 366 | "google.com", 367 | "mozilla.org", 368 | "sciencedirect.com", 369 | "v2ex.com", 370 | "alibaba.com", 371 | "cnblogs.com", 372 | "grammarly.com", 373 | "msn.com", 374 | "semanticscholar.org", 375 | "vivo.com.cn", 376 | "alicdn.com", 377 | "codepen.io", 378 | "gravatar.com", 379 | "naver.com", 380 | "sina.com.cn", 381 | "vk.com", 382 | "alipay.com", 383 | "crazyegg.com", 384 | "guancha.cn", 385 | "netflix.com", 386 | "smzdm.com", 387 | "vmware.com", 388 | "amap.com", 389 | "criteo.com", 390 | "huanqiu.com", 391 | "nike.com", 392 | "snapchat.com", 393 | "walmart.com", 394 | "amazonaws.com", 395 | "ctrip.com", 396 | "huawei.com", 397 | "notion.so", 398 | "sohu.com", 399 | "weibo.com", 400 | "ampproject.org", 401 | "dailymotion.com", 402 | "hubspot.com", 403 | "nvidia.com", 404 | "springer.com", 405 | "wikimedia.org", 406 | "apple.com", 407 | "deepl.com", 408 | "huya.com", 409 | "office.net", 410 | "spring.io", 411 | "wikipedia.org", 412 | "arxiv.org", 413 | "digitaloceanspaces.com", 414 | "ibm.com", 415 | "onlinedown.net", 416 | "squarespace.com", 417 | "wp.com", 418 | "asus.com", 419 | "duckduckgo.com", 420 | "icloud.com", 421 | "opera.com", 422 | "statcounter.com", 423 | "xiaomi.com", 424 | "atlassian.net", 425 | "eastday.com", 426 | "ieee.org", 427 | "oracle.com", 428 | "steampowered.com", 429 | "ximalaya.com", 430 | "azureedge.net", 431 | "eastmoney.com", 432 | "instagram.com", 433 | "outbrain.com", 434 | "taboola.com", 435 | "yahoo.com", 436 | "baidu.com", 437 | "elsevier.com", 438 | "iqiyi.com", 439 | "overleaf.com", 440 | "t.co", 441 | "yandex.ru", 442 | "bilibili.com", 443 | "facebook.com", 444 | "jb51.net", 445 | "paypal.com", 446 | "teads.tv", 447 | "youtube.com", 448 | "biligame.com", 449 | "feishu.cn", 450 | "jd.com", 451 | 
"pinduoduo.com", 452 | "thepaper.cn", 453 | "yy.com", 454 | "booking.com", 455 | "ggpht.com", 456 | "kugou.com", 457 | "python.org", 458 | "tiktok.com", 459 | "zhihu.com" 460 | ] 461 | 462 | non_needed_features = [ 463 | 'flow_id', 464 | 'class', 465 | 'source', 466 | 'dest', 467 | 'sport', 468 | 'dport', 469 | 'protocol', 470 | 'timestamp', 471 | # 'nb_packets', 472 | 'src', 473 | 'iat', 474 | 'direction', 475 | 'length' 476 | ] 477 | 478 | all_features_flows = [ 479 | 'min_iat', 480 | 'max_iat', 481 | 'sum_iat', 482 | 'mean_iat', 483 | 'median_iat', 484 | 'std_iat', 485 | '1stQ_iat', 486 | '3rdQ_iat', 487 | 'skew_iat', 488 | 'kurt_iat', 489 | 'min_length', 490 | 'max_length', 491 | 'sum_length', 492 | 'median_length', 493 | 'mean_length', 494 | 'std_length', 495 | '1stQ_length', 496 | '3rdQ_length', 497 | 'skew_length', 498 | 'kurt_length', 499 | 'nb_packets', 500 | # 'sport', 501 | # 'dport', 502 | # 'protocol', 503 | # 'direction' 504 | ] 505 | # best_features = [ 506 | # 'max_iat', 507 | # 'sum_iat', 508 | # 'mean_iat', 509 | # 'median_iat', 510 | # 'std_iat', 511 | # '1stQ_iat', 512 | # '3rdQ_iat', 513 | # 'skew_iat', 514 | # 'kurt_iat', 515 | # 'min_length', 516 | # 'max_length', 517 | # 'sum_length', 518 | # 'median_length', 519 | # 'mean_length', 520 | # 'std_length', 521 | # '1stQ_length', 522 | # '3rdQ_length', 523 | # 'skew_length', 524 | # 'kurt_length' 525 | # ] 526 | best_features = ['3rdQ_iat', 'std_iat', 'std_length', 'skew_iat', 'max_iat', 'sum_iat', 'mean_length', '1stQ_length', 'max_length', 'mean_iat', 'min_length', 'sum_length', 'median_length', '1stQ_iat', 'median_iat', '3rdQ_length', 'kurt_iat', 'kurt_length', 'nb_packets'] 527 | online_features=[ 528 | 'sum_iat', 529 | 'sum_length', 530 | 'max_length', 531 | 'mean_iat', 532 | 'max_iat', 533 | 'mean_length', 534 | 'min_length', 535 | 'min_iat' 536 | ] 537 | feats_flows = all_features_flows 538 | 539 | # Preprocessing 540 | if not classifier.data_prepared(): 541 | classifier.data_preparation() 542 | classifier.load_flows() 543 | else: 544 | classifier.load_flows() 545 | classifier.classes = classifier.all_classes 546 | # if not classifier.data_prepared(): 547 | # classifier.data_preparation() 548 | # else: 549 | # classifier.classes = classifier.all_classes 550 | 551 | # classifier.load_flows() 552 | classifier.cleanup_data(classifier.X_train_flows, 553 | classifier.y_train_flows, 554 | classifier.X_test_flows, 555 | classifier.y_test_flows, 556 | classifier.flow_ids, 557 | non_needed_features) 558 | # classifier._cleanup_data(non_needed_features) 559 | # classifier.X_train_flows_fitted, classifier.X_test_flows_fitted = classifier.preprocessing( 560 | # classifier.X_train_flows, 561 | # classifier.y_train_flows, 562 | # classifier.X_test_flows, 563 | # classifier.y_test_flows, 564 | # classifier.flow_ids, 565 | # feats_flows 566 | # ) 567 | classifier.X_train_flows_fitted = classifier.X_train_flows 568 | classifier.X_test_flows_fitted = classifier.X_test_flows 569 | # __correlation() 570 | # feats = all_features 571 | # analyze_models_for_npkts(10, all_features, "all_feats") 572 | 573 | if args.report == True: 574 | classifier._viz(distribution = 0, class_distribution = -1, nb_packets = -1, min_iat = -1, max_iat = -1) 575 | for n in classifier.nb_packets_per_flow: 576 | if n == 4: 577 | classifier._viz(distribution = -1, class_distribution = 0, nb_packets = 0, min_iat = -1, max_iat = -1) 578 | elif n == 8: 579 | classifier._viz(distribution = -1, class_distribution = 10, nb_packets = 0, min_iat = -1, max_iat = -1) 
580 | elif n == 600000: 581 | classifier._viz(distribution = -1, class_distribution = 11, nb_packets = 0, min_iat = 1, max_iat = -1) 582 | sys.exit(1) 583 | if VISUALIZATION_ENABLED: 584 | # f = classifier.filename_prefix + '_datasetflows_distribution.pickle' 585 | # if isfile(classifier.processed_data_output_dir + f): 586 | # print("Loading dataset from pickle file", f) 587 | # _df = classifier._load_pickle(f) 588 | # else: 589 | # print("Creating dataset") 590 | # _df = classifier._get_flows_with_all_packets() 591 | # classifier._pickle_dump(_df, f) 592 | # print("Dataset saved in file", f) 593 | # classifier._class_distribution(_df, classifier.filename_prefix + '_flows_distribution') 594 | # # classifier._nb_packets_distribution(_df, classifier.filename_prefix + "_flows_nb_packets_distribution" ) 595 | # classifier._distribution(_df, classifier.filename_prefix + "_flows_class_split" ) 596 | pkt = classifier.nb_packets_per_flow[0] 597 | fold = 0 598 | _i = pkt, fold 599 | _df1 = classifier.X_train_flows[_i].copy() 600 | # print("_df1", _df1.columns) 601 | # print("y_train", classifier.y_train_flows[_i]) 602 | # print(classifier.y_train_flows[_i][classifier.y_train_flows[_i].index.duplicated()]) 603 | _df1['type'] = classifier.y_train_flows[_i].values 604 | # print("_df1 type", _df1.columns) 605 | _df2 = classifier.X_test_flows[_i].copy() 606 | # print("_df2", _df1.columns) 607 | _df2['type'] = classifier.y_test_flows[_i].values 608 | _df = pd.concat([_df1, _df2]) 609 | _df.reset_index() 610 | print(_df.shape) 611 | print(_df['type'].value_counts().to_string()) 612 | # classifier._distribution(_df, classifier.filename_prefix + "_flows_class_split_" + str(pkt) + '_pkt', xticks = False) 613 | # classifier._class_distribution(_df, classifier.filename_prefix + '_flows_distribution_' + str(pkt) + '_pkt', xticks = False) 614 | classifier._nb_packets_distribution(_df, classifier.filename_prefix + "_flows_nb_packets_distribution_" + str(pkt) + '_pkt', xticks = False) 615 | # classifier._min_iat_distribution(_df, classifier.filename_prefix + "_flows_min_iat_distribution_" + str(pkt) + '_pkt', xticks = False) 616 | 617 | if RF_ENABLED: 618 | print("==== RandomForest =====") 619 | """ 620 | classifier.X_train_flows_fitted, classifier.X_test_flows_fitted = classifier.preprocessing( 621 | classifier.X_train_flows, 622 | classifier.y_train_flows, 623 | classifier.X_test_flows, 624 | classifier.y_test_flows, 625 | classifier.flow_ids, 626 | feats_flows 627 | ) 628 | classifier.X_train_flows_fitted = classifier.X_train_flows 629 | classifier.X_test_flows_fitted = classifier.X_test_flows 630 | """ 631 | rf_regr_flows, rf_y_train_flows_predicted, rf_y_test_flows_predicted = classifier.RF_predict( 632 | classifier.X_train_flows_fitted, 633 | classifier.y_train_flows, 634 | classifier.X_test_flows_fitted, 635 | classifier.y_test_flows, 636 | ) 637 | rf_cm_dict_flows, output = classifier.confusion_matrix(rf_regr_flows, 638 | classifier.y_test_flows, 639 | rf_y_test_flows_predicted, 640 | classifier.flow_ids, 641 | "rf" 642 | ) 643 | print(output) 644 | rf_f1_scores_flows, output = classifier.get_F1_score(rf_cm_dict_flows, 645 | classifier.y_test_flows, 646 | rf_y_test_flows_predicted, 647 | classifier.flow_ids, 648 | "rf_flows") 649 | print(output) 650 | avg_scores, output = classifier.avg_f1_scores(rf_f1_scores_flows, classifier.flow_ids) 651 | print(output) 652 | 653 | if GB_ENABLED: 654 | gb_regr, gb_y_train_predicted, gb_y_test_predicted = classifier.GBoost_predict(feats_flows, df_score) 655 | gb_cm_dict 
= classifier.confusion_matrix(gb_regr, gb_y_test_predicted, False) 656 | gb_f1_scores = classifier.get_F1_score(df_score, gb_cm_dict, y_test, gb_y_test_predicted, "gb", False) 657 | classifier.avg_f1_scores(gb_f1_scores_flows, classifier.flow_ids_without_folds) 658 | classifier.avg_f1_scores(gb_f1_scores) 659 | 660 | if XG_ENABLED: 661 | print("==== XGBoost =====") 662 | xg_regr, xg_y_train_predicted, xg_y_test_flows_predicted = classifier.XGBoost_predict( 663 | classifier.X_train_flows_fitted, 664 | classifier.y_train_flows, 665 | classifier.X_test_flows_fitted, 666 | classifier.y_test_flows 667 | ) 668 | xg_cm_dict_flows, output = classifier.confusion_matrix(xg_regr, 669 | classifier.y_test_flows, 670 | xg_y_test_flows_predicted, 671 | classifier.flow_ids, 672 | "xg" 673 | ) 674 | print(output) 675 | 676 | xg_f1_scores_flows, output = classifier.get_F1_score( 677 | xg_cm_dict_flows, 678 | classifier.y_test_flows, 679 | xg_y_test_flows_predicted, 680 | classifier.flow_ids, 681 | "xg_flows") 682 | print(output) 683 | avg_scores, output = classifier.avg_f1_scores(xg_f1_scores_flows, classifier.flow_ids) 684 | print(output) 685 | 686 | print(classifier.classification_results) 687 | if RF_ENABLED or GB_ENABLED or XG_ENABLED: 688 | classifier.save_results() -------------------------------------------------------------------------------- /Offline_ETC/data_preparation/pcap2csv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pcapfile=$1 3 | 4 | tshark -o gui.column.format:"SP,%uS,DP,%uD" -r "${pcapfile}" -T fields -E header=y -E separator=, -e frame.number -e frame.time_epoch -e frame.time_delta -e ip.src -e _ws.col.SP -e ip.dst -e _ws.col.DP -e ip.proto -e frame.len 5 | -------------------------------------------------------------------------------- /Offline_ETC/data_preparation/pkts2flows.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | from os import listdir 5 | from os.path import join 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | def __pkts2flow(path, outpath, filename, first_flow_id): 11 | print("opening", join(path, filename)) 12 | df = pd.read_csv(join(path, filename), names = ["packet_id", "timestamp", "iat", "src", "psrc", "dst", "pdst", "protocol", "length"], dtype={'packet_id': 'int', 'timestamp': 'float', 'iat': 'float', 'src': 'str', "psrc": 'int', 'dst':'str', 'pdst': 'int', 'protocol': 'int', 'length': 'int'},header = 0) 13 | df['protocol'].replace('', np.nan, inplace = True) 14 | df = df.dropna(axis = 1) 15 | df['flow_id'] = df.groupby(['src', 'psrc', 'dst', 'pdst', 'protocol']).ngroup() 16 | df['flow_id'] = df['flow_id'].astype('int') 17 | 18 | print(df.shape) 19 | 20 | # update flow_id to consecutive values 21 | df = df.sort_values(by = ['flow_id']) 22 | df.flow_id = df.flow_id.ne(df.flow_id.shift()).cumsum().add(first_flow_id).astype('int') 23 | 24 | r = df['flow_id'].max() 25 | df.to_csv(join(outpath, filename), index = False) 26 | return r 27 | 28 | def main(): 29 | # change inputdir to the full name of the directory where the dataset CSV files are stored. 
30 | paths = ["inputdir"] 31 | first_flow_id = 0 32 | # change outputdir to the full name of the directory where you want to store the new CSV files 33 | output_path = "outputdir" 34 | 35 | for path in paths: 36 | for f in listdir(path): 37 | if "csv" in f: 38 | first_flow_id = __pkts2flow(path, output_path, f, first_flow_id) 39 | 40 | if __name__ == "__main__": 41 | main() -------------------------------------------------------------------------------- /Offline_ETC/netflow_quic_traffic_classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import itertools 6 | from os import listdir 7 | from os.path import isfile, join 8 | import sys 9 | import time 10 | 11 | import pandas as pd 12 | 13 | import numpy as np 14 | 15 | from scipy.stats import kurtosis, skew 16 | 17 | from sklearn.metrics import classification_report, f1_score, confusion_matrix 18 | 19 | import seaborn as sns 20 | import matplotlib.pyplot as plt 21 | 22 | from encrypted_traffic_classification import EncryptedTrafficClassifier, EncryptedTrafficClassifierIterator 23 | 24 | filename_patterns = { 25 | "youtube_": "STREAMING", 26 | "Google_Play_Music_": "Google_Play_Music", 27 | "GoogleHangout_VoIP_": "GoogleHangout_VoIP", 28 | "GoogleHangout_Chat_": "GoogleHangout_Chat", 29 | "FileTransfer_": "FileTransfer", 30 | } 31 | 32 | ######################################## 33 | # Data preparation: convert RAW data 34 | ######################################## 35 | class NetFlowQUICClassifier(EncryptedTrafficClassifier): 36 | def __init__(self, nb_folds, nb_packets_per_flow): 37 | super().__init__( 38 | nb_folds= nb_folds, 39 | nb_packets_per_flow = nb_packets_per_flow, 40 | filename_prefix = "netflow_quic", 41 | processed_data_output_dir = "netflow_quic_output/", 42 | data_dir = "data/Netflow-QUIC/" 43 | ) 44 | 45 | pools = [tuple(pool) for pool in [self.nb_packets_per_flow, range(self.nb_folds)]] 46 | result = [[]] 47 | for pool in pools: 48 | result = [x+[y] for x in result for y in pool] 49 | self.flow_ids = result 50 | 51 | pools = [tuple(pool) for pool in [self.nb_packets_per_flow]] 52 | result = [[]] 53 | for pool in pools: 54 | result = [x+[y] for x in result for y in pool] 55 | self.flow_ids_without_folds = result 56 | 57 | pools = [tuple(pool) for pool in [range(self.nb_folds)]] 58 | result = [[]] 59 | for pool in pools: 60 | result = [x+[y] for x in result for y in pool] 61 | self.packet_ids = result 62 | 63 | ######################################## 64 | # Preprocessing 65 | ######################################## 66 | def _get_flows_with_all_packets(self): 67 | print("_get_flows_with_all_packets") 68 | df_flows = {} 69 | start_time = time.time() 70 | nb_flows = 0 71 | df_flows = pd.DataFrame() 72 | self.classes = set() 73 | files = [f for f in listdir(self.data_dir) if isfile(join(self.data_dir, f))] 74 | for _i in range(len(files)): 75 | f = self.data_dir + "/" + files[_i] 76 | print("f=", f) 77 | start_time = time.time() 78 | df_new = pd.read_csv(f, 79 | names = [ 80 | 'packet_id', 81 | 'timestamp', 82 | 'iat', 83 | 'source', 84 | 'sport', 85 | 'dest', 86 | 'dport', 87 | 'protocol', 88 | 'length', 89 | 'flow_id' 90 | ], 91 | dtype = { 92 | 'flow_id': 'Int32', 93 | 'timestamp': np.float64, 94 | 'iat': np.float64, 95 | 'source':str, 96 | 'sport': 'Int32', 97 | 'dest': str, 98 | 'dport': 'Int32', 99 | 'protocol': 'Int32', 100 | 'length': 'Int64', 101 | 'flow_id': 'Int64' 102 | }, 103 | header = 0 104 | 
) 105 | print(f, df_new.shape) 106 | 107 | # drop DNS traffic 108 | df_new = df_new.drop(df_new[df_new['sport'] == 53].index) 109 | df_new = df_new.drop(df_new[df_new['dport'] == 53].index) 110 | 111 | found = False 112 | for k, v in filename_patterns.items(): 113 | if k in f: 114 | df_new['class'] = v 115 | self.classes.add(v) 116 | found = True 117 | break 118 | if found == False: 119 | print("Type for file", f, "not found") 120 | sys.exit(1) 121 | dfs = [] 122 | # extract flow and add statistical features 123 | for flow_id in df_new['flow_id'].unique(): 124 | nb_flows += 1 125 | df_new = df_new.sort_values(by = ['packet_id']) 126 | d = df_new[df_new['flow_id'] == flow_id].head(n = 1) 127 | d['nb_packets'] = len(df_new[df_new['flow_id'] == flow_id]) 128 | c = d['class'].tolist() 129 | dport = d.dport.tolist() 130 | sport = d.sport.tolist() 131 | #print(d) 132 | _df = df_new.loc[df_new['flow_id'] == flow_id, 'iat'] 133 | d['sum_iat'] = np.sum(_df) 134 | 135 | _df = df_new.loc[df_new['flow_id'] == flow_id, 'length'] 136 | d['sum_length'] = np.sum(_df) 137 | dfs.append(d) 138 | _d = pd.concat(dfs) 139 | df_flows = pd.concat([_d, df_flows]) 140 | 141 | print(f, "processed in ", time.time() - start_time, "seconds.") 142 | 143 | # uncomment following line to stop after the first file during debug 144 | # break 145 | 146 | print(" processing took ", time.time() - start_time, "seconds.") 147 | print("%d flows processed" % nb_flows) 148 | # Finish processing the data, create the train/tests split and save as pickle files 149 | df_flows = df_flows.fillna(0) 150 | 151 | self.classes = list(self.classes) 152 | self._hotencode_class(df_flows) 153 | return df_flows 154 | 155 | def data_preparation(self): 156 | print("data_preparation") 157 | import warnings 158 | warnings.filterwarnings("ignore") 159 | 160 | df_flows = {} 161 | files = [f for f in listdir(self.data_dir) if isfile(join(self.data_dir, f))] 162 | for _i in range(len(files)): 163 | files[_i] = self.data_dir + "/" + files[_i] 164 | 165 | # print(files) 166 | for i in self.nb_packets_per_flow: 167 | self.__generate_pickle_for_n_packets(i, files) 168 | 169 | def __generate_pickle_for_n_packets(self, n, files): 170 | print("__generate_pickle_for_n_packets n =", n) 171 | 172 | for fold in range(self.nb_folds): 173 | if self._test_data_prepared((n, fold)): 174 | print("pickle files detected for ", n, "packets") 175 | return 176 | nb_flows = 0 177 | df_flows = pd.DataFrame() 178 | self.classes = set() 179 | for f in files: 180 | # print("f=", f) 181 | start_time = time.time() 182 | df_new = pd.read_csv(f, 183 | names = [ 184 | 'packet_id', 185 | 'timestamp', 186 | 'iat', 187 | 'source', 188 | 'sport', 189 | 'dest', 190 | 'dport', 191 | 'protocol', 192 | 'length', 193 | 'flow_id' 194 | ], 195 | dtype = { 196 | 'flow_id': 'Int32', 197 | 'timestamp': np.float64, 198 | 'iat': np.float64, 199 | 'source':str, 200 | 'sport': 'Int32', 201 | 'dest': str, 202 | 'dport': 'Int32', 203 | 'protocol': 'Int32', 204 | 'length': 'Int64', 205 | 'flow_id': 'Int64' 206 | }, 207 | header = 0 208 | ) 209 | print(n, f, df_new.shape) 210 | 211 | # drop DNS traffic 212 | df_new = df_new.drop(df_new[df_new['sport'] == 53].index) 213 | df_new = df_new.drop(df_new[df_new['dport'] == 53].index) 214 | 215 | found = False 216 | for k, v in filename_patterns.items(): 217 | if k in f: 218 | df_new['class'] = v 219 | self.classes.add(v) 220 | found = True 221 | break 222 | if found == False: 223 | print("Type for file", f, "not found") 224 | sys.exit(1) 225 | 226 | # extract flow 
and add statistical features 227 | for flow_id in df_new['flow_id'].unique(): 228 | nb_flows += 1 229 | df_new = df_new.sort_values(by = ['packet_id']) 230 | _df_new = df_new[df_new['flow_id'] == flow_id].head(n = n) 231 | d = _df_new.head(n = 1) 232 | d['nb_packets'] = len(_df_new) #df_new[df_new['flow_id'] == flow_id]) 233 | c = d['class'].tolist() 234 | dport = d.dport.tolist() 235 | sport = d.sport.tolist() 236 | #print(d) 237 | _df = _df_new['iat'] 238 | d['min_iat'] = 0 239 | d['min_iat'] = np.min(_df_new[_df_new['iat'] > 0]['iat']) 240 | if len(d[d['iat'] < 0]) > 0: 241 | print(_df_new, "has negative iat") 242 | continue 243 | if len(d[d['iat'] > 120]) > 0: 244 | print(_df_new, "has iat > 120") 245 | continue 246 | d['max_iat'] = np.max(_df) 247 | d['sum_iat'] = np.sum(_df) 248 | d['mean_iat'] = np.mean(_df) 249 | d['median_iat'] = np.median(_df) 250 | d['std_iat'] = np.std(_df) 251 | d['1stQ_iat'] = np.quantile(_df, 0.25) 252 | d['3rdQ_iat'] = np.quantile(_df, 0.75) 253 | _a = list(_df) 254 | d['skew_iat'] = skew(_a) 255 | d['kurt_iat'] = kurtosis(_a) 256 | # d['skew_iat'] = skew(np.array(df_new.loc[df_new['flow_id'] == flow_id, 'iat'])) 257 | # d['kurt_iat'] = kurtosis(np.array(df_new.loc[df_new['flow_id'] == flow_id, 'iat'])) 258 | 259 | _df = _df_new['length'] 260 | d['min_length'] = np.min(_df) 261 | d['max_length'] = np.max(_df) 262 | d['sum_length'] = np.sum(_df) 263 | d['median_length'] = np.median(_df) 264 | d['mean_length'] = np.mean(_df) 265 | d['std_length'] = np.std(_df) 266 | d['1stQ_length'] = np.quantile(_df, 0.25) 267 | d['3rdQ_length'] = np.quantile(_df, 0.75) 268 | _a = list(_df) 269 | d['skew_length'] = skew(_a) 270 | # d['skew_length'] = skew(np.array(df_new.loc[df_new['flow_id'] == flow_id, 'length'])) 271 | d['kurt_length'] = kurtosis(_a) 272 | # d['kurt_length'] = kurtosis(np.array(df_new.loc[df_new['flow_id'] == flow_id, 'length'])) 273 | 274 | d['src'] = f 275 | df_flows = pd.concat([d, df_flows]) 276 | 277 | print(f, "processed in ", time.time() - start_time, "seconds.") 278 | 279 | # uncomment following line to stop after the first file during debug 280 | # break 281 | 282 | print("%d flows processed" % nb_flows) 283 | # Finish processing the data, create the train/tests split and save as pickle files 284 | df_flows = df_flows.fillna(0) 285 | 286 | self.classes = list(self.classes) 287 | self._hotencode_class(df_flows) 288 | 289 | filename = self.filename_prefix + "_" + str(n) + ".pickle" 290 | self._generate_data_folds(df_flows, filename) 291 | 292 | ######################################## 293 | # Data Analysis 294 | ######################################## 295 | # def __show_actual_and_predicted(self, X, y, y_pred, _class): 296 | # print(self.classes) 297 | # for _i in itertools.product(NB_PACKETS, self.filenames): 298 | # i = (_i[0], _i[1], 0) 299 | # print(i) 300 | # df = X[i].copy() 301 | # df['type'] = y[i] 302 | # df['type_pred'] = y_pred[i] 303 | # print(df.columns) 304 | # a4_dims = (23.4, 16.54) 305 | # fig, ax = plt.subplots(figsize = a4_dims) 306 | # sns.lmplot( 307 | # x = 'sum_iat', 308 | # y = 'sum_length', 309 | # data = df[df['type'] == _class], 310 | # hue = 'type', 311 | # fit_reg = False, 312 | # height = 4, aspect = 5, 313 | # # color = 'green', 314 | # # scatter_kws = {'alpha': 0.3}, 315 | # # ax = ax, 316 | # legend = False, 317 | # palette = 'viridis' 318 | # ) 319 | # #ax.set(xlabel='time_delta', ylabel='packet_size') 320 | # ax.set(xlabel = 'duration', ylabel = 'sum_packet_size') 321 | # plt.legend(title = 'Class', labels 
=self.classes) 322 | # plt.savefig(self.filename_prefix + "_" + self.classes[_class] + "_"+ str(i[0]) + "_" + i[1] + ".png", format = 'png') 323 | # fig, ax2 = plt.subplots(figsize = a4_dims) 324 | # sns.lmplot( 325 | # x = 'sum_iat', 326 | # y = 'sum_length', 327 | # data = df[df['type_pred'] == _class], 328 | # hue = 'type', 329 | # fit_reg = False, 330 | # height = 4, aspect = 5, 331 | # # color = 'orange', 332 | # # scatter_kws = {'alpha': 0.3}, 333 | # legend = False, 334 | # palette = 'viridis', 335 | # # ax = ax2 336 | # ) 337 | # ax2.set(xlabel = 'duration', ylabel = 'sum_packet_size') 338 | # plt.legend(title = 'Class', labels =self.classes) 339 | # plt.savefig(self.filename_prefix + "_" + self.classes[_class] + "_pred_"+ str(i[0]) + "_" + i[1]+".png", format = 'png') 340 | 341 | # ######################################## 342 | # # Akem's methods 343 | # ######################################## 344 | # # Feature Importance 345 | # """ 346 | # Function to Fit model based on optimal values of depth and number of estimators and use it 347 | # to compute feature importance for all the features. 348 | # """ 349 | # def get_feature_importance(depth, n_tree, max_leaf, X_train, y_train): 350 | # from sklearn.ensemble import RandomForestClassifier 351 | 352 | # # rf_opt = RandomForestClassifier(max_depth = depth, n_estimators = n_tree, random_state=42, bootstrap=False) 353 | # rf_opt = RandomForestClassifier(max_depth = depth, n_estimators = n_tree, max_leaf_nodes=max_leaf, random_state=42, bootstrap=False) 354 | # rf_opt.fit(X_train, y_train) 355 | # feature_importance = pd.DataFrame(rf_opt.feature_importances_) 356 | # feature_importance.index = X_train.columns 357 | # feature_importance = feature_importance.sort_values(by=list(feature_importance.columns),axis=0,ascending=False) 358 | 359 | # return feature_importance 360 | 361 | 362 | # """ 363 | # Function to Fit model based on optimal values of depth and number of estimators and feature importance 364 | # to find the fewest possible features to exceed the previously attained score with all selected features 365 | # """ 366 | # def get_fewest_features(depth, n_tree, max_leaf, importance): 367 | # sorted_feature_names = importance.index 368 | # # print('sorted_feature_names: ', sorted_feature_names) 369 | # features = [] 370 | # for f in range(1,len(sorted_feature_names)+1): 371 | # features.append(sorted_feature_names[0:f]) 372 | # # print('features:', features) 373 | # return features 374 | 375 | 376 | # def get_result_scores(classes, cl_report): 377 | # precision=[] 378 | # recall=[] 379 | # f1_score=[] 380 | # supports=[] 381 | # for a_class in classes: 382 | # precision.append(cl_report[a_class]['precision']) 383 | # recall.append(cl_report[a_class]['recall']) 384 | # f1_score.append(cl_report[a_class]['f1-score']) 385 | # supports.append(cl_report[a_class]['support']) 386 | # return precision, recall, f1_score, supports 387 | 388 | 389 | # def get_scores(classes, depth, n_tree, feats, max_leaf, X_train, y_train, X_test, y_test): 390 | # from sklearn.ensemble import RandomForestClassifier 391 | # model = RandomForestClassifier(max_depth=depth, n_estimators = n_tree, max_leaf_nodes=max_leaf, n_jobs=4, 392 | # random_state=42, bootstrap=False) 393 | 394 | # model.fit(X_train[feats], y_train) 395 | # y_pred = model.predict(X_test[feats]) 396 | 397 | # class_report = classification_report(y_test, y_pred, target_names=classes, output_dict = True) 398 | 399 | # accurac = model.score(X_test[feats], y_test) 400 | # macro_score = 
class_report['macro avg']['f1-score'] 401 | # weighted_score = class_report['weighted avg']['f1-score'] 402 | 403 | # return model, class_report, macro_score, weighted_score, y_pred, accurac 404 | 405 | 406 | # def get_x_y(Dataset, classes, feats): 407 | # Dataset = Dataset[Dataset["Label"].isin(classes)] 408 | # X = Dataset[feats] 409 | # y = Dataset['Label'].replace(classes, range(len(classes))) 410 | # # y = Dataset.columns[-1].replace(classes, range(len(classes))) 411 | 412 | # return X, y 413 | 414 | # def analyze_models(classes, model_type, depths, n_trees, X_train, y_train, X_test, y_test, max_leaf, filename_out): 415 | 416 | # with open(filename_out, "w") as res_file: 417 | # print('depth;tree;n_feat;Macro_F1;Weighted_F1;Accuracy;feats;c_report', file=res_file) 418 | # if model_type == 'RF': 419 | # # FOR EACH (depth, n_tree, feat) 420 | # for depth in depths: 421 | # for n_tree in n_trees: 422 | # # get feature orders to use 423 | # importance = get_feature_importance(depth, n_tree, max_leaf, X_train, y_train) 424 | 425 | # m_feats = get_fewest_features(depth, n_tree, max_leaf, importance) 426 | # for feats in m_feats: 427 | # # Get the scores with the given (depth, n_tree, feat) 428 | # model, c_report, macro_f1, weight_f1, y_pred, accuracs = get_scores(classes, depth, n_tree, feats, max_leaf, X_train, y_train, X_test, y_test) 429 | 430 | # print(str(depth)+';'+str(n_tree)+';'+str(len(feats))+';'+str(macro_f1)+';'+str(weight_f1)+';'+str(accuracs)+';'+str(list(feats))+';'+str(c_report), file=res_file) 431 | # print("Analysis Complete. Check output file.") 432 | # return [] 433 | 434 | # # N = number of packets in flows, feats = array of feature names to use, feat_name = string to add to output file name 435 | # def analyze_models_for_npkts(self, N, feats, feat_name): 436 | # i = (N, self.filenames[0], 0) 437 | # print("Number of packets per flow: ", N) 438 | 439 | # X_trains, y_trains = X_train[i][feats], y_train[i] 440 | # X_tests, y_tests = X_test[i][feats], y_test[i] 441 | 442 | # results_file = "Models_" + feat_name + "_" + str(N) + "_pkts_.csv" 443 | # analyze_models(self.classes, "RF", range(7, 20, 1), range(1, 8, 2), X_trains, y_trains, X_tests, y_tests, 500, results_file) 444 | 445 | # results = pd.read_csv(results_file, sep=';') 446 | # results = results.sort_values(by=['Weighted_F1','Macro_F1'],ascending=False) 447 | # print(results.head(10)) 448 | # print("******") 449 | # print(results.head(1)['c_report'].values) 450 | 451 | ######################################## 452 | # GBoost 453 | ######################################## 454 | def GBoost_predict(self, feats): 455 | print("GBoost_predict") 456 | from sklearn.ensemble import GradientBoostingClassifier 457 | gb_model = {} 458 | 459 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 460 | gb_model[i] = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state = 42) 461 | 462 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 463 | print("==",i,"==") 464 | try: 465 | gb_model[i].fit(X_train[i][feats], y_train[i]) 466 | except ValueError as e: 467 | print(e) 468 | pass 469 | 470 | gb_y_train_predicted = {} 471 | gb_y_test_predicted = {} 472 | gb_train_score = {} 473 | gb_test_score = {} 474 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 475 | print("==",i,"==") 476 | gb_y_train_predicted[i] = gb_model[i].predict(X_train[i][feats]) 477 | gb_y_test_predicted[i] = gb_model[i].predict(X_test[i][feats]) 478 | gb_train_score[i] = 
gb_model[i].score(X_train[i][feats], y_train[i]) 479 | gb_test_score[i] = gb_model[i].score(X_test[i][feats], y_test[i]) 480 | 481 | self._get_scores_from_models(gb_model, y_test, gb_y_test_predicted, feats) 482 | 483 | gb_cm_dict = {} 484 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 485 | print("==",i,"==") 486 | gb_cm_dict[i] = confusion_matrix(y_test[i], gb_y_test_predicted[i].astype(int)) 487 | print(gb_cm_dict[i]) 488 | 489 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 490 | pkt, _ = i 491 | classification_results.loc[classification_results['nb_packets'] == pkt, 'gb_train_score'] = gb_train_score[i] 492 | classification_results.loc[classification_results['nb_packets'] == pkt, 'gb_test_score'] = gb_test_score[i] 493 | 494 | return gb_model, gb_y_train_predicted, gb_y_test_predicted 495 | 496 | 497 | ######################################## 498 | # Entry point 499 | ######################################## 500 | if __name__ == "__main__": 501 | parser = argparse.ArgumentParser( 502 | prog='netflow_quic_traffic_classifier', 503 | description='Classify packets or flows from NetFlow QUIC dataset', 504 | epilog='' 505 | ) 506 | parser.add_argument('-p', '--nb_packets', action = 'append', type = int, required = True) #, default = [4, 8]) 507 | parser.add_argument('-c', '--classifier', action = 'append', type = str) #, default = ['rf']) 508 | parser.add_argument('-f', '--nb_folds', action = 'store', default = 12, type = int) 509 | parser.add_argument('-v', '--visualization', action = 'store_true', required = False, default = False) 510 | parser.add_argument('-r', '--report', action = 'store_true', required = False, default = False) 511 | parser.add_argument('-F', '--force_rf_classification', action = 'store_true', required = False, default = False) 512 | args = parser.parse_args(sys.argv[1:]) 513 | 514 | VISUALIZATION_ENABLED = False 515 | if args.visualization == True: 516 | VISUALIZATION_ENABLED = True 517 | 518 | RF_ENABLED = False 519 | GB_ENABLED = False 520 | XG_ENABLED = False 521 | for c in args.classifier: 522 | c = c.lower() 523 | if c == "rf": 524 | RF_ENABLED = True 525 | elif c == "gb": 526 | GB_ENABLED = True 527 | elif c == "xg": 528 | XG_ENABLED = True 529 | else: 530 | print("Unknown classifier", c) 531 | 532 | classifier = NetFlowQUICClassifier( 533 | nb_folds = args.nb_folds, 534 | nb_packets_per_flow = args.nb_packets 535 | ) 536 | 537 | if args.force_rf_classification == True: 538 | classifier.force_rf_classification = True 539 | 540 | classifier.all_classes = [ 541 | "youtube", 542 | "Google_Play_Music", 543 | "GoogleHangout_VoIP", 544 | "GoogleHangout_Chat", 545 | "FileTransfer", 546 | ] 547 | 548 | non_needed_features = [ 549 | 'flow_id', 550 | 'class', 551 | 'source', 552 | 'dest', 553 | 'sport', 554 | 'dport', 555 | 'protocol', 556 | 'timestamp', 557 | # 'nb_packets', 558 | 'src', 559 | 'iat', 560 | 'direction', 561 | 'length', 562 | 'packet_id' 563 | ] 564 | 565 | all_features_flows = [ 566 | 'min_iat', 567 | 'max_iat', 568 | 'sum_iat', 569 | 'mean_iat', 570 | 'median_iat', 571 | 'std_iat', 572 | '1stQ_iat', 573 | '3rdQ_iat', 574 | 'skew_iat', 575 | 'kurt_iat', 576 | 'min_length', 577 | 'max_length', 578 | 'sum_length', 579 | 'median_length', 580 | 'mean_length', 581 | 'std_length', 582 | '1stQ_length', 583 | '3rdQ_length', 584 | 'skew_length', 585 | 'kurt_length', 586 | 'nb_packets', 587 | # 'sport', 588 | # 'dport', 589 | # 'protocol', 590 | # 'direction' 591 | ] 592 | # best_features = [ 593 | # 'max_iat', 594 | # 'sum_iat', 595 | 
# 'mean_iat', 596 | # 'median_iat', 597 | # 'std_iat', 598 | # '1stQ_iat', 599 | # '3rdQ_iat', 600 | # 'skew_iat', 601 | # 'kurt_iat', 602 | # 'min_length', 603 | # 'max_length', 604 | # 'sum_length', 605 | # 'median_length', 606 | # 'mean_length', 607 | # 'std_length', 608 | # '1stQ_length', 609 | # '3rdQ_length', 610 | # 'skew_length', 611 | # 'kurt_length' 612 | # ] 613 | best_features = ['3rdQ_iat', 'std_iat', 'std_length', 'skew_iat', 'max_iat', 'sum_iat', 'mean_length', '1stQ_length', 'max_length', 'mean_iat', 'min_length', 'sum_length', 'median_length', '1stQ_iat', 'median_iat', '3rdQ_length', 'kurt_iat', 'kurt_length', 'nb_packets'] 614 | online_features=[ 615 | 'sum_iat', 616 | 'sum_length', 617 | 'max_length', 618 | 'mean_iat', 619 | 'max_iat', 620 | 'mean_length', 621 | 'min_length', 622 | 'min_iat' 623 | ] 624 | feats_flows = all_features_flows 625 | 626 | # Preprocessing 627 | if not classifier.data_prepared(): 628 | classifier.data_preparation() 629 | classifier.load_flows() 630 | else: 631 | classifier.load_flows() 632 | #_c = classifier.y_train_flows[(classifier.nb_packets_per_flow[0], 0)].unique() 633 | classifier.classes = [-1 for _ in classifier.all_classes] 634 | _Xy = classifier.X_train_flows[(classifier.nb_packets_per_flow[0], 0)].copy() 635 | _Xy['y'] = classifier.y_train_flows[(classifier.nb_packets_per_flow[0], 0)] 636 | for index, row in _Xy.iterrows(): 637 | for _i in range(len(classifier.all_classes)): 638 | if classifier.all_classes[_i] in row['src']: 639 | classifier.classes[row['y']] = classifier.all_classes[_i] 640 | break 641 | #classifier.classes[i] .append(classifier.all_classes[i]) 642 | if -1 not in classifier.classes: 643 | break 644 | print("classes =",classifier.classes) 645 | 646 | #classifier.classes = [] 647 | #for i in _c: 648 | # classifier.classes.append(classifier.all_classes[i]) 649 | 650 | classifier.cleanup_data(classifier.X_train_flows, 651 | classifier.y_train_flows, 652 | classifier.X_test_flows, 653 | classifier.y_test_flows, 654 | classifier.flow_ids, 655 | non_needed_features) 656 | 657 | # scaling during processing make results worse ! 
658 | # classifier.X_train_flows_fitted, classifier.X_test_flows_fitted = classifier.preprocessing( 659 | # classifier.X_train_flows, 660 | # classifier.y_train_flows, 661 | # classifier.X_test_flows, 662 | # classifier.y_test_flows, 663 | # classifier.flow_ids, 664 | # feats_flows 665 | # ) 666 | classifier.X_train_flows_fitted = classifier.X_train_flows 667 | classifier.X_test_flows_fitted = classifier.X_test_flows 668 | # for i in EncryptedTrafficClassifierIterator(classifier.flow_ids): 669 | # # print(classifier.X_train_flows[i].columns) 670 | # # print(classifier.X_train_flows[i].index) 671 | # # print(classifier.y_train_flows[i].index) 672 | # classifier.X_train_flows[i] = classifier.X_train_flows[i][classifier.X_train_flows[i]['sum_iat'] < 120] 673 | # classifier.y_train_flows[i] = classifier.y_train_flows[i][classifier.X_train_flows[i].index] 674 | # classifier.X_test_flows[i] = classifier.X_test_flows[i][classifier.X_test_flows[i]['sum_iat'] < 120] 675 | # classifier.y_test_flows[i] = classifier.y_test_flows[i][classifier.X_test_flows[i].index] 676 | 677 | # __correlation() 678 | # analyze_models_for_npkts(10, all_features, "all_feats") 679 | if args.report == True: 680 | __class_names = { 681 | "youtube": "YouTube", 682 | "Google_Play_Music": "Music", 683 | "GoogleHangout_VoIP": "VoIP", 684 | "GoogleHangout_Chat": "Chat", 685 | "FileTransfer": "FileTransfer", 686 | } 687 | for _c in range(len(classifier.classes)): 688 | classifier.classes[_c] = __class_names[classifier.classes[_c]] 689 | classifier._viz(distribution = 0, class_distribution = -1, nb_packets = -1, min_iat = -1, max_iat = -1) 690 | for n in classifier.nb_packets_per_flow: 691 | if n == 4: 692 | classifier._viz(distribution = -1, class_distribution = 10, nb_packets = 0, min_iat = -1, max_iat = -1) 693 | elif n == 8: 694 | classifier._viz(distribution = -1, class_distribution = 10, nb_packets = 0, min_iat = -1, max_iat = -1) 695 | elif n == 600000: 696 | classifier._viz(distribution = -1, class_distribution = 11, nb_packets = 1, min_iat = 1, max_iat = -1) 697 | sys.exit(1) 698 | if VISUALIZATION_ENABLED: 699 | # classifier.classes= ["Chat", "Music", "YouTube", "VoIP", "FileTransfer"] 700 | # classifier._viz(distribution = 0, class_distribution = 10, nb_packets = -1, min_iat = 1, max_iat = -1) 701 | # sys.exit(1) 702 | 703 | # f = classifier.filename_prefix + '_datasetflows_distribution.pickle' 704 | # if isfile(classifier.processed_data_output_dir + f): 705 | # print("Loading dataset from pickle file", f) 706 | # _df = classifier._load_pickle(f) 707 | # else: 708 | # print("Creating dataset") 709 | # _df = classifier._get_flows_with_all_packets() 710 | # classifier._pickle_dump(_df, f) 711 | # print("Dataset saved in file", f) 712 | # classifier._class_distribution(_df, classifier.filename_prefix + '_flows_distribution') 713 | # classifier._nb_packets_distribution(_df, classifier.filename_prefix + "_flows_nb_packets_distribution" ) 714 | # classifier._distribution(_df, classifier.filename_prefix + "_flows_class_split" ) 715 | 716 | __class_names = { 717 | "youtube": "YouTube", 718 | "Google_Play_Music": "Music", 719 | "GoogleHangout_VoIP": "VoIP", 720 | "GoogleHangout_Chat": "Chat", 721 | "FileTransfer": "FileTransfer", 722 | } 723 | for _c in range(len(classifier.classes)): 724 | classifier.classes[_c] = __class_names[classifier.classes[_c]] 725 | pkt = classifier.nb_packets_per_flow[0] 726 | fold = 0 727 | _i = pkt, fold 728 | _df1 = classifier.X_train_flows[_i].copy() 729 | _df1['type'] = 
classifier.y_train_flows[_i].values 730 | _df2 = classifier.X_test_flows[_i].copy() 731 | _df2['type'] = classifier.y_test_flows[_i].values 732 | _df = pd.concat([_df1, _df2]) 733 | _df.reset_index() 734 | # _df = _df[_df['sum_iat'] < 120] 735 | # _df.to_csv("netflow_datasets_pkts_"+str(pkt)+".csv", index = False) 736 | # sys.exit(1) 737 | print(_df.shape) 738 | classifier._distribution(_df, classifier.filename_prefix + "_flows_class_split_" + str(pkt) + '_pkt') 739 | sys.exit(1) 740 | # classifier._class_distribution(_df, classifier.filename_prefix + '_flows_distribution_' + str(pkt) + '_pkt') 741 | # classifier._nb_packets_distribution(_df, classifier.filename_prefix + "_flows_nb_packets_distribution_" + str(pkt) + '_pkt') 742 | # classifier._min_iat_distribution(_df, classifier.filename_prefix + "_flows_min_iat_distribution_" + str(pkt) + '_pkt') 743 | 744 | if RF_ENABLED: 745 | print("==== RandomForest =====") 746 | __class_names = { 747 | "youtube": "YouTube", 748 | "Google_Play_Music": "Music", 749 | "GoogleHangout_VoIP": "VoIP", 750 | "GoogleHangout_Chat": "Chat", 751 | "FileTransfer": "FileTransfer", 752 | } 753 | for _c in range(len(classifier.classes)): 754 | classifier.classes[_c] = __class_names[classifier.classes[_c]] 755 | 756 | rf_regr_flows, rf_y_train_flows_predicted, rf_y_test_flows_predicted = classifier.RF_predict( 757 | classifier.X_train_flows_fitted, 758 | classifier.y_train_flows, 759 | classifier.X_test_flows_fitted, 760 | classifier.y_test_flows 761 | ) 762 | 763 | # __show_actual_and_predicted(X_test, y_test, rf_y_test_predicted, 1) 764 | rf_cm_dict_flows, output = classifier.confusion_matrix(rf_regr_flows, 765 | classifier.y_test_flows, 766 | rf_y_test_flows_predicted, 767 | classifier.flow_ids, 768 | "rf" 769 | ) 770 | print(output) 771 | rf_f1_scores_flows, output = classifier.get_F1_score(rf_cm_dict_flows, 772 | classifier.y_test_flows, 773 | rf_y_test_flows_predicted, 774 | classifier.flow_ids, 775 | "rf_flows") 776 | print(output) 777 | avg_scores, output = classifier.avg_f1_scores(rf_f1_scores_flows, classifier.flow_ids) #_without_folds) 778 | print(output) 779 | 780 | if GB_ENABLED: 781 | print("==== GradientBoosting =====") 782 | gb_regr, gb_y_train_predicted, gb_y_test_predicted = classifier.GBoost_predict(feats_flows, classification_results) 783 | gb_cm_dict = classifier.confusion_matrix(gb_regr, classifier.y_test_flows, gb_y_test_predicted, classifier.flow_ids, "gb") 784 | gb_f1_scores = classifier.get_F1_score(gb_cm_dict, y_test, gb_y_test_predicted, "gb", False) 785 | classifier.avg_f1_scores(gb_f1_scores_flows, classifier.flow_ids_without_folds) 786 | # classifier.avg_f1_scores(gb_f1_scores) 787 | 788 | if XG_ENABLED: 789 | print("==== XGBoost =====") 790 | xg_regr, xg_y_train_predicted, xg_y_test_flows_predicted = classifier.XGBoost_predict( 791 | classifier.X_train_flows_fitted, 792 | classifier.y_train_flows, 793 | classifier.X_test_flows_fitted, 794 | classifier.y_test_flows 795 | ) 796 | 797 | xg_cm_dict_flows, output = classifier.confusion_matrix(xg_regr, 798 | classifier.y_test_flows, 799 | xg_y_test_flows_predicted, 800 | classifier.flow_ids, 801 | "xg" 802 | ) 803 | print(output) 804 | 805 | xg_f1_scores_flows, output = classifier.get_F1_score( 806 | xg_cm_dict_flows, 807 | classifier.y_test_flows, 808 | xg_y_test_flows_predicted, 809 | classifier.flow_ids, 810 | "xg_flows") 811 | print(output) 812 | # xg_cm_dict, classifier.y_test_flows, xg_y_test_predicted, "xg", False) 813 | avg_scores, output = 
classifier.avg_f1_scores(xg_f1_scores_flows, classifier.flow_ids) 814 | print(output) 815 | # print(xg_f1_scores) 816 | 817 | print(classifier.classification_results) 818 | if RF_ENABLED or GB_ENABLED or XG_ENABLED: 819 | classifier.save_results() -------------------------------------------------------------------------------- /Offline_ETC/noms2023_instant_messaging_traffic_classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import itertools 6 | from os import listdir 7 | from os.path import isfile, join 8 | import sys 9 | import time 10 | 11 | import pandas as pd 12 | 13 | import numpy as np 14 | 15 | from scipy.stats import kurtosis, skew 16 | 17 | from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures 18 | from sklearn.compose import ColumnTransformer 19 | from sklearn.pipeline import Pipeline 20 | from sklearn.metrics import classification_report, f1_score, confusion_matrix, ConfusionMatrixDisplay 21 | 22 | import seaborn as sns 23 | import matplotlib.pyplot as plt 24 | 25 | from encrypted_traffic_classification import EncryptedTrafficClassifier, EncryptedTrafficClassifierIterator 26 | 27 | ######################################## 28 | # Data preparation: convert RAW data 29 | ######################################## 30 | class NOMS2023InstantMessagingClassifier(EncryptedTrafficClassifier): 31 | def __init__(self, nb_folds, nb_packets_per_flow): 32 | super().__init__( 33 | nb_folds= nb_folds, 34 | nb_packets_per_flow = nb_packets_per_flow, 35 | filename_prefix = "noms2023_im", 36 | processed_data_output_dir = "noms2023_im_output/", 37 | data_dir = "data/noms2023_im/" 38 | ) 39 | 40 | pools = [tuple(pool) for pool in [self.nb_packets_per_flow, range(self.nb_folds)]] 41 | result = [[]] 42 | for pool in pools: 43 | result = [x+[y] for x in result for y in pool] 44 | self.flow_ids = result 45 | 46 | pools = [tuple(pool) for pool in [self.nb_packets_per_flow]] 47 | result = [[]] 48 | for pool in pools: 49 | result = [x+[y] for x in result for y in pool] 50 | self.flow_ids_without_folds = result 51 | 52 | pools = [tuple(pool) for pool in [range(self.nb_folds)]] 53 | result = [[]] 54 | for pool in pools: 55 | result = [x+[y] for x in result for y in pool] 56 | self.packet_ids = result 57 | 58 | ######################################## 59 | # Preprocessing 60 | ######################################## 61 | def data_preparation(self): 62 | print("data_preparation") 63 | import warnings 64 | warnings.filterwarnings("ignore") 65 | 66 | df_flows = {} 67 | files = [f for f in listdir(self.data_dir) if isfile(join(self.data_dir, f))] 68 | for _i in range(len(files)): 69 | files[_i] = self.data_dir + "/" + files[_i] 70 | 71 | # print(files) 72 | for i in self.nb_packets_per_flow: 73 | self.__generate_pickle_for_n_packets(i, files) 74 | 75 | def _get_flows_with_all_packets(self): 76 | print("_get_flows_with_all_packets") 77 | 78 | self.classes = set() 79 | start_time = time.time() 80 | nb_flows = 0 81 | df_flows = pd.DataFrame() 82 | files = [f for f in listdir(self.data_dir) if isfile(join(self.data_dir, f))] 83 | for _i in range(len(files)): 84 | f = self.data_dir + "/" + files[_i] 85 | # print("f=", f) 86 | df_new = pd.read_csv(f, 87 | names = [ 88 | 'flow_id', 89 | 'timestamp', 90 | 'iat', 91 | 'source', 92 | 'sport', 93 | 'dest', 94 | 'dport', 95 | 'protocol', 96 | 'length' 97 | ], 98 | header = 0 99 | ) 100 | print(f, df_new.shape) 101 | 102 
| # drop DNS traffic 103 | df_new = df_new.drop(df_new[df_new['sport'] == 53].index) 104 | df_new = df_new.drop(df_new[df_new['dport'] == 53].index) 105 | 106 | found = False 107 | for _c in self.all_classes: 108 | if _c in f: 109 | found = True 110 | df_new['class'] = _c 111 | self.classes.add(_c) 112 | break 113 | if found == False: 114 | print("class not identified for", f) 115 | dfs = [] 116 | # extract flow and add statistical features 117 | for flow_id in df_new['flow_id'].unique(): 118 | nb_flows += 1 119 | d = df_new[df_new['flow_id'] == flow_id].head(n = 1) 120 | d['nb_packets'] = len(df_new[df_new['flow_id'] == flow_id]) 121 | c = d['class'].tolist() 122 | dport = d.dport.tolist() 123 | sport = d.sport.tolist() 124 | #print(d) 125 | _df = df_new.loc[df_new['flow_id'] == flow_id, 'iat'] 126 | d['sum_iat'] = np.sum(_df) 127 | 128 | _df = df_new.loc[df_new['flow_id'] == flow_id, 'length'] 129 | d['sum_length'] = np.sum(_df) 130 | dfs.append(d) 131 | _d = pd.concat(dfs) 132 | df_flows = pd.concat([df_flows, _d]) 133 | # uncomment for debugging 134 | # break 135 | 136 | print(f, "processed in ", time.time() - start_time, "seconds.") 137 | print("%d flows processed" % nb_flows) 138 | # Finish processing the data, create the train/tests split and save as pickle files 139 | df_flows = df_flows.fillna(0) 140 | 141 | self.classes = list(self.classes) 142 | self._hotencode_class(df_flows) 143 | return df_flows 144 | 145 | def __statistical_features(self, df, n, df_flows, f, nb_flows): 146 | nb_flows[0] += 1 147 | # d = df.head(n = 1) 148 | d = df 149 | _df_new = df.head(n = n) 150 | d['nb_packets'] = len(_df_new) #df_new[df_new['flow_id'] == flow_id]) 151 | c = d['class'].tolist() 152 | dport = d.dport.tolist() 153 | sport = d.sport.tolist() 154 | #print(d) 155 | _df = _df_new['iat'] 156 | d['min_iat'] = np.min(df[df['iat'] > 0]['iat']) # probably useless as most probably always 0 for the first packet 157 | d['max_iat'] = np.max(_df) 158 | d['sum_iat'] = np.sum(_df) 159 | d['mean_iat'] = np.mean(_df) 160 | d['median_iat'] = np.median(_df) 161 | d['std_iat'] = np.std(_df) 162 | d['1stQ_iat'] = np.quantile(_df, 0.25) 163 | d['3rdQ_iat'] = np.quantile(_df, 0.75) 164 | _a = list(_df) 165 | d['skew_iat'] = skew(_a) 166 | d['kurt_iat'] = kurtosis(_a) 167 | 168 | _df = _df_new['length'] 169 | d['min_length'] = np.min(_df) 170 | d['max_length'] = np.max(_df) 171 | d['sum_length'] = np.sum(_df) 172 | d['median_length'] = np.median(_df) 173 | d['mean_length'] = np.mean(_df) 174 | d['std_length'] = np.std(_df) 175 | d['1stQ_length'] = np.quantile(_df, 0.25) 176 | d['3rdQ_length'] = np.quantile(_df, 0.75) 177 | _a = list(_df) 178 | d['skew_length'] = skew(_a) 179 | d['kurt_length'] = kurtosis(_a) 180 | 181 | d['src'] = f 182 | # dfs.append(d) 183 | df_flows = pd.concat([d, df_flows]) 184 | if nb_flows[0] % 1000 == 0: 185 | # print(self.classes) 186 | print("nb flows processed is %d" % nb_flows[0]) 187 | print("df_flows.shape", df_flows.shape) 188 | print(d.columns) 189 | print(df_flows.columns) 190 | return d 191 | 192 | def __generate_pickle_for_n_packets(self, n, files): 193 | print("__generate_pickle_for_n_packets n =", n) 194 | 195 | for fold in range(self.nb_folds): 196 | if self._test_data_prepared((n, fold)): 197 | print("pickle files detected for ", n, "packets") 198 | return 199 | nb_flows = [0] 200 | df_flows = pd.DataFrame() 201 | dfs = [] 202 | self.classes = set() 203 | start_time = time.time() 204 | for f in files: 205 | # print("f=", f) 206 | df_new = pd.read_csv(f, 207 | names = [ 208 | 
'packet_id', 209 | 'timestamp', 210 | 'iat', 211 | 'source', 212 | 'sport', 213 | 'dest', 214 | 'dport', 215 | 'protocol', 216 | 'length', 217 | 'flow_id', 218 | ], 219 | header = 0, 220 | index_col = False 221 | ) 222 | print(n, f, df_new.shape) 223 | #print(df_new) 224 | # drop DNS traffic 225 | df_new = df_new.drop(df_new[df_new['sport'] == 53].index) 226 | df_new = df_new.drop(df_new[df_new['dport'] == 53].index) 227 | 228 | found = False 229 | for _c in self.all_classes: 230 | if _c in f: 231 | found = True 232 | df_new['class'] = _c 233 | self.classes.add(_c) 234 | break 235 | if found == False: 236 | print("class not identified for", f) 237 | 238 | print("nb flows = ", len(df_new['flow_id'].unique())) 239 | #df_new.groupby(by = 'flow_id', group_keys = False).apply(self.__statistical_features, n, df_flows, f, nb_flows) 240 | # extract flow and add statistical features 241 | for flow_id in df_new['flow_id'].unique(): 242 | nb_flows[0] += 1 243 | d = df_new[df_new['flow_id'] == flow_id].head(n = 1) 244 | _df_new = df_new[df_new['flow_id'] == flow_id].head(n = n) 245 | d['nb_packets'] = len(_df_new) #df_new[df_new['flow_id'] == flow_id]) 246 | if n != 600000 and d['nb_packets'].iloc[0] != n: 247 | print("Flow #", flow_id," has only", d['nb_packets'].iloc[0]," packets, skipping...") 248 | continue 249 | c = d['class'].tolist() 250 | dport = d.dport.tolist() 251 | sport = d.sport.tolist() 252 | #print(d) 253 | _df = _df_new['iat'] 254 | d['sum_iat'] = np.sum(_df) 255 | if d['sum_iat'].iloc[0] == 0: 256 | print("Total duration is 0 for flow #", flow_id, ", skipping...") 257 | continue 258 | d['min_iat'] = np.min(df_new[df_new['iat'] > 0]['iat']) # probably useless as most probably always 0 for the first packet 259 | d['max_iat'] = np.max(_df) 260 | d['mean_iat'] = np.mean(_df) 261 | d['median_iat'] = np.median(_df) 262 | d['std_iat'] = np.std(_df) 263 | try: 264 | d['1stQ_iat'] = np.quantile(_df, 0.25) 265 | except Exception as e: 266 | d['1stQ_iat'] = 0 267 | 268 | try: 269 | d['3rdQ_iat'] = np.quantile(_df, 0.75) 270 | except Exception as e: 271 | d['3rdQ_iat'] = 0 272 | _a = list(_df) 273 | try: 274 | d['skew_iat'] = skew(_a) 275 | except Exception as e: 276 | d['skew_iat'] = 0 277 | try: 278 | d['kurt_iat'] = kurtosis(_a) 279 | except Exception as e: 280 | d['kurt_iat'] = 0 281 | 282 | _df = _df_new['length'] 283 | d['min_length'] = np.min(_df) 284 | d['max_length'] = np.max(_df) 285 | d['sum_length'] = np.sum(_df) 286 | d['median_length'] = np.median(_df) 287 | d['mean_length'] = np.mean(_df) 288 | d['std_length'] = np.std(_df) 289 | try: 290 | d['1stQ_length'] = np.quantile(_df, 0.25) 291 | except Exception as e: 292 | d['1stQ_length'] = 0 293 | try: 294 | d['3rdQ_length'] = np.quantile(_df, 0.75) 295 | except Exception as e: 296 | d['3rdQ_length'] = 0 297 | _a = list(_df) 298 | try: 299 | d['skew_length'] = skew(_a) 300 | except Exception as e: 301 | d['skew_length'] = 0 302 | try: 303 | d['kurt_length'] = kurtosis(_a) 304 | except Exception as e: 305 | d['kurt_length'] = 0 306 | 307 | d['src'] = f 308 | dfs.append(d) 309 | # df_flows = pd.concat([d, df_flows]) 310 | 311 | df_flows = pd.concat(dfs) 312 | 313 | print(f, "processed in ", time.time() - start_time, "seconds.") 314 | print("%d flows processed" % nb_flows[0]) 315 | # Finish processing the data, create the train/tests split and save as pickle files 316 | df_flows = df_flows.fillna(0) 317 | 318 | self.classes = list(self.classes) 319 | self._hotencode_class(df_flows) 320 | 321 | filename = self.filename_prefix + "_" + 
str(n) + ".pickle" 322 | self._generate_data_folds(df_flows, filename) 323 | 324 | ######################################## 325 | # Data Analysis 326 | ######################################## 327 | def __show_actual_and_predicted(self, X, y, y_pred, _class): 328 | print(self.classes) 329 | for _i in itertools.product(NB_PACKETS, self.filenames): 330 | i = (_i[0], _i[1], 0) 331 | print(i) 332 | df = X[i].copy() 333 | df['type'] = y[i] 334 | df['type_pred'] = y_pred[i] 335 | print(df.columns) 336 | a4_dims = (23.4, 16.54) 337 | fig, ax = plt.subplots(figsize = a4_dims) 338 | sns.lmplot( 339 | x = 'sum_iat', 340 | y = 'sum_length', 341 | data = df[df['type'] == _class], 342 | hue = 'type', 343 | fit_reg = False, 344 | height = 4, aspect = 5, 345 | # color = 'green', 346 | # scatter_kws = {'alpha': 0.3}, 347 | # ax = ax, 348 | legend = False, 349 | palette = 'viridis' 350 | ) 351 | #ax.set(xlabel='time_delta', ylabel='packet_size') 352 | ax.set(xlabel = 'duration', ylabel = 'sum_packet_size') 353 | plt.legend(title = 'Class', labels =self.classes) 354 | plt.savefig(self.filename_prefix + "_" + self.classes[_class] + "_"+ str(i[0]) + "_" + i[1] + ".png", format = 'png') 355 | fig, ax2 = plt.subplots(figsize = a4_dims) 356 | sns.lmplot( 357 | x = 'sum_iat', 358 | y = 'sum_length', 359 | data = df[df['type_pred'] == _class], 360 | hue = 'type', 361 | fit_reg = False, 362 | height = 4, aspect = 5, 363 | # color = 'orange', 364 | # scatter_kws = {'alpha': 0.3}, 365 | legend = False, 366 | palette = 'viridis', 367 | # ax = ax2 368 | ) 369 | ax2.set(xlabel = 'duration', ylabel = 'sum_packet_size') 370 | plt.legend(title = 'Class', labels =self.classes) 371 | plt.savefig(self.filename_prefix + "_" + self.classes[_class] + "_pred_"+ str(i[0]) + "_" + i[1]+".png", format = 'png') 372 | 373 | ######################################## 374 | # Prediction 375 | ######################################## 376 | 377 | 378 | ######################################## 379 | # Akem's methods 380 | ######################################## 381 | # Feature Importance 382 | """ 383 | Function to Fit model based on optimal values of depth and number of estimators and use it 384 | to compute feature importance for all the features. 
385 | """ 386 | def get_feature_importance(depth, n_tree, max_leaf, X_train, y_train): 387 | from sklearn.ensemble import RandomForestClassifier 388 | 389 | # rf_opt = RandomForestClassifier(max_depth = depth, n_estimators = n_tree, random_state=42, bootstrap=False) 390 | rf_opt = RandomForestClassifier(max_depth = depth, n_estimators = n_tree, max_leaf_nodes=max_leaf, random_state=42, bootstrap=False) 391 | rf_opt.fit(X_train, y_train) 392 | feature_importance = pd.DataFrame(rf_opt.feature_importances_) 393 | feature_importance.index = X_train.columns 394 | feature_importance = feature_importance.sort_values(by=list(feature_importance.columns),axis=0,ascending=False) 395 | 396 | return feature_importance 397 | 398 | 399 | """ 400 | Function to Fit model based on optimal values of depth and number of estimators and feature importance 401 | to find the fewest possible features to exceed the previously attained score with all selected features 402 | """ 403 | def get_fewest_features(depth, n_tree, max_leaf, importance): 404 | sorted_feature_names = importance.index 405 | # print('sorted_feature_names: ', sorted_feature_names) 406 | features = [] 407 | for f in range(1,len(sorted_feature_names)+1): 408 | features.append(sorted_feature_names[0:f]) 409 | # print('features:', features) 410 | return features 411 | 412 | 413 | def get_result_scores(classes, cl_report): 414 | precision=[] 415 | recall=[] 416 | f1_score=[] 417 | supports=[] 418 | for a_class in classes: 419 | precision.append(cl_report[a_class]['precision']) 420 | recall.append(cl_report[a_class]['recall']) 421 | f1_score.append(cl_report[a_class]['f1-score']) 422 | supports.append(cl_report[a_class]['support']) 423 | return precision, recall, f1_score, supports 424 | 425 | 426 | def get_scores(classes, depth, n_tree, feats, max_leaf, X_train, y_train, X_test, y_test): 427 | from sklearn.ensemble import RandomForestClassifier 428 | model = RandomForestClassifier(max_depth=depth, n_estimators = n_tree, max_leaf_nodes=max_leaf, n_jobs=4, 429 | random_state=42, bootstrap=False) 430 | 431 | model.fit(X_train[feats], y_train) 432 | y_pred = model.predict(X_test[feats]) 433 | 434 | class_report = classification_report(y_test, y_pred, target_names=classes, output_dict = True) 435 | 436 | accurac = model.score(X_test[feats], y_test) 437 | macro_score = class_report['macro avg']['f1-score'] 438 | weighted_score = class_report['weighted avg']['f1-score'] 439 | 440 | return model, class_report, macro_score, weighted_score, y_pred, accurac 441 | 442 | 443 | def get_x_y(Dataset, classes, feats): 444 | Dataset = Dataset[Dataset["Label"].isin(classes)] 445 | X = Dataset[feats] 446 | y = Dataset['Label'].replace(classes, range(len(classes))) 447 | # y = Dataset.columns[-1].replace(classes, range(len(classes))) 448 | 449 | return X, y 450 | 451 | def analyze_models(classes, model_type, depths, n_trees, X_train, y_train, X_test, y_test, max_leaf, filename_out): 452 | 453 | with open(filename_out, "w") as res_file: 454 | print('depth;tree;n_feat;Macro_F1;Weighted_F1;Accuracy;feats;c_report', file=res_file) 455 | if model_type == 'RF': 456 | # FOR EACH (depth, n_tree, feat) 457 | for depth in depths: 458 | for n_tree in n_trees: 459 | # get feature orders to use 460 | importance = get_feature_importance(depth, n_tree, max_leaf, X_train, y_train) 461 | 462 | m_feats = get_fewest_features(depth, n_tree, max_leaf, importance) 463 | for feats in m_feats: 464 | # Get the scores with the given (depth, n_tree, feat) 465 | model, c_report, macro_f1, weight_f1, 
y_pred, accuracs = get_scores(classes, depth, n_tree, feats, max_leaf, X_train, y_train, X_test, y_test) 466 | 467 | print(str(depth)+';'+str(n_tree)+';'+str(len(feats))+';'+str(macro_f1)+';'+str(weight_f1)+';'+str(accuracs)+';'+str(list(feats))+';'+str(c_report), file=res_file) 468 | print("Analysis Complete. Check output file.") 469 | return [] 470 | 471 | # N = number of packets in flows, feats = array of feature names to use, feat_name = string to add to output file name 472 | def analyze_models_for_npkts(self, N, feats, feat_name): 473 | i = (N, self.filenames[0], 0) 474 | print("Number of packets per flow: ", N) 475 | 476 | X_trains, y_trains = X_train[i][feats], y_train[i] 477 | X_tests, y_tests = X_test[i][feats], y_test[i] 478 | 479 | results_file = "Models_" + feat_name + "_" + str(N) + "_pkts_.csv" 480 | analyze_models(self.classes, "RF", range(7, 20, 1), range(1, 8, 2), X_trains, y_trains, X_tests, y_tests, 500, results_file) 481 | 482 | results = pd.read_csv(results_file, sep=';') 483 | results = results.sort_values(by=['Weighted_F1','Macro_F1'],ascending=False) 484 | print(results.head(10)) 485 | print("******") 486 | print(results.head(1)['c_report'].values) 487 | 488 | ######################################## 489 | # GBoost 490 | ######################################## 491 | def GBoost_predict(self, feats): 492 | print("GBoost_predict") 493 | from sklearn.ensemble import GradientBoostingClassifier 494 | gb_model = {} 495 | 496 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 497 | gb_model[i] = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state = 42) 498 | 499 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 500 | print("==",i,"==") 501 | try: 502 | gb_model[i].fit(X_train[i][feats], y_train[i]) 503 | except ValueError as e: 504 | print(e) 505 | pass 506 | 507 | gb_y_train_predicted = {} 508 | gb_y_test_predicted = {} 509 | gb_train_score = {} 510 | gb_test_score = {} 511 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 512 | print("==",i,"==") 513 | gb_y_train_predicted[i] = gb_model[i].predict(X_train[i][feats]) 514 | gb_y_test_predicted[i] = gb_model[i].predict(X_test[i][feats]) 515 | gb_train_score[i] = gb_model[i].score(X_train[i][feats], y_train[i]) 516 | gb_test_score[i] = gb_model[i].score(X_test[i][feats], y_test[i]) 517 | 518 | self._get_scores_from_models(gb_model, y_test, gb_y_test_predicted, feats) 519 | 520 | gb_cm_dict = {} 521 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 522 | print("==",i,"==") 523 | gb_cm_dict[i] = confusion_matrix(y_test[i], gb_y_test_predicted[i].astype(int)) 524 | print(gb_cm_dict[i]) 525 | 526 | for i in EncryptedTrafficClassifierIterator(self.flow_ids): 527 | pkt, _ = i 528 | classification_results.loc[classification_results['nb_packets'] == pkt, 'gb_train_score'] = gb_train_score[i] 529 | classification_results.loc[classification_results['nb_packets'] == pkt, 'gb_test_score'] = gb_test_score[i] 530 | 531 | return gb_model, gb_y_train_predicted, gb_y_test_predicted 532 | 533 | 534 | ######################################## 535 | # Entry point 536 | ######################################## 537 | if __name__ == "__main__": 538 | parser = argparse.ArgumentParser( 539 | prog='noms2023_instant_messaging_traffic_classifier', 540 | description='Classify packets or flows from NOMS 2023 Encrypted Mobile Instant Messaging', 541 | epilog='' 542 | ) 543 | parser.add_argument('-p', '--nb_packets', action = 'append', type = int, required = True) #, 
default = [4, 8]) 544 | parser.add_argument('-c', '--classifier', action = 'append', type = str) #, default = ['rf']) 545 | parser.add_argument('-f', '--nb_folds', action = 'store', default = 12, type = int) 546 | parser.add_argument('-v', '--visualization', action = 'store_true', required = False, default = False) 547 | parser.add_argument('-r', '--report', action = 'store_true', required = False, default = False) 548 | parser.add_argument('-F', '--force_rf_classification', action = 'store_true', required = False, default = False) 549 | args = parser.parse_args(sys.argv[1:]) 550 | 551 | # NB_PACKETS = [2, 3, 4, 5, 6, 7, 8, 9, 10, 600000] 552 | VISUALIZATION_ENABLED = False 553 | if args.visualization == True: 554 | VISUALIZATION_ENABLED = True 555 | 556 | RF_ENABLED = False 557 | GB_ENABLED = False 558 | XG_ENABLED = False 559 | for c in args.classifier: 560 | c = c.lower() 561 | if c == "rf": 562 | RF_ENABLED = True 563 | elif c == "gb": 564 | GB_ENABLED = True 565 | elif c == "xg": 566 | XG_ENABLED = True 567 | else: 568 | print("Unknown classifier", c) 569 | 570 | classifier = NOMS2023InstantMessagingClassifier( 571 | nb_folds = args.nb_folds, 572 | nb_packets_per_flow = args.nb_packets 573 | ) 574 | 575 | if args.force_rf_classification == True: 576 | classifier.force_rf_classification = True 577 | 578 | classifier.all_classes = [ 579 | "discord", 580 | "messenger", 581 | "signal", 582 | "teams", 583 | "telegram", 584 | "whatsapp", 585 | # Non Instant Messenging 586 | #"all_background", 587 | #"gmail", 588 | #"browsing", 589 | #"youtube", 590 | ] 591 | 592 | non_needed_features = [ 593 | 'packet_id', 594 | 'flow_id', 595 | 'class', 596 | 'source', 597 | 'dest', 598 | 'sport', 599 | 'dport', 600 | 'protocol', 601 | 'timestamp', 602 | # 'nb_packets', 603 | 'src', 604 | 'iat', 605 | 'direction', 606 | 'length' 607 | ] 608 | 609 | all_features_flows = [ 610 | 'min_iat', 611 | 'max_iat', 612 | 'sum_iat', 613 | 'mean_iat', 614 | 'median_iat', 615 | 'std_iat', 616 | '1stQ_iat', 617 | '3rdQ_iat', 618 | 'skew_iat', 619 | 'kurt_iat', 620 | 'min_length', 621 | 'max_length', 622 | 'sum_length', 623 | 'median_length', 624 | 'mean_length', 625 | 'std_length', 626 | '1stQ_length', 627 | '3rdQ_length', 628 | 'skew_length', 629 | 'kurt_length', 630 | 'nb_packets', 631 | # 'sport', 632 | # 'dport', 633 | # 'protocol', 634 | # 'direction' 635 | ] 636 | # best_features = [ 637 | # 'max_iat', 638 | # 'sum_iat', 639 | # 'mean_iat', 640 | # 'median_iat', 641 | # 'std_iat', 642 | # '1stQ_iat', 643 | # '3rdQ_iat', 644 | # 'skew_iat', 645 | # 'kurt_iat', 646 | # 'min_length', 647 | # 'max_length', 648 | # 'sum_length', 649 | # 'median_length', 650 | # 'mean_length', 651 | # 'std_length', 652 | # '1stQ_length', 653 | # '3rdQ_length', 654 | # 'skew_length', 655 | # 'kurt_length' 656 | # ] 657 | best_features = ['3rdQ_iat', 'std_iat', 'std_length', 'skew_iat', 'max_iat', 'sum_iat', 'mean_length', '1stQ_length', 'max_length', 'mean_iat', 'min_length', 'sum_length', 'median_length', '1stQ_iat', 'median_iat', '3rdQ_length', 'kurt_iat', 'kurt_length', 'nb_packets'] 658 | online_features=[ 659 | 'sum_iat', 660 | 'sum_length', 661 | 'max_length', 662 | 'mean_iat', 663 | 'max_iat', 664 | 'mean_length', 665 | 'min_length', 666 | 'min_iat' 667 | ] 668 | feats_flows = all_features_flows 669 | 670 | # Preprocessing 671 | if not classifier.data_prepared(): 672 | classifier.data_preparation() 673 | classifier.load_flows() 674 | else: 675 | classifier.load_flows() 676 | _c = 
classifier.y_train_flows[(classifier.nb_packets_per_flow[0], 0)].unique() 677 | #_df_tmp = classifier.y_train_flows[(classifier.nb_packets_per_flow[0], 0)] 678 | #_df2_tmp = classifier.y_test_flows[(classifier.nb_packets_per_flow[0], 0)] 679 | #print(_df_tmp.value_counts()) 680 | #print(_df2_tmp.value_counts()) 681 | #sys.exit(1) 682 | classifier.classes = [-1 for _ in range(len(classifier.all_classes) + 4)] 683 | _Xy = classifier.X_train_flows[(classifier.nb_packets_per_flow[0], 0)].copy() 684 | _Xy['type'] = classifier.y_train_flows[(classifier.nb_packets_per_flow[0], 0)] 685 | for index, row in _Xy.iterrows(): 686 | for _i in range(len(classifier.all_classes)): 687 | if classifier.all_classes[_i] in row['src']: 688 | classifier.classes[row['type']] = classifier.all_classes[_i] 689 | break 690 | #classifier.classes[i] .append(classifier.all_classes[i]) 691 | if -1 not in classifier.classes: 692 | break 693 | _n = 0 694 | for _i in range(len(classifier.classes)): 695 | if classifier.classes[_i] == -1: 696 | _Xy = _Xy.drop(_Xy[_Xy['type'] == _i].index) 697 | print("dropping", _i) 698 | _n += 1 699 | if _i < len(classifier.classes) - 1: 700 | if _n > 0: 701 | _Xy.loc[_Xy['type'] == (_i + 1),'type'] = _i - _n +1 702 | print(_i+1,"->", _i - _n + 1) 703 | #for i in _c: 704 | # classifier.classes.append(classifier.all_classes[i]) 705 | keep = True 706 | while keep: 707 | try: 708 | classifier.classes.remove(-1) 709 | except ValueError: 710 | keep = False 711 | classes_dict = {} 712 | for _i in range(len(classifier.classes)): 713 | classes_dict[_i] = classifier.classes[_i] 714 | _Xy['class'] = _Xy['type'].map(classes_dict) 715 | print("classes =",classifier.classes) 716 | pkt = classifier.nb_packets_per_flow[0] 717 | # classifier._distribution(_Xy, classifier.filename_prefix + "_flows_class_split_" + str(pkt) + '_pkt_6_IMA') 718 | # sys.exit(1) 719 | 720 | classifier.cleanup_data(classifier.X_train_flows, 721 | classifier.y_train_flows, 722 | classifier.X_test_flows, 723 | classifier.y_test_flows, 724 | classifier.flow_ids, 725 | non_needed_features) 726 | 727 | # scaling during processing make results wore ! 
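# Illustrative sketch (not part of the original script): the preprocessing call
# commented out just below was disabled because, as noted above, scaling the flow
# features made the tree-based results worse. If scaling were ever re-enabled, a
# minimal per-(nb_packets, fold) version could look like the lines below. It
# assumes EncryptedTrafficClassifierIterator yields the same (nb_packets, fold)
# keys used to index the flow dictionaries (as in GBoost_predict) and reuses the
# StandardScaler already imported at the top of this file; uncomment to try it.
#
# classifier.X_train_flows_fitted, classifier.X_test_flows_fitted = {}, {}
# for _key in EncryptedTrafficClassifierIterator(classifier.flow_ids):
#     _scaler = StandardScaler().fit(classifier.X_train_flows[_key][feats_flows])
#     classifier.X_train_flows_fitted[_key] = pd.DataFrame(
#         _scaler.transform(classifier.X_train_flows[_key][feats_flows]),
#         columns = feats_flows, index = classifier.X_train_flows[_key].index)
#     classifier.X_test_flows_fitted[_key] = pd.DataFrame(
#         _scaler.transform(classifier.X_test_flows[_key][feats_flows]),
#         columns = feats_flows, index = classifier.X_test_flows[_key].index)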
728 | # classifier.X_train_flows_fitted, classifier.X_test_flows_fitted = classifier.preprocessing( 729 | # classifier.X_train_flows, 730 | # classifier.y_train_flows, 731 | # classifier.X_test_flows, 732 | # classifier.y_test_flows, 733 | # classifier.flow_ids, 734 | # feats_flows 735 | # ) 736 | classifier.X_train_flows_fitted = classifier.X_train_flows 737 | classifier.X_test_flows_fitted = classifier.X_test_flows 738 | 739 | # __correlation() 740 | # analyze_models_for_npkts(10, all_features, "all_feats") 741 | if args.report == True: 742 | classifier._viz(distribution = 0, class_distribution = -1, nb_packets = -1, min_iat = -1, max_iat = -1) 743 | for n in classifier.nb_packets_per_flow: 744 | if n == 4: 745 | classifier._viz(distribution = -1, class_distribution = 11, nb_packets = 0, min_iat = -1, max_iat = -1) 746 | elif n == 8: 747 | classifier._viz(distribution = -1, class_distribution = 11, nb_packets = 0, min_iat = -1, max_iat = -1) 748 | elif n == 600000: 749 | classifier._viz(distribution = -1, class_distribution = 11, nb_packets = 1, min_iat = 1, max_iat = -1) 750 | sys.exit(1) 751 | if VISUALIZATION_ENABLED: 752 | # f = classifier.filename_prefix + '_datasetflows_distribution.pickle' 753 | # if isfile(classifier.processed_data_output_dir + f): 754 | # print("Loading dataset from pickle file", f) 755 | # _df = classifier._load_pickle(f) 756 | # else: 757 | # print("Creating dataset") 758 | # _df = classifier._get_flows_with_all_packets() 759 | # classifier._pickle_dump(_df, f) 760 | # print("Dataset saved in file", f) 761 | # classifier._class_distribution(_df, classifier.filename_prefix + '_flows_distribution') 762 | # classifier._nb_packets_distribution(_df, classifier.filename_prefix + "_flows_nb_packets_distribution" ) 763 | # classifier._distribution(_df, classifier.filename_prefix + "_flows_class_split" ) 764 | classifier._viz(distribution = 0, class_distribution = 11, nb_packets = -1, min_iat = -1, max_iat = -1) 765 | # pkt = classifier.nb_packets_per_flow[0] 766 | # fold = 0 767 | # _i = pkt, fold 768 | # _df1 = classifier.X_train_flows[_i].copy() 769 | # _df1['type'] = classifier.y_train_flows[_i] 770 | # _df2 = classifier.X_test_flows[_i].copy() 771 | # _df2['type'] = classifier.y_test_flows[_i] 772 | # _df = pd.concat([_df1, _df2]) 773 | # _df.reset_index() 774 | # print(_df.shape) 775 | # classifier._distribution(_df, classifier.filename_prefix + "_flows_class_split_" + str(pkt) + '_pkt') 776 | # classifier._class_distribution(_df, classifier.filename_prefix + '_flows_distribution_' + str(pkt) + '_pkt') 777 | # # classifier._nb_packets_distribution(_df, classifier.filename_prefix + "_flows_nb_packets_distribution_" + str(pkt) + '_pkt') 778 | # classifier._min_iat_distribution(_df, classifier.filename_prefix + "_flows_min_iat_distribution_" + str(pkt) + '_pkt') 779 | # classifier._max_iat_distribution(_df, classifier.filename_prefix + "_flows_max_iat_distribution_" + str(pkt) + '_pkt') 780 | sys.exit(1) 781 | 782 | if RF_ENABLED: 783 | print("==== RandomForest =====") 784 | """ 785 | classifier.X_train_flows_fitted, classifier.X_test_flows_fitted = classifier.preprocessing( 786 | classifier.X_train_flows, 787 | classifier.y_train_flows, 788 | classifier.X_test_flows, 789 | classifier.y_test_flows, 790 | classifier.flow_ids, 791 | feats_flows 792 | ) 793 | """ 794 | # classifier.X_train_flows_fitted = classifier.X_train_flows 795 | # classifier.X_test_flows_fitted = classifier.X_test_flows 796 | 797 | rf_regr_flows, rf_y_train_flows_predicted, 
rf_y_test_flows_predicted = classifier.RF_predict( 798 | classifier.X_train_flows_fitted, 799 | classifier.y_train_flows, 800 | classifier.X_test_flows_fitted, 801 | classifier.y_test_flows 802 | ) 803 | 804 | # __show_actual_and_predicted(X_test, y_test, rf_y_test_predicted, 1) 805 | rf_cm_dict_flows, output = classifier.confusion_matrix(rf_regr_flows, 806 | classifier.y_test_flows, 807 | rf_y_test_flows_predicted, 808 | classifier.flow_ids, 809 | "rf" 810 | ) 811 | print(output) 812 | rf_f1_scores_flows, output = classifier.get_F1_score(rf_cm_dict_flows, 813 | classifier.y_test_flows, 814 | rf_y_test_flows_predicted, 815 | classifier.flow_ids, 816 | "rf_flows") 817 | print(output) 818 | avg_scores, output = classifier.avg_f1_scores(rf_f1_scores_flows, classifier.flow_ids) #_without_folds) 819 | print(output) 820 | # rf_cm_dict = classifier.confusion_matrix(rf_regr, rf_y_test_predicted, False) 821 | # rf_f1_scores = classifier.get_F1_score(classification_results, rf_cm_dict, y_test, rf_y_test_predicted, "rf", False) 822 | # classifier.avg_f1_scores(rf_f1_scores) 823 | 824 | if GB_ENABLED: 825 | print("==== GradientBoosting =====") 826 | gb_regr, gb_y_train_predicted, gb_y_test_predicted = classifier.GBoost_predict(feats_flows, classification_results) 827 | gb_cm_dict = classifier.confusion_matrix(gb_regr, classifier.y_test_flows, gb_y_test_predicted, classifier.flow_ids, "gb") 828 | gb_f1_scores = classifier.get_F1_score(gb_cm_dict, y_test, gb_y_test_predicted, "gb", False) 829 | classifier.avg_f1_scores(gb_f1_scores_flows, classifier.flow_ids_without_folds) 830 | classifier.avg_f1_scores(gb_f1_scores) 831 | 832 | if XG_ENABLED: 833 | print("==== XGBoost =====") 834 | xg_regr, xg_y_train_predicted, xg_y_test_flows_predicted = classifier.XGBoost_predict( 835 | classifier.X_train_flows_fitted, 836 | classifier.y_train_flows, 837 | classifier.X_test_flows_fitted, 838 | classifier.y_test_flows 839 | ) 840 | 841 | # feats_flows, classification_results) 842 | # xg_cm_dict = classifier.confusion_matrix(xg_regr, xg_y_test_predicted, False) 843 | xg_cm_dict_flows, output = classifier.confusion_matrix(xg_regr, 844 | classifier.y_test_flows, 845 | xg_y_test_flows_predicted, 846 | classifier.flow_ids, 847 | "xg" 848 | ) 849 | print(output) 850 | 851 | xg_f1_scores_flows, output = classifier.get_F1_score( 852 | xg_cm_dict_flows, 853 | classifier.y_test_flows, 854 | xg_y_test_flows_predicted, 855 | classifier.flow_ids, 856 | "xg_flows") 857 | print(output) 858 | # xg_cm_dict, classifier.y_test_flows, xg_y_test_predicted, "xg", False) 859 | avg_scores, output = classifier.avg_f1_scores(xg_f1_scores_flows, classifier.flow_ids) 860 | print(output) 861 | 862 | print(classifier.classification_results) 863 | if RF_ENABLED or GB_ENABLED or XG_ENABLED: 864 | classifier.save_results() 865 | -------------------------------------------------------------------------------- /Offline_ETC/ucdavis_quic_classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | import os 7 | from os.path import isfile, join 8 | import sys 9 | import time 10 | 11 | import numpy as np 12 | 13 | import pandas as pd 14 | 15 | from scipy.stats import kurtosis, skew 16 | 17 | from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures 18 | from sklearn.pipeline import Pipeline 19 | from sklearn.compose import ColumnTransformer 20 | 21 | from sklearn.experimental import enable_iterative_imputer 22 
| from sklearn.impute import SimpleImputer, IterativeImputer 23 | 24 | from encrypted_traffic_classification import EncryptedTrafficClassifier, EncryptedTrafficClassifierIterator 25 | 26 | REGENERATE_FLOWS_DATA = False 27 | 28 | TEST_FLOWS = True 29 | TEST_PACKETS = False 30 | 31 | ######################################## 32 | # Data preparation: convert RAW data 33 | ######################################## 34 | class UCDavisQuicClassifier(EncryptedTrafficClassifier): 35 | def __init__(self, nb_folds, nb_packets_per_flow): 36 | super().__init__( 37 | nb_folds= nb_folds, 38 | nb_packets_per_flow = nb_packets_per_flow, 39 | filename_prefix = "ucdavis_quic", 40 | # processed_data_output_dir = "ucdavis_quic_output/", 41 | processed_data_output_dir = "ucdavis_quic_output_addendum/", 42 | data_dir = "data/ucdavis_quic_pretraining/" 43 | ) 44 | 45 | pools = [tuple(pool) for pool in [self.nb_packets_per_flow, range(self.nb_folds)]] 46 | result = [[]] 47 | for pool in pools: 48 | result = [x + [y] for x in result for y in pool] 49 | self.flow_ids = result 50 | 51 | pools = [tuple(pool) for pool in [range(self.nb_folds)]] 52 | result = [[]] 53 | for pool in pools: 54 | result = [x + [y] for x in result for y in pool] 55 | self.packet_ids = result 56 | 57 | def data_preparation(self): 58 | print("data_prepation") 59 | # limit = 100000 60 | 61 | start_time = time.time() 62 | traffic_type = 0 63 | subdirs = os.listdir(self.data_dir) 64 | dfs = [] 65 | dfs_memory_usage = 0 66 | df = pd.DataFrame() 67 | for d in subdirs: 68 | self.classes[traffic_type] = d 69 | # print(d) 70 | files = os.listdir(self.data_dir + d) 71 | # i = 0 72 | for filename in files: 73 | f = self.data_dir + d + "/" + filename 74 | # print(filename) 75 | file_df = pd.read_csv(f, 76 | delimiter = '\t', 77 | names = ['timestamp', 'time_delta', 'packet_size', 'direction'] 78 | ) 79 | file_df['type'] = traffic_type 80 | file_df['src'] = filename 81 | dfs.append(file_df) 82 | # short hack: there is a trade-off between memory usage and speed 83 | # as much as possible DataFrames are insert in the dfs numpy array 84 | # which is much faster than pandas.concat, but if the dfs array grows 85 | # too big the process will be killed by oom_killer on Linux, so 86 | # once memory is above 2GB concat what we have in dfs 87 | dfs_memory_usage += file_df.memory_usage(deep = True).sum() 88 | if dfs_memory_usage > 2 * 1024 * 1024 * 1024: 89 | df = pd.concat([df, *dfs]) 90 | del dfs 91 | dfs = [] 92 | dfs_memory_usage = 0 93 | 94 | # i += 1 95 | # if i >= limit: 96 | # break 97 | 98 | traffic_type += 1 99 | 100 | df = pd.concat([df, *dfs]) 101 | df = df.fillna(0) 102 | 103 | del dfs 104 | print(f" flows data loaded in {time.time() - start_time} seconds") 105 | filename = self.filename_prefix + ".pickle" 106 | self._generate_data_folds(df, filename) 107 | 108 | print(df.columns) 109 | print(df.describe()) 110 | print(df.info) 111 | print(df.shape) 112 | 113 | def __get_flow_df(self, flow_df, traffic_type): 114 | # filter by direction 115 | #file_df = file_df[file_df['direction'] == 1] 116 | _df = flow_df['packet_size'] 117 | packet_size = sum(_df) 118 | min_packet_size = np.min(_df) 119 | max_packet_size = np.max(_df) 120 | mean_packet_size = np.mean(_df) 121 | median_packet_size = np.median(_df) 122 | std_packet_size = np.std(_df) 123 | Q1_packet_size = np.quantile(_df, 0.25) 124 | Q3_packet_size = np.quantile(_df, 0.75) 125 | _a = list(_df) 126 | skew_packet_size = skew(_a) 127 | kurt_packet_size = kurtosis(_a) 128 | 129 | min_time_delta = 
np.min(flow_df[flow_df['time_delta'] > 0]['time_delta']) 130 | _df = flow_df['time_delta'] 131 | time_delta = sum(_df) 132 | max_time_delta = np.max(_df) 133 | mean_time_delta = np.mean(_df) 134 | median_time_delta = np.median(_df) 135 | std_time_delta = np.std(_df) 136 | Q1_iat = np.quantile(_df, 0.25) 137 | Q3_iat = np.quantile(_df, 0.75) 138 | _a = list(_df) 139 | skew_iat = skew(_a) 140 | kurt_iat = kurtosis(_a) 141 | data = { 142 | 'sum_iat': [time_delta], 143 | 'sum_length': [packet_size], 144 | 'min_length': [min_packet_size], 145 | 'max_length': [max_packet_size], 146 | 'mean_length': [mean_packet_size], 147 | 'median_length': [median_packet_size], 148 | 'std_length': [std_packet_size], 149 | '1stQ_length': [Q1_packet_size], 150 | '3stQ_length': [Q3_packet_size], 151 | 'skew_length': [skew_packet_size], 152 | 'kurt_length': [kurt_packet_size], 153 | 'min_iat': [min_time_delta], 154 | 'max_iat': [max_time_delta], 155 | 'mean_iat': [mean_time_delta], 156 | 'median_iat': [median_time_delta], 157 | 'std_iat': [std_time_delta], 158 | '1stQ_iat': [Q1_iat], 159 | '3stQ_iat': [Q3_iat], 160 | 'skew_iat': [skew_iat], 161 | 'kurt_iat': [kurt_iat], 162 | 'nb_packets': [len(flow_df)], 163 | 'type': [traffic_type], 164 | #'direction': [flow_df['direction']] 165 | } 166 | _df = pd.DataFrame(data = data) 167 | _df.fillna(0) 168 | return _df 169 | 170 | def packets2flows_nofold(self): 171 | print("packets2flows_nofold") 172 | traffic_type = 0 173 | subdirs = os.listdir(self.data_dir) 174 | dfs = {} 175 | flow_id = 0 176 | for n in self.nb_packets_per_flow: 177 | dfs[n] = [] 178 | for d in subdirs: 179 | self.classes[traffic_type] = d 180 | files = os.listdir(self.data_dir + d) 181 | for filename in files: 182 | f = self.data_dir + d + "/" + filename 183 | # print(filename) 184 | file_df = pd.read_csv(f, 185 | delimiter = '\t', 186 | names = ['timestamp', 'time_delta', 'packet_size', 'direction'] 187 | ) 188 | file_df['type'] = traffic_type 189 | file_df['src'] = filename 190 | file_df['flow_id'] = flow_id 191 | flow_id += 1 192 | 193 | dfs[n].append(file_df.head(n = n)) 194 | 195 | traffic_type += 1 196 | for n in self.nb_packets_per_flow: 197 | df = pd.concat(dfs[n]) 198 | self._pickle_dump(df, "for_signatures_" + str(n) + "_flows_" + self.pickle_filename_suffix) 199 | 200 | def _get_flows_with_all_packets(self): 201 | print("_get_flows_with_all_packets") 202 | traffic_type = 0 203 | subdirs = os.listdir(self.data_dir) 204 | _df = [] 205 | start_time = time.time() 206 | for d in subdirs: 207 | self.classes[traffic_type] = d 208 | files = os.listdir(self.data_dir + d) 209 | for filename in files: 210 | f = self.data_dir + d + "/" + filename 211 | file_df = pd.read_csv( 212 | f, 213 | delimiter = '\t', 214 | names = ['timestamp', 'time_delta', 'packet_size', 'direction'] 215 | ) 216 | file_df['type'] = traffic_type 217 | file_df['src'] = filename 218 | _flow_df = self.__get_flow_df(file_df, traffic_type) 219 | 220 | _df.append(_flow_df) 221 | # uncomment for debugging behavior with a single file 222 | # break 223 | traffic_type += 1 224 | print(" processing took ", time.time() - start_time, "seconds.") 225 | _df = pd.concat(_df) 226 | _df =_df.fillna(0) 227 | return _df 228 | 229 | def packets2flows(self): 230 | print("packets2flows") 231 | traffic_type = 0 232 | subdirs = sorted(os.listdir(self.data_dir)) 233 | dfs = {} 234 | for n in self.nb_packets_per_flow: 235 | dfs[n] = [] 236 | idx_d = 0 237 | for d in subdirs: 238 | start_time = time.time() 239 | print("Processing directory #%d/%d: %s" % 
(idx_d, len(subdirs), d)) 240 | idx_d += 1 241 | self.classes[traffic_type] = d 242 | files = sorted(os.listdir(self.data_dir + d)) 243 | for filename in files: 244 | f = self.data_dir + d + "/" + filename 245 | # print(filename) 246 | file_df = pd.read_csv(f, 247 | delimiter = '\t', 248 | names = ['timestamp', 'time_delta', 'packet_size', 'direction'] 249 | ) 250 | file_df['type'] = traffic_type 251 | file_df['src'] = filename 252 | for n in self.nb_packets_per_flow: 253 | _flow_df = self.__get_flow_df(file_df.head(n = n), traffic_type) 254 | 255 | dfs[n].append(_flow_df) 256 | print(" ", d, "processed in ", time.time() - start_time, "seconds.") 257 | 258 | traffic_type += 1 259 | 260 | for n in self.nb_packets_per_flow: 261 | df = pd.concat(dfs[n]) 262 | df = df.fillna(0) 263 | seed = 42 264 | filename = self.filename_prefix + "_" + str(n) + ".pickle" 265 | # filename = self.filename_prefix + "_flows_" + str(n) + ".pickle" 266 | self._generate_data_folds(df, filename) 267 | 268 | def load_flows_nofold(self): 269 | from sklearn.model_selection import StratifiedKFold 270 | skf = StratifiedKFold(n_splits = self.nb_folds, shuffle = True, random_state = self.random_seed) 271 | for n in [4]: 272 | df = self._load_pickle("for_signatures_" + str(n) + "_flows_" + self.pickle_filename_suffix) 273 | X = df.drop('type', axis = 1) 274 | y = df['type'] 275 | 276 | for _i, (train_index, test_index) in enumerate(skf.split(X, y)): 277 | i = _i, n 278 | self.X_train_flows[i] = X.iloc[train_index].fillna(0) 279 | self.y_train_flows[i] = y.iloc[train_index].fillna(0) 280 | self.X_test_flows[i] = X.iloc[test_index].fillna(0) 281 | self.y_test_flows[i] = y.iloc[test_index].fillna(0) 282 | 283 | def load_packets(self, suffix): 284 | print("load_packets", suffix) 285 | start_time = time.time() 286 | 287 | for fold in EncryptedTrafficClassifierIterator(self.packet_ids): 288 | name = str(fold) + "_X_train_" + suffix 289 | self.X_train_packets[fold] = self._load_pickle(name) 290 | 291 | name = str(fold) + "_y_train_" + suffix 292 | self.y_train_packets[fold] = self._load_pickle(name) 293 | 294 | name = str(fold) + "_X_test_" + suffix 295 | self.X_test_packets[fold] = self._load_pickle(name) 296 | 297 | name = str(fold) + "_y_test_" + suffix 298 | self.y_test_packets[fold] = self._load_pickle(name) 299 | print(f" packets data loaded in {time.time() - start_time} seconds") 300 | 301 | def LogReg_predict(X_train, y_train, X_test, y_test, ): 302 | lr = LogisticRegression(penalty='none', solver='newton-cg') 303 | lr.fit(X, y) 304 | metrics.plot_roc_curve(lr, X, y) 305 | plt.plot([0, 1], [0, 1], "-") 306 | plt.show() 307 | 308 | 309 | display(metrics.roc_auc_score(y, lr.predict_proba(X)[:, 1])) 310 | display(metrics.confusion_matrix(y, lr.predict_proba(X)[:, 1]>0.5)) 311 | 312 | print("train score = %f" % (lr.score(X_train, y_train))) 313 | print("test score = %f" % (lr.score(X_test, y_test))) 314 | 315 | ######################################## 316 | # Entry point 317 | ######################################## 318 | if __name__ == "__main__": 319 | parser = argparse.ArgumentParser( 320 | prog='ucdavis_quic_traffic_classifier', 321 | description='Classify packets or flows from UCDavis QUIC dataset', 322 | epilog='' 323 | ) 324 | parser.add_argument('-p', '--nb_packets', action = 'append', type = int, required = True) 325 | parser.add_argument('-c', '--classifier', action = 'append', type = str) 326 | parser.add_argument('-f', '--nb_folds', action = 'store', default = 12, type = int) 327 | parser.add_argument('-v', 
'--visualization', action = 'store_true', required = False, default = False) 328 | parser.add_argument('-r', '--report', action = 'store_true', required = False, default = False) 329 | parser.add_argument('-F', '--force_rf_classification', action = 'store_true', required = False, default = False) 330 | args = parser.parse_args(sys.argv[1:]) 331 | 332 | VISUALIZATION_ENABLED = False 333 | if args.visualization == True: 334 | VISUALIZATION_ENABLED = True 335 | 336 | RF_ENABLED = False 337 | GB_ENABLED = False 338 | XG_ENABLED = False 339 | 340 | for c in args.classifier: 341 | print(c) 342 | if c == "rf": 343 | RF_ENABLED = True 344 | elif c == "gb": 345 | GB_ENABLED = True 346 | elif c == "xg": 347 | XG_ENABLED = True 348 | else: 349 | print("Unknown classifier", c) 350 | 351 | classifier = UCDavisQuicClassifier( 352 | nb_folds = args.nb_folds, 353 | nb_packets_per_flow = args.nb_packets 354 | ) 355 | FORCE_RF_CLASSIFICATION = False 356 | if args.force_rf_classification == True: 357 | classifier.force_rf_classification = True 358 | 359 | classifier.all_classes = [ 360 | "Google Doc", 361 | "Google Drive", 362 | "Google Music", 363 | "Google Search", 364 | "Youtube", 365 | ] 366 | if REGENERATE_DATA_FOR_SIGNATURES: 367 | classifier.packets2flows_nofold() 368 | sys.exit(1) 369 | 370 | if REGENERATE_FLOWS_DATA: 371 | classifier.load_packets(classifier.pickle_filename_suffix) 372 | classifier.packets2flows() 373 | sys.exit(1) 374 | 375 | # data preparation, convert raw data to pickle file, split in StratifiedKFold X_train, y_train, X_test, y_test 376 | if not classifier.data_prepared(): 377 | # classifier.data_preparation() 378 | classifier.packets2flows() 379 | else: 380 | subdirs = os.listdir(classifier.data_dir) 381 | traffic_type = 0 382 | for d in subdirs: 383 | classifier.classes[traffic_type] = d 384 | traffic_type += 1 385 | 386 | non_needed_features = [ 387 | 'timestamp', 388 | 'direction', 389 | # "nb_packets" 390 | ] 391 | # non_needed_features += [ 392 | # 'min_length', 'max_length', 393 | # 'mean_length', 'std_length', 394 | # '1stQ_length', 395 | # '3stQ_length', 396 | # 'skew_length', 397 | # 'kurt_length', 398 | # 'min_iat', 399 | # 'max_iat', 'mean_iat', 'std_iat', 400 | # '1stQ_iat', 401 | # '3stQ_iat', 402 | # 'skew_iat', 403 | # 'kurt_iat', 404 | # ] 405 | 406 | if TEST_PACKETS: 407 | classifier.load_packets(classifier.pickle_filename_suffix) 408 | classifier.cleanup_data(classifier.X_train_packets, 409 | classifier.y_train_packets, 410 | classifier.X_test_packets, 411 | classifier.y_test_packets, 412 | classifier.packet_ids, 413 | non_needed_features) 414 | 415 | if TEST_FLOWS: 416 | classifier.load_flows() 417 | _c = classifier.y_train_flows[(classifier.nb_packets_per_flow[0], 0)].unique() 418 | classifier.classes = [] 419 | for i in _c: 420 | classifier.classes.append(classifier.all_classes[i]) 421 | classifier.cleanup_data(classifier.X_train_flows, 422 | classifier.y_train_flows, 423 | classifier.X_test_flows, 424 | classifier.y_test_flows, 425 | classifier.flow_ids, 426 | non_needed_features) 427 | 428 | # __correlation() 429 | if args.report == True: 430 | classifier._viz(distribution = 0, class_distribution = -1, nb_packets = -1, min_iat = -1, max_iat = -1) 431 | for n in classifier.nb_packets_per_flow: 432 | if n == 4: 433 | classifier._viz(distribution = -1, class_distribution = 11, nb_packets = 0, min_iat = -1, max_iat = -1) 434 | elif n == 8: 435 | classifier._viz(distribution = -1, class_distribution = 11, nb_packets = 0, min_iat = -1, max_iat = -1) 436 | elif n 
== 600000: 437 | classifier._viz(distribution = -1, class_distribution = 0, nb_packets = 0, min_iat = 1, max_iat = -1) 438 | sys.exit(1) 439 | if VISUALIZATION_ENABLED: 440 | pkt = classifier.nb_packets_per_flow[0] 441 | fold = 0 442 | _i = pkt, fold 443 | _df1 = classifier.X_train_flows[_i].copy() 444 | _df1['type'] = classifier.y_train_flows[_i] 445 | _df2 = classifier.X_test_flows[_i].copy() 446 | _df2['type'] = classifier.y_test_flows[_i] 447 | _df = pd.concat([_df1, _df2]) 448 | _df.reset_index() 449 | print(_df.shape) 450 | print(_df['type'].value_counts()) 451 | classifier._distribution(_df, classifier.filename_prefix + "_flows_class_split_" + str(pkt) + '_pkt') 452 | # classifier._class_distribution(_df, classifier.filename_prefix + '_flows_distribution_' + str(pkt) + '_pkt') 453 | # classifier._nb_packets_distribution(_df, classifier.filename_prefix + "_flows_nb_packets_distribution_" + str(pkt) + '_pkt') 454 | classifier._min_iat_distribution(_df, classifier.filename_prefix + "_flows_min_iat_distribution_" + str(pkt) + '_pkt') 455 | 456 | 457 | all_features_flows = ['sum_iat', 'sum_length', 'min_length', 'max_length', 458 | 'mean_length', 'median_length', 'std_length', 459 | '1stQ_length', 460 | '3stQ_length', 461 | 'skew_length', 462 | 'kurt_length', 463 | 'min_iat', 464 | 'max_iat', 'mean_iat', 'median_iat', 'std_iat', 465 | '1stQ_iat', 466 | '3stQ_iat', 467 | 'skew_iat', 468 | 'kurt_iat', 469 | 'nb_packets' 470 | ] 471 | 472 | basic_features_flows = ['sum_iat', 'sum_length', 'min_length', 'max_length', 473 | 'mean_length', 'std_length', 474 | 'min_iat', 475 | 'max_iat', 'mean_iat', 'std_iat', 'nb_packets' 476 | ] 477 | 478 | all_features_packets = ['sum_iat', 'sum_length'] 479 | # feats_flows = all_features_flows 480 | feats_flows = basic_features_flows 481 | feats_packets = all_features_packets 482 | 483 | # classification based on flows 484 | if TEST_FLOWS: 485 | # classifier.X_train_flows_fitted, classifier.X_test_flows_fitted = classifier.preprocessing( 486 | # classifier.X_train_flows, 487 | # classifier.y_train_flows, 488 | # classifier.X_test_flows, 489 | # classifier.y_test_flows, 490 | # classifier.flow_ids, 491 | # feats_flows 492 | # ) 493 | classifier.X_train_flows_fitted = classifier.X_train_flows 494 | classifier.X_test_flows_fitted = classifier.X_test_flows 495 | if RF_ENABLED: 496 | rf_regr_flows, rf_y_train_flows_predicted, rf_y_test_flows_predicted, rf_y_test_flows_isolated_predicted = classifier.RF_predict( 497 | classifier.X_train_flows_fitted, 498 | classifier.y_train_flows, 499 | classifier.X_test_flows_fitted, 500 | classifier.y_test_flows 501 | ) 502 | rf_cm_dict_flows, output = classifier.confusion_matrix(rf_regr_flows, 503 | classifier.y_test_flows, 504 | rf_y_test_flows_predicted, 505 | classifier.flow_ids, 506 | "rf" 507 | ) 508 | print(output) 509 | rf_f1_scores_flows, output = classifier.get_F1_score(rf_cm_dict_flows, 510 | classifier.y_test_flows, 511 | rf_y_test_flows_predicted, 512 | classifier.flow_ids, 513 | "rf_flows") 514 | print(output) 515 | avg_scores, output = classifier.avg_f1_scores(rf_f1_scores_flows, classifier.flow_ids) 516 | print(output) 517 | 518 | ###### 519 | 520 | cm_dict = {} 521 | cm_dict_normalized = {} 522 | rf_F1 = {} 523 | skl_F1 = {} 524 | 525 | print("== isolated ==\n") 526 | from sklearn.metrics import f1_score, confusion_matrix 527 | print("classifier.y_test_isolated_flows =", classifier.y_test_isolated_flows.shape) 528 | print("rf_y_test_flows_isolated_predicted =", rf_y_test_flows_isolated_predicted) 529 | for i 
in EncryptedTrafficClassifierIterator(classifier.flow_ids): 530 | output = ("== %s ==\n" % str(i)) 531 | cm_dict = confusion_matrix(classifier.y_test_isolated_flows, rf_y_test_flows_isolated_predicted[i]) 532 | output += str(cm_dict) + '\n' 533 | cm_dict_normalized = confusion_matrix(classifier.y_test_isolated_flows, rf_y_test_flows_isolated_predicted[i], normalize = 'true') 534 | output += str(cm_dict_normalized) + '\n' 535 | print(output) 536 | 537 | output = "" 538 | 539 | cm = cm_dict 540 | FP = cm.sum(axis=0) - np.diag(cm) 541 | FN = cm.sum(axis=1) - np.diag(cm) 542 | TP = np.diag(cm) 543 | TN = cm.sum() - (FP + FN + TP) 544 | rf_F1[i] = 2 * (TP) / (2 * TP + FP + FN) * 100 545 | output += ("FP = %s\n" % str(FP)) 546 | output += ("FN = %s\n" % str(FN)) 547 | output += ("TP = %s\n" % str(TP)) 548 | output += ("TN = %s\n" % str(TN)) 549 | if len(classifier.y_test_isolated_flows) > 0: 550 | skl_F1 = f1_score(classifier.y_test_isolated_flows, rf_y_test_flows_isolated_predicted[i], average = 'micro') 551 | output += ("skl_F1 = %s\n" % str(skl_F1)) 552 | output += "\n" 553 | for j in range(len(classifier.classes)): 554 | t = classifier.classes[j] 555 | try: 556 | output += ("for type %s \t\t F1 = %.2f\n" % (t, rf_F1[j])) 557 | except IndexError as e: 558 | pass 559 | except KeyError as e: 560 | pass 561 | output += "\n" 562 | print(output) 563 | 564 | output ="" 565 | f1 = {} 566 | for i in EncryptedTrafficClassifierIterator(classifier.flow_ids): 567 | pkt, _ = i 568 | f1[pkt] = [0 for _ in range(len(classifier.classes))] 569 | for i in EncryptedTrafficClassifierIterator(classifier.flow_ids): 570 | pkt, _ = i 571 | for j in range(len(classifier.classes)): 572 | try: 573 | f1[pkt][j] += rf_F1[i][j] 574 | except KeyError as e: 575 | continue 576 | except IndexError as e: 577 | continue 578 | 579 | avg_scores = {} 580 | output = "" 581 | for pkt in classifier.nb_packets_per_flow: 582 | output += f"for {pkt} packets\n" 583 | for j in range(len(classifier.classes)): 584 | t = classifier.classes[j] 585 | avg_scores[(pkt, t)] = f1[pkt][j] / classifier.nb_folds 586 | output += "average for type %s [%d] \t\t F1 = %.2f\n" % (t, j, avg_scores[(pkt, t)]) 587 | output += "\n" 588 | print(output) 589 | #### 590 | 591 | if XG_ENABLED: 592 | print("==== XGBoost =====") 593 | xg_regr, xg_y_train_predicted, xg_y_test_flows_predicted = classifier.XGBoost_predict( 594 | classifier.X_train_flows_fitted, 595 | classifier.y_train_flows, 596 | classifier.X_test_flows_fitted, 597 | classifier.y_test_flows 598 | ) 599 | 600 | # feats_flows, classification_results) 601 | # xg_cm_dict = classifier.confusion_matrix(xg_regr, xg_y_test_predicted, False) 602 | xg_cm_dict_flows, output = classifier.confusion_matrix(xg_regr, 603 | classifier.y_test_flows, 604 | xg_y_test_flows_predicted, 605 | classifier.flow_ids, 606 | "xg" 607 | ) 608 | print(output) 609 | 610 | xg_f1_scores_flows, output = classifier.get_F1_score( 611 | xg_cm_dict_flows, 612 | classifier.y_test_flows, 613 | xg_y_test_flows_predicted, 614 | classifier.flow_ids, 615 | "xg_flows") 616 | print(output) 617 | avg_scores, output = classifier.avg_f1_scores(xg_f1_scores_flows, classifier.flow_ids) 618 | print(output) 619 | # classification based on Packets 620 | if TEST_PACKETS: 621 | classifier.X_train_packets_fitted, classifier.X_test_packets_fitted = classifier.preprocessing( 622 | classifier.X_train_packets, 623 | classifier.y_train_packets, 624 | classifier.X_test_packets, 625 | classifier.y_test_packets, 626 | classifier.packet_ids, 627 | feats_packets 628 | 
) 629 | if RF_ENABLED: 630 | rf_regr_packets, rf_y_train_packets_predicted, rf_y_test_packets_predicted = classifier.RF_predict( 631 | classifier.X_train_packets_fitted, 632 | classifier.y_train_packets, 633 | classifier.X_test_packets_fitted, 634 | classifier.y_test_packets, 635 | classifier.packet_ids, 636 | ) 637 | for i in EncryptedTrafficClassifierIterator(classifier.packet_ids): 638 | print("Feature ranking:") 639 | importances = rf_regr_flows[i].best_estimator_.named_steps["rf"].feature_importances_ 640 | std = np.std([tree.feature_importances_ for tree in rf_regr_flows[i].best_estimator_.named_steps["rf"].estimators_], 641 | axis=0) 642 | indices = np.argsort(importances)[::-1] 643 | for f in range(classifier.X_train_flows[i].shape[1]): 644 | print("%d. feature %s (%f)" % (f + 1, classifier.X_train_flows[i].columns[indices[f]], importances[indices[f]])) 645 | # __show_actual_and_predicted(X_test, y_test, rf_y_test_predicted, 1) 646 | # __analyze_CHAT(X_train, y_train, rf_y_train_predicted) 647 | # __analyze_CHAT(X_test, y_test, rf_y_test_predicted) 648 | rf_cm_dict, output = classifier.confusion_matrix(rf_regr_packets, 649 | classifier.y_test_packets, 650 | rf_y_test_packets_predicted, 651 | classifier.packet_ids) 652 | print(output) 653 | rf_f1_scores, output = classifier.get_F1_score(rf_cm_dict, 654 | classifier.y_test_packets, 655 | rf_y_test_packets_predicted, 656 | classifier.packet_ids, 657 | "rf_packets") 658 | print(output) 659 | classifier.avg_f1_scores(rf_f1_scores, classifier.packet_ids) 660 | else: 661 | print("CLASSIFICATION BASED ON PACKETS NOT ENABLED") 662 | 663 | print(classifier.classification_results) 664 | if RF_ENABLED or GB_ENABLED or XG_ENABLED: 665 | classifier.save_results() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Encrypted Traffic Classification at Line Rate in Programmable Switches with Machine Learning 2 | 3 | This repository contains the source code for our work on Encrypted Traffic Classification (ETC) in programmable switches with P4 and Machine Learning, appearing in the Proceedings of IEEE/IFIP NOMS 2024, 6–10 May 2024, Seoul, South Korea. 4 | 5 | ## Overview of the ETC framework 6 | ETC Overview 7 | 8 | This work leverages recent advances in data plane programmability to achieve real-time ETC in programmable switches at line rate, with high throughput and low latency. The proposed solution comprises (i) an ETC-aware Random Forest (RF) modelling process where only features based on packet size and packet arrival times are used, and (ii) an encoding of the trained RF model into production-grade P4-programmable switches. 9 | 10 | For full details, please consult [our paper](https://dspace.networks.imdea.org/bitstream/handle/20.500.12761/1791/etc_noms24_postprint.pdf?sequence=1&isAllowed=y). 11 | 12 | An extended version is currently in submission as an invited paper to a journal. 13 | 14 | ## Organization of the repository 15 | There are two folders: 16 | 17 | - _In_switch_ETC_ : the python and P4 code for the training and encoding of the in-switch RF models for RF. 18 | - _Offline_ETC_ : the python code for the offline data analysis and ETC modelling process. 19 | 20 | ## Use cases 21 | The use cases considered in the paper are: 22 | - QUIC traffic classification based on the publicly available Netflow QUIC dataset. The challenge is classifying traffic into one of 5 classes. 
23 | - Encrypted instant messaging application fingerprinting with 6 classes, based on the Encrypted Instant Messaging Dataset made available by the NIMS Lab. 24 | - VPN traffic classification, distinguishing 7 classes, based on the ISCX-VPN-NonVPN-2016 Dataset. 25 | 26 | We provide the Python and P4 code for the Encrypted Instant Messaging App classification use case with 6 classes.
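For reference, the offline classifier for this use case can be launched from the _Offline_ETC_ folder as in the example below (a sample invocation only: the `-p`, `-c` and `-f` flags come from the script's argparse definition, and the raw per-packet CSV files are expected under `data/noms2023_im/`):
```
python3 noms2023_instant_messaging_traffic_classifier.py -p 4 -p 8 -c rf -f 12
```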
The same approach for feature/model selection and encoding to P4 applies to all the use cases. 27 | 28 | ## Citation 29 | If you make use of this code, kindly cite our paper: 30 | ``` 31 | @inproceedings{etc-noms-2024, 32 | author={Akem, Aristide Tanyi-Jong and Fraysse, Guillaume and Fiore, Marco}, 33 | booktitle={NOMS 2024-2024 IEEE Network Operations and Management Symposium}, 34 | title={Encrypted Traffic Classification at Line Rate in Programmable Switches with Machine Learning}, 35 | year={2024}, 36 | volume={}, 37 | number={}, 38 | pages={1-9}, 39 | doi={10.1109/NOMS59830.2024.10575394}} 40 | 41 | ``` 42 | 43 | If you need any additional information, send us an email at _aristide.akem_ at _imdea.org_. 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /etc_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nds-group/ETC_NOMS_2024/1ad4bbd4e1c097e87625b185e6d0ba14fa201bbb/etc_framework.png --------------------------------------------------------------------------------