├── .gitignore ├── README.md ├── classification.py ├── pcap_packet_features.py ├── pcap_parser.py └── print_packets.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | /*.csv 106 | 107 | *.png 108 | 109 | *.sh 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Based IoT Intrusion Detection System: An MQTT Case Study 2 | 3 | This work uses six different machine learning techniques to classify attacks in an MQTT network.
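Before classification, features are extracted from the raw pcap captures by two scripts in this repository: `pcap_packet_features.py` builds the packet-level features with pyshark, and `pcap_parser.py` builds the unidirectional (`uniflow_*.csv`) and bidirectional (`biflow_*.csv`) flow features with dpkt. Roughly, an end-to-end run looks like the following (the pcap file name is a placeholder; the processed CSVs used by `classification.py` can also be taken directly from the dataset below):

```
python pcap_packet_features.py --root ./ --attacker_ip 192.168.2.5
python pcap_parser.py capture.pcap 0
python classification.py --mode 2 --output Classification_Bi --verbose True
```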
4 | 5 | ## Dataset Used 6 | The used dataset is published in [IEEE DataPort](https://ieee-dataport.org/open-access/mqtt-internet-things-intrusion-detection-dataset) 7 | 8 | ```` 9 | @data{bhxy-ep04-20, 10 | doi = {10.21227/bhxy-ep04}, 11 | url = {http://dx.doi.org/10.21227/bhxy-ep04}, 12 | author = {Hanan Hindy; Christos Tachtatzis; Robert Atkinson; Ethan Bayne; Xavier Bellekens }, 13 | publisher = {IEEE Dataport}, 14 | title = {MQTT Internet of Things Intrusion Detection Dataset}, 15 | year = {2020} } 16 | ```` 17 | 18 | ## Citation 19 | ``` 20 | @article{hindy2020machine, 21 | title={Machine Learning Based IoT Intrusion Detection System: An MQTT Case Study}, 22 | author={Hindy, Hanan and Bayne, Ethan and Bures, Miroslav and Atkinson, Robert and Tachtatzis, Christos and Bellekens, Xavier}, 23 | journal={arXiv preprint arXiv:2006.15340}, 24 | year={2020} 25 | } 26 | ```` 27 | 28 | # Algorithms Used 29 | - Logistic Regression 30 | - k-Nearest Neighbours 31 | - Gaussian Naive Bayes 32 | - Decision Trees 33 | - Random Forests 34 | - Support Vector Machine (linear and RBF kernel) 35 | 36 | 37 | ## How to Run it: 38 | 39 | ``` 40 | Clone this repository 41 | Download dataset files and extract them in the same directory 42 | run classification.py --mode [0: packet, 1: unidirectional, 2: bidirectional] --output [output_folder] --verbose [True/False] 43 | ``` 44 | - The classification outputs are added to the output folder. 45 | -------------------------------------------------------------------------------- /classification.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Aug 29 12:14:12 2019 5 | 6 | @author: hananhindy 7 | """ 8 | import pandas as pd 9 | import numpy as np 10 | import os 11 | import argparse 12 | 13 | from sklearn.preprocessing import OneHotEncoder 14 | from sklearn.linear_model import LogisticRegression 15 | from sklearn.neighbors import KNeighborsClassifier 16 | from sklearn.svm import SVC, LinearSVC 17 | from sklearn.naive_bayes import GaussianNB 18 | from sklearn.tree import DecisionTreeClassifier 19 | from sklearn.ensemble import RandomForestClassifier 20 | from sklearn.model_selection import train_test_split, StratifiedKFold 21 | from sklearn.metrics import classification_report 22 | 23 | # Helper Function 24 | def str2bool(v): 25 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 26 | return True 27 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 28 | return False 29 | else: 30 | raise argparse.ArgumentTypeError('Boolean value expected.') 31 | 32 | #protocols = ['ARP', 'CDP', 'CLDAP', 'DATA', 'DNS', 'DTLS', 'DTP', 'ECHO', 'ICMP', 'ISAKMP','MDNS', 'NAT-PMP', 'NBNS', 'NFS', 'NTP', 'PORTMAP', 'RADIUS', 'RIP', 'SRVLOC', 'SNMP', 'SSH', 'STP', 'TCP', 'UDP', 'XDMCP', 'MQTT', 'MPEG_PMT', 'MP2T', 'MPEG_PAT', 'DVB_SDT'] 33 | #label_encoder = LabelEncoder().fit(protocols) 34 | 35 | one_hot_encoder = None 36 | 37 | def load_file(path, mode, is_attack = 1, label = 1, folder_name='Bi/', sliceno = 0, verbose = True): 38 | #global label_encoder 39 | global one_hot_encoder 40 | 41 | #attacker_ips = ['192.168.2.5'] 42 | 43 | columns_to_drop_packet = ['timestamp', 'src_ip', 'dst_ip'] 44 | columns_to_drop_uni = ['proto', 'ip_src', 'ip_dst'] 45 | columns_to_drop_bi = ['proto', 'ip_src', 'ip_dst'] 46 | 47 | if os.path.getsize(path)//10 ** 9 > 0: 48 | x = np.zeros((0,0)) 49 | for chunk in pd.read_csv(path, chunksize=10 ** 6): 50 | chunk.drop(columns = columns_to_drop_packet, 
inplace = True) 51 | chunk = chunk[chunk.columns.drop(list(chunk.filter(regex='mqtt')))] 52 | 53 | chunk = chunk.fillna(-1) 54 | 55 | with open(folder_name + 'instances_count.csv','a') as f: 56 | f.write('{}, {} \n'.format(path, chunk.shape[0])) 57 | 58 | x_temp = chunk.loc[chunk['is_attack'] == is_attack] 59 | x_temp.drop('is_attack', axis = 1, inplace = True) 60 | #x_temp['protocol'] = label_encoder.transform(x_temp['protocol']) 61 | if one_hot_encoder is None: 62 | one_hot_encoder = OneHotEncoder(categorical_features=[0], n_values=30) 63 | x_temp = one_hot_encoder.fit_transform(x_temp).toarray() 64 | else: 65 | x_temp = one_hot_encoder.transform(x_temp).toarray() 66 | 67 | x_temp = np.unique(x_temp, axis = 0) 68 | 69 | if x.size == 0: 70 | x = x_temp 71 | else: 72 | x = np.concatenate((x, x_temp), axis = 0) 73 | x = np.unique(x, axis = 0) 74 | else: 75 | dataset = pd.read_csv(path) 76 | 77 | if mode == 1 or mode == 2: 78 | dataset = dataset.loc[dataset['is_attack'] == is_attack] 79 | # if is_attack == 0: 80 | # dataset = dataset.loc[operator.and_(dataset['ip_src'].isin(attacker_ips) == False, dataset['ip_dst'].isin(attacker_ips) == False)] 81 | # else: 82 | # dataset = dataset.loc[operator.or_(dataset['ip_src'].isin(attacker_ips), dataset['ip_dst'].isin(attacker_ips))] 83 | # 84 | if mode == 0: 85 | dataset.drop(columns = columns_to_drop_packet, inplace = True) 86 | dataset = dataset[dataset.columns.drop(list(dataset.filter(regex='mqtt')))] 87 | elif mode == 1: 88 | dataset.drop(columns = columns_to_drop_uni, inplace = True) 89 | elif mode == 2: 90 | dataset.drop(columns = columns_to_drop_bi, inplace = True) 91 | 92 | if verbose: 93 | print(dataset.columns) 94 | 95 | dataset = dataset.fillna(-1) 96 | 97 | if mode == 0: 98 | x = dataset.loc[dataset['is_attack'] == is_attack] 99 | x.drop('is_attack', axis=1, inplace=True) 100 | #x['protocol'] = label_encoder.transform(x['protocol']) 101 | if one_hot_encoder is None: 102 | one_hot_encoder = OneHotEncoder(categorical_features=[0], n_values=30) 103 | x = one_hot_encoder.fit_transform(x).toarray() 104 | else: 105 | x = one_hot_encoder.transform(x).toarray() 106 | else: 107 | x = dataset.values 108 | 109 | with open(folder_name + 'instances_count.csv','a') as f: 110 | f.write('all, {}, {} \n'.format(path, x.shape[0])) 111 | 112 | x = np.unique(x, axis = 0) 113 | 114 | with open(folder_name + 'instances_count.csv','a') as f: 115 | f.write('unique, {}, {} \n'.format(path, x.shape[0])) 116 | 117 | if (mode == 1 and x.shape[0] > 100000) or (mode == 2 and x.shape[0] > 50000): 118 | temp = x.shape[0] // 10 119 | start = sliceno * temp 120 | end = start + temp - 1 121 | x = x[start:end,:] 122 | with open(folder_name + 'instances_count.csv','a') as f: 123 | f.write('Start, {}, End, {} \n'.format(start, end)) 124 | elif mode == 0: 125 | if x.shape[0] > 15000000: 126 | temp = x.shape[0] // 400 127 | start = sliceno * temp 128 | end = start + temp - 1 129 | x = x[start:end,:] 130 | with open(folder_name + 'instances_count.csv','a') as f: 131 | f.write('Start, {}, End, {} \n'.format(start, end)) 132 | elif x.shape[0] > 10000000: 133 | temp = x.shape[0] // 200 134 | start = sliceno * temp 135 | end = start + temp - 1 136 | x = x[start:end,:] 137 | with open(folder_name + 'instances_count.csv','a') as f: 138 | f.write('Start, {}, End, {} \n'.format(start, end)) 139 | elif x.shape[0] > 100000: 140 | temp = x.shape[0] // 10 141 | start = sliceno * temp 142 | end = start + temp - 1 143 | x = x[start:end,:] 144 | with open(folder_name + 
'instances_count.csv','a') as f: 145 | f.write('Start, {}, End, {} \n'.format(start, end)) 146 | 147 | 148 | y = np.full(x.shape[0], label) 149 | 150 | with open(folder_name + 'instances_count.csv','a') as f: 151 | f.write('slice, {}, {} \n'.format(path, x.shape[0])) 152 | 153 | return x, y 154 | 155 | def classify_sub(classifier, x_train, y_train, x_test, y_test, cm_file_name, summary_file_name, classifier_name, verbose = True): 156 | classifier.fit(x_train, y_train) 157 | pred = classifier.predict(x_test) 158 | 159 | cm = pd.crosstab(y_test, pred) 160 | cm.to_csv(cm_file_name) 161 | 162 | pd.DataFrame(classification_report(y_test, pred, output_dict = True)).transpose().to_csv(summary_file_name) 163 | 164 | if verbose: 165 | print(classifier_name + ' Done.\n') 166 | 167 | del classifier 168 | del pred 169 | del cm 170 | 171 | def classify(random_state, x_train, y_train, x_test, y_test, folder_name, prefix = "", verbose = True): 172 | confusion_matrix_folder = os.path.join(folder_name, 'Confusion_Matrix/') 173 | summary_folder = os.path.join(folder_name, 'Summary/') 174 | 175 | if os.path.isdir(confusion_matrix_folder) == False: 176 | os.mkdir(confusion_matrix_folder) 177 | if os.path.isdir(summary_folder) == False: 178 | os.mkdir(summary_folder) 179 | 180 | # 1- Linear 181 | linear_classifier = LogisticRegression(random_state = random_state) 182 | classify_sub(linear_classifier, 183 | x_train, y_train, 184 | x_test, y_test, 185 | confusion_matrix_folder + prefix + '_cm_linear.csv', 186 | summary_folder + prefix + '_summary_linear.csv', 187 | 'Linear', 188 | verbose) 189 | 190 | # 2- KNN 191 | knn_classifier = KNeighborsClassifier() 192 | classify_sub(knn_classifier, 193 | x_train, y_train, 194 | x_test, y_test, 195 | confusion_matrix_folder + prefix + '_cm_knn.csv', 196 | summary_folder + prefix + '_summary_knn.csv', 197 | 'KNN', 198 | verbose) 199 | 200 | #3- RBF SVM 201 | kernel_svm_classifier = SVC(kernel = 'rbf', random_state = random_state, gamma='scale') 202 | classify_sub(kernel_svm_classifier, 203 | x_train, y_train, 204 | x_test, y_test, 205 | confusion_matrix_folder + prefix + '_cm_kernel_svm.csv', 206 | summary_folder + prefix + '_summary_kernel_svm.csv', 207 | 'SVM', 208 | verbose) 209 | 210 | #4- Naive Bayes 211 | naive_classifier = GaussianNB() 212 | classify_sub(naive_classifier, 213 | x_train, y_train, 214 | x_test, y_test, 215 | confusion_matrix_folder + prefix + '_cm_naive.csv', 216 | summary_folder + prefix + '_summary_naive.csv', 217 | 'Naive', 218 | verbose) 219 | 220 | #5- Decision Tree 221 | decision_tree_classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = random_state) 222 | classify_sub(decision_tree_classifier, 223 | x_train, y_train, 224 | x_test, y_test, 225 | confusion_matrix_folder + prefix + '_cm_decision_tree.csv', 226 | summary_folder + prefix + '_summary_decision_tree.csv', 227 | 'Decision Tree', 228 | verbose) 229 | 230 | #6- Random Forest 231 | random_forest_classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = random_state) 232 | classify_sub(random_forest_classifier, 233 | x_train, y_train, 234 | x_test, y_test, 235 | confusion_matrix_folder + prefix + '_cm_random_forest.csv', 236 | summary_folder + prefix + '_summary_random_forest.csv', 237 | 'Random Forest', 238 | verbose) 239 | 240 | # 7- Linear SVM 241 | svm_classifier = LinearSVC(random_state = random_state) 242 | classify_sub(svm_classifier, 243 | x_train, y_train, 244 | x_test, y_test, 245 | confusion_matrix_folder + prefix + 
'_cm_svm.csv', 246 | summary_folder + prefix + '_summary_svm.csv', 247 | 'SVM', 248 | verbose) 249 | 250 | if __name__ == "__main__": 251 | parser = argparse.ArgumentParser() 252 | parser.add_argument('--mode', type = int, default = 2) 253 | parser.add_argument('--output', default='Classification_Bi') 254 | parser.add_argument('--verbose', type = str2bool, default = True) 255 | 256 | args = parser.parse_args() 257 | 258 | for slice_number in range(10): 259 | prefix = '' 260 | if args.mode == 1: 261 | prefix = 'uniflow_' 262 | elif args.mode == 2: 263 | prefix = 'biflow_' 264 | 265 | if args.verbose: 266 | print('Starting Slice #: {}'.format(slice_number)) 267 | print('Start Classification') 268 | 269 | random_state = 0 270 | folder_name = '{}_{}/'.format(args.output, slice_number) 271 | 272 | if os.path.isdir(folder_name) == False: 273 | os.mkdir(folder_name) 274 | 275 | x, y = load_file(prefix + 'normal.csv', 276 | args.mode, 277 | 0, 0, 278 | folder_name, 279 | slice_number, 280 | args.verbose) 281 | 282 | x_temp, y_temp = load_file(prefix + 'scan_A.csv', 283 | args.mode, 284 | 1, 1, 285 | folder_name, 286 | slice_number, 287 | args.verbose) 288 | 289 | x = np.concatenate((x, x_temp), axis = 0) 290 | y = np.append(y, y_temp) 291 | del x_temp, y_temp 292 | 293 | x_temp, y_temp = load_file(prefix + 'scan_sU.csv', 294 | args.mode, 295 | 1, 2, 296 | folder_name, 297 | slice_number, 298 | args.verbose) 299 | 300 | x = np.concatenate((x, x_temp), axis = 0) 301 | y = np.append(y, y_temp) 302 | del x_temp, y_temp 303 | 304 | x_temp, y_temp = load_file(prefix + 'sparta.csv', 305 | args.mode, 306 | 1, 3, 307 | folder_name, 308 | slice_number, 309 | args.verbose) 310 | 311 | x = np.concatenate((x, x_temp), axis = 0) 312 | y = np.append(y, y_temp) 313 | del x_temp, y_temp 314 | 315 | x_temp, y_temp = load_file(prefix + 'mqtt_bruteforce.csv', 316 | args.mode, 317 | 1, 4, 318 | folder_name, 319 | slice_number, 320 | args.verbose) 321 | 322 | x = np.concatenate((x, x_temp), axis = 0) 323 | y = np.append(y, y_temp) 324 | del x_temp, y_temp 325 | 326 | x_train, x_test, y_train, y_test = train_test_split(x, y, 327 | test_size = 0.25, 328 | random_state = 42) 329 | 330 | classify(random_state, x_train, y_train, x_test, y_test, 331 | folder_name, "slice_{}_no_cross_validation".format(slice_number), args.verbose) 332 | 333 | kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0) 334 | 335 | counter = 0 336 | for train, test in kfold.split(x, y): 337 | classify(random_state, x[train], y[train], x[test], y[test], 338 | folder_name, "slice_{}_k_{}".format(slice_number, counter), args.verbose) 339 | counter += 1 340 | 341 | del x 342 | del y 343 | del x_train 344 | del x_test 345 | del y_train 346 | del y_test 347 | -------------------------------------------------------------------------------- /pcap_packet_features.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Aug 26 14:22:32 2019 5 | 6 | @author: hananhindy 7 | """ 8 | 9 | import pyshark 10 | import csv 11 | import argparse 12 | import traceback 13 | import os 14 | 15 | def str2bool(v): 16 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 17 | return True 18 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 19 | return False 20 | else: 21 | raise argparse.ArgumentTypeError('Boolean value expected.') 22 | 23 | validation_attributes = ['timestamp', 24 | 'src_ip', 'dst_ip' 25 | ] 26 | 27 | attributes = ['protocol', 28 | 
'ttl', 'ip_len', 29 | 'ip_flags', 30 | 'ip_flag_df', 'ip_flag_mf', 'ip_flag_rb', 31 | 'src_port', 'dst_port', 32 | 'tcp_flags', 33 | 'tcp_flag_res', 'tcp_flag_ns', 'tcp_flag_cwr', 'tcp_flag_ecn', 'tcp_flag_urg', 'tcp_flag_ack', 'tcp_flag_push', 'tcp_flag_reset', 'tcp_flag_syn', 'tcp_flag_fin', 34 | 'mqtt_messagetype', 'mqtt_messagelength', 35 | 'mqtt_flags', 36 | 'mqtt_flag_uname', 'mqtt_flag_passwd', 'mqtt_flag_retain', 'mqtt_flag_qos', 'mqtt_flag_willflag', 'mqtt_flag_clean', 'mqtt_flag_reserved', 37 | 'is_attack' 38 | ] 39 | 40 | 41 | def extract_attributes(src, dst, attacker_ip, split_flags = False, include_validation_attributes = False): 42 | pcap = pyshark.FileCapture(src_file_name, keep_packets = False) 43 | 44 | first = True 45 | with open(dst_file_name, "a") as csv_file: 46 | for packet in pcap: 47 | entry = {} 48 | if include_validation_attributes: 49 | for key in validation_attributes: 50 | entry[key] = '' 51 | 52 | for key in attributes: 53 | if 'flag_' in key and split_flags == False: 54 | continue 55 | entry[key] = '' 56 | 57 | try: 58 | entry['is_attack'] = 0 59 | if include_validation_attributes: 60 | entry['timestamp'] = packet.sniff_time.strftime('%m/%d/%Y, %H:%M:%S:%f') 61 | 62 | entry['protocol'] = packet.highest_layer 63 | 64 | if 'ip' in packet: 65 | if include_validation_attributes: 66 | entry['src_ip'] = packet.ip.src 67 | entry['dst_ip'] = packet.ip.dst 68 | if packet.ip.src == attacker_ip or packet.ip.dst == attacker_ip: 69 | entry['is_attack'] = 1 70 | 71 | entry['ttl'] = packet.ip.ttl 72 | entry['ip_len'] = packet.ip.len 73 | 74 | if split_flags: 75 | entry['ip_flag_df'] = packet.ip.flags_df 76 | entry['ip_flag_mf'] = packet.ip.flags_mf 77 | entry['ip_flag_rb'] = packet.ip.flags_rb 78 | else: 79 | entry['ip_flags'] = packet.ip.flags 80 | 81 | if 'udp' in packet: 82 | entry['src_port'] = packet.udp.srcport 83 | entry['dst_port'] = packet.udp.dstport 84 | 85 | elif 'tcp' in packet: 86 | entry['src_port'] = packet.tcp.srcport 87 | entry['dst_port'] = packet.tcp.dstport 88 | 89 | if split_flags: 90 | entry['tcp_flag_res'] = packet.tcp.flags_res 91 | entry['tcp_flag_ns'] = packet.tcp.flags_ns 92 | entry['tcp_flag_cwr'] = packet.tcp.flags_cwr 93 | entry['tcp_flag_ecn'] = packet.tcp.flags_ecn 94 | entry['tcp_flag_urg'] = packet.tcp.flags_urg 95 | entry['tcp_flag_ack'] = packet.tcp.flags_ack 96 | entry['tcp_flag_push'] = packet.tcp.flags_push 97 | entry['tcp_flag_reset'] = packet.tcp.flags_reset 98 | entry['tcp_flag_syn'] = packet.tcp.flags_syn 99 | entry['tcp_flag_fin'] = packet.tcp.flags_fin 100 | else: 101 | entry['tcp_flags'] = packet.tcp.flags 102 | else: 103 | continue 104 | 105 | if 'mqtt' in packet: 106 | entry['mqtt_messagetype'] = packet.mqtt.msgtype 107 | entry['mqtt_messagelength'] = packet.mqtt.len 108 | 109 | if 'conflags' in packet.mqtt.field_names: 110 | if split_flags: 111 | entry['mqtt_flag_uname'] = packet.mqtt.conflag_uname 112 | entry['mqtt_flag_passwd'] = packet.mqtt.conflag_passwd 113 | entry['mqtt_flag_retain'] = packet.mqtt.conflag_retain 114 | entry['mqtt_flag_qos'] = packet.mqtt.conflag_qos 115 | entry['mqtt_flag_willflag'] = packet.mqtt.conflag_willflag 116 | entry['mqtt_flag_clean'] = packet.mqtt.conflag_cleansess 117 | entry['mqtt_flag_reserved'] = packet.mqtt.conflag_reserved 118 | else: 119 | entry['mqtt_flags'] = packet.mqtt.conflags 120 | 121 | 122 | writer = csv.DictWriter(csv_file, list(entry.keys()), delimiter=',') 123 | if first: 124 | writer.writeheader() 125 | first = False 126 | 127 | writer.writerow(entry) 128 | 129 | except 
Exception: 130 | traceback.print_exc() 131 | break 132 | 133 | pcap.close() 134 | 135 | 136 | 137 | 138 | if __name__ == "__main__": 139 | parser = argparse.ArgumentParser() 140 | parser.add_argument('--root', default = './') 141 | parser.add_argument('--split_flags', default = True, type = str2bool) 142 | parser.add_argument('--attacker_ip', default = '192.168.2.5') 143 | parser.add_argument('--include_validation_attributes', default = True, type = str2bool) 144 | 145 | args = parser.parse_args() 146 | root = args.root 147 | split_flags = args.split_flags 148 | attacker_ip = args.attacker_ip 149 | include_validation_attributes = args.include_validation_attributes 150 | 151 | for file in os.listdir(root): 152 | if file.endswith('.pcap'): 153 | 154 | src_file_name = os.path.join(root, file) 155 | dst_file_name = src_file_name.replace('.pcap', '.csv') 156 | if os.path.isfile(dst_file_name) == False: 157 | print('Start processing: {}'.format(file)) 158 | extract_attributes(src_file_name, dst_file_name, attacker_ip, split_flags, include_validation_attributes) 159 | print('End processing: {}'.format(file)) 160 | -------------------------------------------------------------------------------- /pcap_parser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import dpkt 4 | from print_packets import * 5 | import time 6 | import sys 7 | import datetime 8 | 9 | start_time = time.time() 10 | 11 | output_uniflows_separately = True 12 | 13 | pkt_num_list = [] 14 | time_list = [] 15 | ip_src_list = [] 16 | ip_dst_list = [] 17 | ip_len_list = [] 18 | proto_list = [] 19 | prt_src_list = [] 20 | prt_dst_list = [] 21 | tcp_psh_flag_list = [] 22 | tcp_rst_flag_list = [] 23 | tcp_urg_flag_list = [] 24 | 25 | def get_mean(l): 26 | if len(l) == 0: 27 | return 0 28 | elif len(l) == 1: 29 | return l[0] 30 | else: 31 | return np.absolute(np.diff(np.sort(l))).mean() 32 | 33 | packet_dict = {'pkt_num': pkt_num_list, 34 | 'time': time_list, 35 | 'ip_src': ip_src_list, 36 | 'ip_dst': ip_dst_list, 37 | 'ip_len': ip_len_list, 38 | 'proto': proto_list, 39 | 'prt_src': prt_src_list, 40 | 'prt_dst': prt_dst_list, 41 | 'tcp_psh': tcp_psh_flag_list, 42 | 'tcp_rst': tcp_rst_flag_list, 43 | 'tcp_urg': tcp_urg_flag_list} 44 | 45 | 46 | 47 | # All traffic is either TCP or UDP 48 | #f = open('nmap_scan_all_10x_network_sU_Scan.pcap', 'rb') 49 | #f = open('normal_operation.pcap', 'rb') 50 | sliding_window = False 51 | 52 | if len(sys.argv) > 1: 53 | print(sys.argv[1]) 54 | f = open(sys.argv[1], 'rb') 55 | output_file = sys.argv[1].replace(".pcap", "_WithWindowing.csv") 56 | else: 57 | f = open('bruteforce.pcap', 'rb') 58 | output_file = 'bruteforce.csv' 59 | 60 | if len(sys.argv) > 2 and sys.argv[2] == "0": 61 | output_file = sys.argv[1].replace(".pcap", ".csv") 62 | sliding_window = False 63 | 64 | pcap = dpkt.pcap.Reader(f) 65 | 66 | 67 | count = 1 68 | l2count = 0 69 | icmpcount = 0 70 | igmpcount = 0 71 | udpcount=0 72 | tcpcount=0 73 | unknown_transport_layer = 0 74 | 75 | for ts, buf in pcap: 76 | 77 | if count == 1: 78 | global_t0 = datetime.datetime.utcfromtimestamp(ts) 79 | 80 | if (count > 0): 81 | 82 | eth = dpkt.ethernet.Ethernet(buf) 83 | if not isinstance(eth.data, dpkt.ip.IP): 84 | #print('Non IP Packet type not supported %s\n' % eth.data.__class__.__name__) 85 | l2count+=1 86 | continue 87 | l3 = eth.data 88 | if isinstance(l3.data, dpkt.icmp.ICMP): 89 | icmpcount+=1 90 | #print("ICMP Packet disarded") 91 | continue 92 | 93 | if 
isinstance(l3.data, dpkt.igmp.IGMP): 94 | igmpcount+=1 95 | continue 96 | 97 | ###### If packet is icmp then continue 98 | 99 | 100 | 101 | #print(l3) 102 | l4 = l3.data 103 | 104 | if not isinstance(l4, dpkt.tcp.TCP) and not isinstance(l4, dpkt.udp.UDP): 105 | unknown_transport_layer += 1 106 | continue 107 | 108 | pkt_num_list.append(count) 109 | time_list.append(ts) 110 | ip_src_list.append(inet_to_str(l3.src)) 111 | ip_dst_list.append(inet_to_str(l3.dst)) 112 | ip_len_list.append(len(eth.data)) 113 | #ip_tos_list.append(l3.tos) 114 | 115 | if isinstance(l4, dpkt.tcp.TCP): 116 | tcpcount+=1 117 | proto_list.append('TCP') 118 | prt_src_list.append(l4.sport) 119 | prt_dst_list.append(l4.dport) 120 | #syn_flag = ( l4.flags & dpkt.tcp.TH_SYN ) != 0 121 | rst_flag = ( l4.flags & dpkt.tcp.TH_RST ) != 0 122 | psh_flag = ( l4.flags & dpkt.tcp.TH_PUSH) != 0 123 | #ack_flag = ( l4.flags & dpkt.tcp.TH_ACK ) != 0 124 | urg_flag = ( l4.flags & dpkt.tcp.TH_URG ) != 0 125 | tcp_psh_flag_list.append(psh_flag) 126 | tcp_rst_flag_list.append(rst_flag) 127 | tcp_urg_flag_list.append(urg_flag) 128 | 129 | 130 | 131 | if isinstance(l4, dpkt.udp.UDP): 132 | udpcount+=1 133 | proto_list.append('UDP') 134 | prt_src_list.append(l4.sport) 135 | prt_dst_list.append(l4.dport) 136 | # Need to add a value to these to maintain consistent rows across lists - will add zeros 137 | tcp_psh_flag_list.append(False) 138 | tcp_rst_flag_list.append(False) 139 | tcp_urg_flag_list.append(False) 140 | count+=1 141 | 142 | print("L2 packets dicarded = ", l2count) 143 | print("ICMP packets dicarded = ", icmpcount) 144 | print("IGMP packets dicarded = ", igmpcount) 145 | print("Unknown Trnsport Layer packets dicarded = ", unknown_transport_layer) 146 | print("UDP packets = ", udpcount) 147 | print("TCP packets = ", tcpcount) 148 | 149 | 150 | 151 | 152 | packet_df = pd.DataFrame(packet_dict) 153 | packet_df.set_index('pkt_num', inplace=True) 154 | 155 | 156 | # ************Create a list of tuples that identify each indepent flow 157 | 158 | tuplist_flowid = {} 159 | flow_count = 0 160 | 161 | flow_list_dict = {} 162 | tcpflowcount = 0 163 | udpflowcount = 0 164 | 165 | for index in range(len(pkt_num_list)): 166 | mytup = (ip_src_list[index], ip_dst_list[index], prt_src_list[index], prt_dst_list[index], proto_list[index]) 167 | 168 | str_temp = "_".join(str(v) for v in mytup) 169 | if str_temp not in tuplist_flowid: 170 | tuplist_flowid[str_temp] = flow_count 171 | flow_list_dict[flow_count] = [] 172 | flow_count += 1 173 | 174 | current_flow_id = tuplist_flowid[str_temp] 175 | flow_tup = ( 176 | ip_src_list[index], ip_dst_list[index], prt_src_list[index], prt_dst_list[index], proto_list[index], 177 | pkt_num_list[index], time_list[index], ip_len_list[index], tcp_psh_flag_list[index], tcp_rst_flag_list[index], 178 | tcp_urg_flag_list[index], current_flow_id) 179 | 180 | flow_list_dict[current_flow_id].append(flow_tup) 181 | 182 | if len(flow_list_dict[current_flow_id]) == 1: 183 | if flow_list_dict[current_flow_id][0][4] == 'TCP': 184 | tcpflowcount+=1 185 | if flow_list_dict[current_flow_id][0][4] == 'UDP': 186 | udpflowcount+=1 187 | 188 | del tuplist_flowid 189 | 190 | print("\nNumber of flows = ", flow_count) 191 | 192 | packet_dict = {'pkt_num': pkt_num_list, 193 | 'time': time_list, 194 | 'ip_src': ip_src_list, 195 | 'ip_dst': ip_dst_list, 196 | 'ip_len': ip_len_list, 197 | 'proto': proto_list, 198 | 'prt_src': prt_src_list, 199 | 'prt_dst': prt_dst_list, 200 | 'tcp_psh': tcp_psh_flag_list, 201 | 'tcp_rst': tcp_rst_flag_list, 
202 | 'tcp_urg': tcp_urg_flag_list} 203 | 204 | 205 | 206 | 207 | 208 | print("\nUnique flows = ", len(flow_list_dict)) 209 | 210 | print("\nflow list list element = ", flow_list_dict[0][0]) 211 | if len(flow_list_dict[0]) > 1: 212 | print("\nflow list list element = ", flow_list_dict[0][1]) 213 | if len(flow_list_dict[0]) > 2: 214 | print("\nflow list list element = ", flow_list_dict[0][2]) 215 | 216 | print("UDP flows = ", udpflowcount) 217 | print("TCP flows = ", tcpflowcount) 218 | 219 | class uniFlow: 220 | def __init__(self, ip_src, ip_dst, prt_src, prt_dst, proto, num_pkts, 221 | mean_iat, std_iat, min_iat, max_iat, mean_offset, mean_pkt_len, 222 | std_pkt_len, min_pkt_len, max_pkt_len, num_bytes, num_psh_flags, 223 | num_rst_flags, num_urg_flags): 224 | self.ip_src = ip_src 225 | self.ip_dst = ip_dst 226 | self.prt_src = prt_src 227 | self.prt_dst = prt_dst 228 | self.proto = proto 229 | self.num_pkts = num_pkts # num pkts in this flow 230 | self.mean_iat = mean_iat # ave interarrival time 231 | self.std_iat = std_iat # std dev of IAT (jitter-ish) 232 | self.min_iat = min_iat 233 | self.max_iat = max_iat 234 | self.mean_offset = mean_offset 235 | self.mean_pkt_len = mean_pkt_len # ave pckt len per flow 236 | self.std_pkt_len = std_pkt_len # std deviation of packet lengths 237 | self.max_pkt_len = max_pkt_len 238 | self.min_pkt_len = min_pkt_len 239 | self.num_bytes = num_bytes 240 | self.num_psh_flags = num_psh_flags 241 | self.num_rst_flags = num_rst_flags 242 | self.num_urg_flags = num_urg_flags 243 | self.processed = False 244 | 245 | meta_list = [] 246 | meta_list_time_0 = [] 247 | f_count = 0 248 | for key in flow_list_dict: 249 | flow_list = flow_list_dict[key] 250 | pkt = flow_list[0] # get first pkt in the flow 251 | 252 | 253 | #0 is ip_src 254 | #1 is ip_dst 255 | #2 is prt_src 256 | #3 is prt_dst 257 | #4 is proto 258 | #5 is pkt_num 259 | #6 is time 260 | #7 is ip_len 261 | #8 is tcp_psh_flag 262 | #9 is tcp_rst_flag 263 | #10 is tcp_urg_flag 264 | #11 is flow_id 265 | 266 | 267 | ip_src = pkt[0] 268 | ip_dst = pkt[1] 269 | prt_src = pkt[2] 270 | prt_dst = pkt[3] 271 | proto = pkt[4] 272 | if proto == 'TCP': 273 | proto = 6 274 | elif proto == 'UDP': 275 | proto = 17 276 | num_pkts = len(flow_list) 277 | # need to calc inter-arrival time and ave pkt length 278 | length_list = [] 279 | time_list = [] 280 | psh_list = [] 281 | rst_list = [] 282 | urg_list = [] 283 | for p in flow_list: 284 | length_list.append(p[7]) 285 | time_list.append(p[6]) 286 | psh_list.append(p[8]) 287 | rst_list.append(p[9]) 288 | urg_list.append(p[10]) 289 | mean_pkt_len = sum(length_list) / num_pkts 290 | pkt_len_arry = np.array(length_list) 291 | std_pkt_len = float(np.std(pkt_len_arry)) 292 | min_pkt_len = float(min(pkt_len_arry)) 293 | max_pkt_len = float(max(pkt_len_arry)) 294 | num_bytes = sum(length_list) 295 | num_psh_flags = sum(psh_list) 296 | num_rst_flags = sum(rst_list) 297 | num_urg_flags = sum(urg_list) 298 | if num_pkts > 1: 299 | time_list.sort(reverse = True) # put times in descending order 300 | t_diff = abs(np.diff(time_list)) 301 | mean_iat = sum(t_diff) / (num_pkts - 1) 302 | std_iat = np.std(t_diff) # std dev of IAT 303 | min_iat = min(t_diff) 304 | max_iat = max(t_diff) 305 | # Kenzi's apparently good feature is the mean time between the first 306 | # packet and each sucessive packet: (t2-t1) + (t3-t1) + (t4-t1) / n 307 | time_list.sort() # sort into ascending order now 308 | t0 = time_list[0] 309 | time_total = 0.0 310 | for f in range(1, num_pkts): 311 | time_total += 
abs(t0 - time_list[f]) 312 | mean_offset = time_total / (num_pkts - 1) 313 | 314 | else: 315 | mean_iat = 0.0 316 | std_iat = 0.0 317 | min_iat = 0.0 318 | max_iat = 0.0 319 | mean_offset = 0.0 320 | uniflow = uniFlow(ip_src, ip_dst, prt_src, prt_dst, proto, num_pkts, mean_iat, 321 | std_iat, min_iat, max_iat, mean_offset, mean_pkt_len, std_pkt_len, 322 | min_pkt_len, max_pkt_len, num_bytes, num_psh_flags, 323 | num_rst_flags, num_urg_flags) 324 | meta_list.append(uniflow) 325 | meta_list_time_0.append((datetime.datetime.utcfromtimestamp(time_list[0]) - global_t0).seconds // 60) 326 | f_count +=1 327 | 328 | 329 | def uniFlow2df(uniflow): 330 | df = pd.DataFrame(columns=['ip_src', 'ip_dst', 'prt_src', 'prt_dst', 'proto', 'num_pkts', 331 | 'mean_iat', 'std_iat', 'min_iat', 'max_iat', 'mean_offset', 'mean_pkt_len', 332 | 'std_pkt_len', 'min_pkt_len', 'max_pkt_len', 'num_bytes', 333 | 'num_psh_flags', 'num_rst_flags', 'num_urg_flags']) 334 | df.loc[0,'ip_src'] = str(uniflow.ip_src) 335 | df.loc[0,'ip_dst'] = str(uniflow.ip_dst) 336 | df.loc[0,'prt_src'] = int(uniflow.prt_src) 337 | df.loc[0,'prt_dst'] = int(uniflow.prt_dst) 338 | df.loc[0,'proto'] = int(uniflow.proto) 339 | df.loc[0,'num_pkts'] = int(uniflow.num_pkts) 340 | df.loc[0,'mean_iat'] = float(uniflow.mean_iat) 341 | df.loc[0,'std_iat'] = float(uniflow.std_iat) 342 | df.loc[0,'min_iat'] = float(uniflow.min_iat) 343 | df.loc[0,'max_iat'] = float(uniflow.max_iat) 344 | df.loc[0,'mean_offset'] = float(uniflow.mean_offset) 345 | df.loc[0,'mean_pkt_len'] = float(uniflow.mean_pkt_len) 346 | df.loc[0,'std_pkt_len'] = float(uniflow.std_pkt_len) 347 | df.loc[0,'min_pkt_len'] = float(uniflow.min_pkt_len) 348 | df.loc[0,'max_pkt_len'] = float(uniflow.max_pkt_len) 349 | df.loc[0,'num_bytes'] = int(uniflow.num_bytes) 350 | df.loc[0,'num_psh_flags'] = int(uniflow.num_psh_flags) 351 | df.loc[0,'num_rst_flags'] = int(uniflow.num_rst_flags) 352 | df.loc[0,'num_urg_flags'] = int(uniflow.num_urg_flags) 353 | return df 354 | 355 | 356 | if output_uniflows_separately: 357 | #feature_df = pd.DataFrame() 358 | feature_df = pd.DataFrame(columns=['ip_src', 'ip_dst', 'prt_src', 'prt_dst', 'proto', 359 | 'num_pkts', 'mean_iat', 'std_iat', 'min_iat', 360 | 'max_iat', 'mean_offset', 'mean_pkt_len', 'num_bytes', 'num_psh_flags', 361 | 'num_rst_flags', 'num_urg_flags']) 362 | 363 | for flow in meta_list: 364 | flow_df = uniFlow2df(flow) 365 | feature_df = feature_df.append(flow_df, ignore_index=True, sort=False) 366 | 367 | 368 | #feature_df.to_csv('robert_stealth.csv', sep=',') 369 | feature_df.to_csv('uniflow_' + output_file, sep=',') 370 | 371 | print('\nAll uniflows processed') 372 | 373 | # No convert uniflows into biflows 374 | #ßfor uniflow in feature_df: 375 | 376 | ################################## 377 | # Combine uniflows into biflows 378 | 379 | df_biflow = pd.DataFrame(columns=['ip_src', 'ip_dst', 'prt_src', 'prt_dst', 'proto', 'fwd_num_pkts', 'bwd_num_pkts', 380 | 'fwd_mean_iat', 'bwd_mean_iat', 'fwd_std_iat', 'bwd_std_iat', 'fwd_min_iat', 'bwd_min_iat', 381 | 'fwd_max_iat', 'bwd_max_iat','fwd_mean_offset', 'bwd_mean_offset', 'fwd_mean_pkt_len', 'bwd_mean_pkt_len', 382 | 'fwd_std_pkt_len', 'bwd_std_pkt_len', 'fwd_min_pkt_len', 'bwd_min_pkt_len', 383 | 'fwd_max_pkt_len', 'bwd_max_pkt_len', 'fwd_num_bytes', 'bwd_num_bytes', 384 | 'fwd_num_psh_flags', 'bwd_num_psh_flags', 385 | 'fwd_num_rst_flags', 'bwd_num_rst_flags', 'fwd_num_urg_flags', 'bwd_num_urg_flags']) 386 | 387 | #feature_df['processed'] = False 388 | 389 | #feature_row = 
feature_df.iloc[0,:].copy() 390 | # process the TCP flows 391 | print('\nProcessing TCP flows') 392 | sibilings_counts = {} 393 | delta_avg = {} 394 | bi_flow_time = [] 395 | 396 | num_uniflows = len(meta_list) 397 | for row_num in range(num_uniflows): 398 | current = meta_list[row_num] 399 | current_time = meta_list_time_0[row_num] 400 | if (current.processed == False): 401 | ip_src=current.ip_src 402 | ip_dst=current.ip_dst 403 | prt_src=current.prt_src 404 | prt_dst = current.prt_dst 405 | proto = current.proto 406 | # Get reverse tuple values 407 | rev_ip_src = ip_dst 408 | rev_ip_dst = ip_src 409 | rev_prt_src = prt_dst 410 | rev_prt_dst = prt_src 411 | for inner_row in range(row_num, num_uniflows): 412 | if (current.processed == True): 413 | continue; 414 | 415 | inner = meta_list[inner_row] 416 | inner_ip_src=inner.ip_src 417 | inner_ip_dst=inner.ip_dst 418 | inner_prt_src=inner.prt_src 419 | inner_prt_dst = inner.prt_dst 420 | inner_proto = inner.proto 421 | 422 | if (rev_ip_src == inner_ip_src) and (rev_ip_dst == inner_ip_dst) and (rev_prt_src == inner_prt_src) and (rev_prt_dst == inner_prt_dst) and (proto == inner_proto): 423 | # matching flow found! 424 | meta_list[row_num].processed = True 425 | meta_list[inner_row].processed = True 426 | 427 | biflowlist = [str(current_time)+'_'+current.ip_src, current.ip_src, current.ip_dst, current.prt_src, current.prt_dst, current.proto, 428 | current.num_pkts, inner.num_pkts, current.mean_iat, inner.mean_iat, current.std_iat, 429 | inner.std_iat, current.min_iat, inner.min_iat, current.max_iat, inner.max_iat,current.mean_offset, inner.mean_offset, 430 | current.mean_pkt_len, inner.mean_pkt_len, current.std_pkt_len, inner.std_pkt_len, 431 | current.min_pkt_len, inner.min_pkt_len, current.max_pkt_len, inner.max_pkt_len, 432 | current.num_bytes, inner.num_bytes, current.num_psh_flags, inner.num_psh_flags, 433 | current.num_rst_flags, inner.num_rst_flags, current.num_urg_flags, inner.num_urg_flags] 434 | columns_list=['sec_ip_src', 'ip_src', 'ip_dst', 'prt_src', 'prt_dst', 435 | 'proto', 'fwd_num_pkts', 'bwd_num_pkts', 436 | 'fwd_mean_iat', 'bwd_mean_iat', 'fwd_std_iat', 437 | 'bwd_std_iat', 'fwd_min_iat', 'bwd_min_iat', 438 | 'fwd_max_iat', 'bwd_max_iat', 'fwd_mean_offset', 'bwd_mean_offset', 'fwd_mean_pkt_len', 439 | 'bwd_mean_pkt_len', 'fwd_std_pkt_len', 'bwd_std_pkt_len', 440 | 'fwd_min_pkt_len', 'bwd_min_pkt_len', 441 | 'fwd_max_pkt_len', 'bwd_max_pkt_len', 'fwd_num_bytes', 442 | 'bwd_num_bytes', 'fwd_num_psh_flags', 'bwd_num_psh_flags', 443 | 'fwd_num_rst_flags', 'bwd_num_rst_flags', 'fwd_num_urg_flags', 444 | 'bwd_num_urg_flags'] 445 | 446 | df_biflow = df_biflow.append(pd.DataFrame([biflowlist], columns = columns_list), ignore_index=True, sort=False) 447 | else: 448 | continue 449 | else: 450 | continue 451 | 452 | 453 | 454 | print('\nProcessing UDP flows') 455 | # Process the UDP flows 456 | for row_num in range(num_uniflows): 457 | current = meta_list[row_num] 458 | current_time = meta_list_time_0[row_num] 459 | if (current.processed == False): 460 | ip_src=current.ip_src 461 | ip_dst=current.ip_dst 462 | prt_src=current.prt_src 463 | prt_dst = current.prt_dst 464 | proto = current.proto 465 | # Get reverse tuple values 466 | rev_ip_src = ip_dst 467 | rev_ip_dst = ip_src 468 | rev_prt_src = prt_dst 469 | rev_prt_dst = prt_src 470 | if proto == 17: 471 | meta_list[row_num].processed = True 472 | # UDP flows have no reverse direction so i have filled the redundant fields with 473 | # dupicate forward direction data 474 | biflowlist = 
[str(current_time)+'_'+current.ip_src,current.ip_src, current.ip_dst, current.prt_src, current.prt_dst, current.proto, 475 | current.num_pkts, current.num_pkts, current.mean_iat, current.mean_iat, current.std_iat, 476 | current.std_iat, current.min_iat, current.min_iat, current.max_iat, current.max_iat, current.mean_offset, current.mean_offset, 477 | current.mean_pkt_len, current.mean_pkt_len, current.std_pkt_len, current.std_pkt_len, 478 | current.min_pkt_len, current.min_pkt_len, current.max_pkt_len, current.max_pkt_len, 479 | current.num_bytes, current.num_bytes, current.num_psh_flags, current.num_psh_flags, 480 | current.num_rst_flags, current.num_rst_flags, current.num_urg_flags, current.num_urg_flags] 481 | columns_list=['sec_ip_src','ip_src', 'ip_dst', 'prt_src', 'prt_dst', 482 | 'proto', 'fwd_num_pkts', 'bwd_num_pkts', 483 | 'fwd_mean_iat', 'bwd_mean_iat', 'fwd_std_iat', 484 | 'bwd_std_iat', 'fwd_min_iat', 'bwd_min_iat', 485 | 'fwd_max_iat', 'bwd_max_iat','fwd_mean_offset', 'bwd_mean_offset','fwd_mean_pkt_len', 486 | 'bwd_mean_pkt_len', 'fwd_std_pkt_len', 'bwd_std_pkt_len', 487 | 'fwd_min_pkt_len', 'bwd_min_pkt_len', 488 | 'fwd_max_pkt_len', 'bwd_max_pkt_len', 'fwd_num_bytes', 489 | 'bwd_num_bytes', 'fwd_num_psh_flags', 'bwd_num_psh_flags', 490 | 'fwd_num_rst_flags', 'bwd_num_rst_flags', 'fwd_num_urg_flags', 491 | 'bwd_num_urg_flags'] 492 | 493 | df_biflow = df_biflow.append(pd.DataFrame([biflowlist], columns = columns_list), ignore_index=True, sort=False) 494 | else: 495 | continue 496 | 497 | 498 | 499 | del pkt_num_list 500 | del proto_list 501 | del prt_dst_list 502 | del prt_src_list 503 | del tcp_psh_flag_list 504 | del tcp_rst_flag_list 505 | del time_list 506 | del ip_dst_list 507 | del ip_src_list 508 | del ip_len_list 509 | del tcp_urg_flag_list 510 | del packet_df 511 | del packet_dict 512 | del meta_list 513 | del flow_list 514 | del flow_list_dict 515 | if 'feature_df' in globals(): 516 | del feature_df 517 | 518 | 519 | # Now add flow-bundle data 520 | # Add the numbe of flowws from each IP address and measure of the 521 | # variability of destination port numbers that packets are sent to 522 | # we will sort the port numbers in order then take the mean difference 523 | # a value of 1 should indicate an incremental port scanner 524 | 525 | print('Number of bi flows = {}'.format(np.size(df_biflow, axis = 0))) 526 | 527 | df_biflow['num_src_flows'] = 0 528 | df_biflow['src_ip_dst_prt_delta'] = 0 529 | 530 | biflow_column = 'sec_ip_src' 531 | 532 | if sliding_window == False: 533 | biflow_column = 'ip_src' 534 | 535 | addr_dict = dict(df_biflow[biflow_column].value_counts()) 536 | print(addr_dict) 537 | print('-------------') 538 | print( dict(df_biflow['ip_src'].value_counts())) 539 | print('\nComputing number of flows per source') 540 | for key, value in addr_dict.items(): 541 | df_biflow.loc[df_biflow[biflow_column] == key, 'num_src_flows'] = value 542 | print('\nComputing number of port destinations per source') 543 | for key, value in addr_dict.items(): 544 | rows = df_biflow[df_biflow[biflow_column] == key]['prt_dst'] 545 | l = list(rows) 546 | l.sort() 547 | ave_diff = 0 548 | if len(l) == 1: 549 | ave_diff = l[0] 550 | elif len(l) > 0: 551 | ave_diff = np.absolute(np.diff(l)).mean() 552 | df_biflow.loc[df_biflow[biflow_column] == key, 'src_ip_dst_prt_delta']= ave_diff 553 | 554 | 555 | 556 | df_biflow.to_csv('biflow_' + output_file, sep=',') 557 | 558 | 559 | 560 | # normal.pcap has 3305 packets and 1719 unique flows 561 | 562 | print('Parsing the file took {} 
seconds'.format(time.time() - start_time)) -------------------------------------------------------------------------------- /print_packets.py: -------------------------------------------------------------------------------- 1 | import dpkt 2 | from dpkt.compat import compat_ord 3 | import socket 4 | import datetime 5 | 6 | 7 | 8 | def mac_addr(address): 9 | """Convert a MAC address to a readable/printable string 10 | Args: 11 | address (str): a MAC address in hex form (e.g. '\x01\x02\x03\x04\x05\x06') 12 | Returns: 13 | str: Printable/readable MAC address 14 | """ 15 | return ':'.join('%02x' % compat_ord(b) for b in address) 16 | 17 | 18 | def inet_to_str(inet): 19 | """Convert inet object to a string 20 | Args: 21 | inet (inet struct): inet network address 22 | Returns: 23 | str: Printable/readable IP address 24 | """ 25 | # First try ipv4 and then ipv6 26 | try: 27 | return socket.inet_ntop(socket.AF_INET, inet) 28 | except ValueError: 29 | return socket.inet_ntop(socket.AF_INET6, inet) 30 | 31 | def print_packets(pcap): 32 | """Print out information about each packet in a pcap 33 | Args: 34 | pcap: dpkt pcap reader object (dpkt.pcap.Reader) 35 | """ 36 | # For each packet in the pcap process the contents 37 | for timestamp, buf in pcap: 38 | 39 | # Print out the timestamp in UTC 40 | print('Timestamp: ', str(datetime.datetime.utcfromtimestamp(timestamp))) 41 | 42 | # Unpack the Ethernet frame (mac src/dst, ethertype) 43 | eth = dpkt.ethernet.Ethernet(buf) 44 | print('Ethernet Frame: ', mac_addr(eth.src), mac_addr(eth.dst), eth.type) 45 | 46 | # Make sure the Ethernet data contains an IP packet 47 | if not isinstance(eth.data, dpkt.ip.IP): 48 | print('Non IP Packet type not supported %s\n' % eth.data.__class__.__name__) 49 | continue 50 | 51 | # Now unpack the data within the Ethernet frame (the IP packet) 52 | # Pulling out src, dst, length, fragment info, TTL, and Protocol 53 | ip = eth.data 54 | 55 | # Pull out fragment information (flags and offset all packed into off field, so use bitmasks) 56 | do_not_fragment = bool(ip.off & dpkt.ip.IP_DF) 57 | more_fragments = bool(ip.off & dpkt.ip.IP_MF) 58 | fragment_offset = ip.off & dpkt.ip.IP_OFFMASK 59 | 60 | # Print out the info 61 | print('IP: %s -> %s (len=%d ttl=%d DF=%d MF=%d offset=%d)\n' % \ 62 | (inet_to_str(ip.src), inet_to_str(ip.dst), ip.len, ip.ttl, do_not_fragment, more_fragments, fragment_offset)) 63 | 64 | 65 | def test(): 66 | """Open up a test pcap file and print out the packets""" 67 | with open('data/http.pcap', 'rb') as f: 68 | pcap = dpkt.pcap.Reader(f) 69 | print_packets(pcap) 70 | 71 | 72 | --------------------------------------------------------------------------------
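As a quick sanity check of the pipeline above, the following is a minimal, self-contained sketch (not a file in this repository) of the kind of train/evaluate step `classification.py` performs on the flow CSVs. It assumes two of the labelled bidirectional-flow files that `classification.py` expects (`biflow_normal.csv` and `biflow_sparta.csv`) are present in the working directory; the exact column handling here is illustrative only.

```python
# Minimal sketch: fit one of the classifiers listed in the README on a pair of
# labelled biflow CSVs and print per-class precision/recall/F1.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

normal = pd.read_csv('biflow_normal.csv')   # file names as loaded by classification.py
attack = pd.read_csv('biflow_sparta.csv')
df = pd.concat([normal, attack], ignore_index=True)

y = df['is_attack']                          # used only as the label here
# Drop the same identifier columns classification.py removes, plus the label itself
X = df.drop(columns=['is_attack', 'proto', 'ip_src', 'ip_dst']).fillna(-1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
clf = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))
```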