├── .gitignore ├── README.md ├── classification.py ├── pcap_packet_features.py ├── pcap_parser.py └── print_packets.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | /*.csv 106 | 107 | *.png 108 | 109 | *.sh 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Based IoT Intrusion Detection System: An MQTT Case Study 2 | 3 | This work uses six different machine learning techniques to classify attacks in an MQTT network.
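Before classification, features are extracted from the raw pcap captures by two scripts in this repository: `pcap_packet_features.py` builds the packet-level features with pyshark, and `pcap_parser.py` builds the unidirectional (`uniflow_*.csv`) and bidirectional (`biflow_*.csv`) flow features with dpkt. Roughly, an end-to-end run looks like the following (the pcap file name is a placeholder; the processed CSVs used by `classification.py` can also be taken directly from the dataset below):

```
python pcap_packet_features.py --root ./ --attacker_ip 192.168.2.5
python pcap_parser.py capture.pcap 0
python classification.py --mode 2 --output Classification_Bi --verbose True
```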
4 | 5 | ## Dataset Used 6 | The used dataset is published in [IEEE DataPort](https://ieee-dataport.org/open-access/mqtt-internet-things-intrusion-detection-dataset) 7 | 8 | ```` 9 | @data{bhxy-ep04-20, 10 | doi = {10.21227/bhxy-ep04}, 11 | url = {http://dx.doi.org/10.21227/bhxy-ep04}, 12 | author = {Hanan Hindy; Christos Tachtatzis; Robert Atkinson; Ethan Bayne; Xavier Bellekens }, 13 | publisher = {IEEE Dataport}, 14 | title = {MQTT Internet of Things Intrusion Detection Dataset}, 15 | year = {2020} } 16 | ```` 17 | 18 | ## Citation 19 | ``` 20 | @article{hindy2020machine, 21 | title={Machine Learning Based IoT Intrusion Detection System: An MQTT Case Study}, 22 | author={Hindy, Hanan and Bayne, Ethan and Bures, Miroslav and Atkinson, Robert and Tachtatzis, Christos and Bellekens, Xavier}, 23 | journal={arXiv preprint arXiv:2006.15340}, 24 | year={2020} 25 | } 26 | ```` 27 | 28 | # Algorithms Used 29 | - Logistic Regression 30 | - k-Nearest Neighbours 31 | - Gaussian Naive Bayes 32 | - Decision Trees 33 | - Random Forests 34 | - Support Vector Machine (linear and RBF kernel) 35 | 36 | 37 | ## How to Run it: 38 | 39 | ``` 40 | Clone this repository 41 | Download dataset files and extract them in the same directory 42 | run classification.py --mode [0: packet, 1: unidirectional, 2: bidirectional] --output [output_folder] --verbose [True/False] 43 | ``` 44 | - The classification outputs are added to the output folder. 45 | -------------------------------------------------------------------------------- /classification.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Aug 29 12:14:12 2019 5 | 6 | @author: hananhindy 7 | """ 8 | import pandas as pd 9 | import numpy as np 10 | import os 11 | import argparse 12 | 13 | from sklearn.preprocessing import OneHotEncoder 14 | from sklearn.linear_model import LogisticRegression 15 | from sklearn.neighbors import KNeighborsClassifier 16 | from sklearn.svm import SVC, LinearSVC 17 | from sklearn.naive_bayes import GaussianNB 18 | from sklearn.tree import DecisionTreeClassifier 19 | from sklearn.ensemble import RandomForestClassifier 20 | from sklearn.model_selection import train_test_split, StratifiedKFold 21 | from sklearn.metrics import classification_report 22 | 23 | # Helper Function 24 | def str2bool(v): 25 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 26 | return True 27 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 28 | return False 29 | else: 30 | raise argparse.ArgumentTypeError('Boolean value expected.') 31 | 32 | #protocols = ['ARP', 'CDP', 'CLDAP', 'DATA', 'DNS', 'DTLS', 'DTP', 'ECHO', 'ICMP', 'ISAKMP','MDNS', 'NAT-PMP', 'NBNS', 'NFS', 'NTP', 'PORTMAP', 'RADIUS', 'RIP', 'SRVLOC', 'SNMP', 'SSH', 'STP', 'TCP', 'UDP', 'XDMCP', 'MQTT', 'MPEG_PMT', 'MP2T', 'MPEG_PAT', 'DVB_SDT'] 33 | #label_encoder = LabelEncoder().fit(protocols) 34 | 35 | one_hot_encoder = None 36 | 37 | def load_file(path, mode, is_attack = 1, label = 1, folder_name='Bi/', sliceno = 0, verbose = True): 38 | #global label_encoder 39 | global one_hot_encoder 40 | 41 | #attacker_ips = ['192.168.2.5'] 42 | 43 | columns_to_drop_packet = ['timestamp', 'src_ip', 'dst_ip'] 44 | columns_to_drop_uni = ['proto', 'ip_src', 'ip_dst'] 45 | columns_to_drop_bi = ['proto', 'ip_src', 'ip_dst'] 46 | 47 | if os.path.getsize(path)//10 ** 9 > 0: 48 | x = np.zeros((0,0)) 49 | for chunk in pd.read_csv(path, chunksize=10 ** 6): 50 | chunk.drop(columns = columns_to_drop_packet, 
inplace = True) 51 | chunk = chunk[chunk.columns.drop(list(chunk.filter(regex='mqtt')))] 52 | 53 | chunk = chunk.fillna(-1) 54 | 55 | with open(folder_name + 'instances_count.csv','a') as f: 56 | f.write('{}, {} \n'.format(path, chunk.shape[0])) 57 | 58 | x_temp = chunk.loc[chunk['is_attack'] == is_attack] 59 | x_temp.drop('is_attack', axis = 1, inplace = True) 60 | #x_temp['protocol'] = label_encoder.transform(x_temp['protocol']) 61 | if one_hot_encoder is None: 62 | one_hot_encoder = OneHotEncoder(categorical_features=[0], n_values=30) 63 | x_temp = one_hot_encoder.fit_transform(x_temp).toarray() 64 | else: 65 | x_temp = one_hot_encoder.transform(x_temp).toarray() 66 | 67 | x_temp = np.unique(x_temp, axis = 0) 68 | 69 | if x.size == 0: 70 | x = x_temp 71 | else: 72 | x = np.concatenate((x, x_temp), axis = 0) 73 | x = np.unique(x, axis = 0) 74 | else: 75 | dataset = pd.read_csv(path) 76 | 77 | if mode == 1 or mode == 2: 78 | dataset = dataset.loc[dataset['is_attack'] == is_attack] 79 | # if is_attack == 0: 80 | # dataset = dataset.loc[operator.and_(dataset['ip_src'].isin(attacker_ips) == False, dataset['ip_dst'].isin(attacker_ips) == False)] 81 | # else: 82 | # dataset = dataset.loc[operator.or_(dataset['ip_src'].isin(attacker_ips), dataset['ip_dst'].isin(attacker_ips))] 83 | # 84 | if mode == 0: 85 | dataset.drop(columns = columns_to_drop_packet, inplace = True) 86 | dataset = dataset[dataset.columns.drop(list(dataset.filter(regex='mqtt')))] 87 | elif mode == 1: 88 | dataset.drop(columns = columns_to_drop_uni, inplace = True) 89 | elif mode == 2: 90 | dataset.drop(columns = columns_to_drop_bi, inplace = True) 91 | 92 | if verbose: 93 | print(dataset.columns) 94 | 95 | dataset = dataset.fillna(-1) 96 | 97 | if mode == 0: 98 | x = dataset.loc[dataset['is_attack'] == is_attack] 99 | x.drop('is_attack', axis=1, inplace=True) 100 | #x['protocol'] = label_encoder.transform(x['protocol']) 101 | if one_hot_encoder is None: 102 | one_hot_encoder = OneHotEncoder(categorical_features=[0], n_values=30) 103 | x = one_hot_encoder.fit_transform(x).toarray() 104 | else: 105 | x = one_hot_encoder.transform(x).toarray() 106 | else: 107 | x = dataset.values 108 | 109 | with open(folder_name + 'instances_count.csv','a') as f: 110 | f.write('all, {}, {} \n'.format(path, x.shape[0])) 111 | 112 | x = np.unique(x, axis = 0) 113 | 114 | with open(folder_name + 'instances_count.csv','a') as f: 115 | f.write('unique, {}, {} \n'.format(path, x.shape[0])) 116 | 117 | if (mode == 1 and x.shape[0] > 100000) or (mode == 2 and x.shape[0] > 50000): 118 | temp = x.shape[0] // 10 119 | start = sliceno * temp 120 | end = start + temp - 1 121 | x = x[start:end,:] 122 | with open(folder_name + 'instances_count.csv','a') as f: 123 | f.write('Start, {}, End, {} \n'.format(start, end)) 124 | elif mode == 0: 125 | if x.shape[0] > 15000000: 126 | temp = x.shape[0] // 400 127 | start = sliceno * temp 128 | end = start + temp - 1 129 | x = x[start:end,:] 130 | with open(folder_name + 'instances_count.csv','a') as f: 131 | f.write('Start, {}, End, {} \n'.format(start, end)) 132 | elif x.shape[0] > 10000000: 133 | temp = x.shape[0] // 200 134 | start = sliceno * temp 135 | end = start + temp - 1 136 | x = x[start:end,:] 137 | with open(folder_name + 'instances_count.csv','a') as f: 138 | f.write('Start, {}, End, {} \n'.format(start, end)) 139 | elif x.shape[0] > 100000: 140 | temp = x.shape[0] // 10 141 | start = sliceno * temp 142 | end = start + temp - 1 143 | x = x[start:end,:] 144 | with open(folder_name + 
'instances_count.csv','a') as f: 145 | f.write('Start, {}, End, {} \n'.format(start, end)) 146 | 147 | 148 | y = np.full(x.shape[0], label) 149 | 150 | with open(folder_name + 'instances_count.csv','a') as f: 151 | f.write('slice, {}, {} \n'.format(path, x.shape[0])) 152 | 153 | return x, y 154 | 155 | def classify_sub(classifier, x_train, y_train, x_test, y_test, cm_file_name, summary_file_name, classifier_name, verbose = True): 156 | classifier.fit(x_train, y_train) 157 | pred = classifier.predict(x_test) 158 | 159 | cm = pd.crosstab(y_test, pred) 160 | cm.to_csv(cm_file_name) 161 | 162 | pd.DataFrame(classification_report(y_test, pred, output_dict = True)).transpose().to_csv(summary_file_name) 163 | 164 | if verbose: 165 | print(classifier_name + ' Done.\n') 166 | 167 | del classifier 168 | del pred 169 | del cm 170 | 171 | def classify(random_state, x_train, y_train, x_test, y_test, folder_name, prefix = "", verbose = True): 172 | confusion_matrix_folder = os.path.join(folder_name, 'Confusion_Matrix/') 173 | summary_folder = os.path.join(folder_name, 'Summary/') 174 | 175 | if os.path.isdir(confusion_matrix_folder) == False: 176 | os.mkdir(confusion_matrix_folder) 177 | if os.path.isdir(summary_folder) == False: 178 | os.mkdir(summary_folder) 179 | 180 | # 1- Linear 181 | linear_classifier = LogisticRegression(random_state = random_state) 182 | classify_sub(linear_classifier, 183 | x_train, y_train, 184 | x_test, y_test, 185 | confusion_matrix_folder + prefix + '_cm_linear.csv', 186 | summary_folder + prefix + '_summary_linear.csv', 187 | 'Linear', 188 | verbose) 189 | 190 | # 2- KNN 191 | knn_classifier = KNeighborsClassifier() 192 | classify_sub(knn_classifier, 193 | x_train, y_train, 194 | x_test, y_test, 195 | confusion_matrix_folder + prefix + '_cm_knn.csv', 196 | summary_folder + prefix + '_summary_knn.csv', 197 | 'KNN', 198 | verbose) 199 | 200 | #3- RBF SVM 201 | kernel_svm_classifier = SVC(kernel = 'rbf', random_state = random_state, gamma='scale') 202 | classify_sub(kernel_svm_classifier, 203 | x_train, y_train, 204 | x_test, y_test, 205 | confusion_matrix_folder + prefix + '_cm_kernel_svm.csv', 206 | summary_folder + prefix + '_summary_kernel_svm.csv', 207 | 'SVM', 208 | verbose) 209 | 210 | #4- Naive Bayes 211 | naive_classifier = GaussianNB() 212 | classify_sub(naive_classifier, 213 | x_train, y_train, 214 | x_test, y_test, 215 | confusion_matrix_folder + prefix + '_cm_naive.csv', 216 | summary_folder + prefix + '_summary_naive.csv', 217 | 'Naive', 218 | verbose) 219 | 220 | #5- Decision Tree 221 | decision_tree_classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = random_state) 222 | classify_sub(decision_tree_classifier, 223 | x_train, y_train, 224 | x_test, y_test, 225 | confusion_matrix_folder + prefix + '_cm_decision_tree.csv', 226 | summary_folder + prefix + '_summary_decision_tree.csv', 227 | 'Decision Tree', 228 | verbose) 229 | 230 | #6- Random Forest 231 | random_forest_classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = random_state) 232 | classify_sub(random_forest_classifier, 233 | x_train, y_train, 234 | x_test, y_test, 235 | confusion_matrix_folder + prefix + '_cm_random_forest.csv', 236 | summary_folder + prefix + '_summary_random_forest.csv', 237 | 'Random Forest', 238 | verbose) 239 | 240 | # 7- Linear SVM 241 | svm_classifier = LinearSVC(random_state = random_state) 242 | classify_sub(svm_classifier, 243 | x_train, y_train, 244 | x_test, y_test, 245 | confusion_matrix_folder + prefix + 
'_cm_svm.csv', 246 | summary_folder + prefix + '_summary_svm.csv', 247 | 'SVM', 248 | verbose) 249 | 250 | if __name__ == "__main__": 251 | parser = argparse.ArgumentParser() 252 | parser.add_argument('--mode', type = int, default = 2) 253 | parser.add_argument('--output', default='Classification_Bi') 254 | parser.add_argument('--verbose', type = str2bool, default = True) 255 | 256 | args = parser.parse_args() 257 | 258 | for slice_number in range(10): 259 | prefix = '' 260 | if args.mode == 1: 261 | prefix = 'uniflow_' 262 | elif args.mode == 2: 263 | prefix = 'biflow_' 264 | 265 | if args.verbose: 266 | print('Starting Slice #: {}'.format(slice_number)) 267 | print('Start Classification') 268 | 269 | random_state = 0 270 | folder_name = '{}_{}/'.format(args.output, slice_number) 271 | 272 | if os.path.isdir(folder_name) == False: 273 | os.mkdir(folder_name) 274 | 275 | x, y = load_file(prefix + 'normal.csv', 276 | args.mode, 277 | 0, 0, 278 | folder_name, 279 | slice_number, 280 | args.verbose) 281 | 282 | x_temp, y_temp = load_file(prefix + 'scan_A.csv', 283 | args.mode, 284 | 1, 1, 285 | folder_name, 286 | slice_number, 287 | args.verbose) 288 | 289 | x = np.concatenate((x, x_temp), axis = 0) 290 | y = np.append(y, y_temp) 291 | del x_temp, y_temp 292 | 293 | x_temp, y_temp = load_file(prefix + 'scan_sU.csv', 294 | args.mode, 295 | 1, 2, 296 | folder_name, 297 | slice_number, 298 | args.verbose) 299 | 300 | x = np.concatenate((x, x_temp), axis = 0) 301 | y = np.append(y, y_temp) 302 | del x_temp, y_temp 303 | 304 | x_temp, y_temp = load_file(prefix + 'sparta.csv', 305 | args.mode, 306 | 1, 3, 307 | folder_name, 308 | slice_number, 309 | args.verbose) 310 | 311 | x = np.concatenate((x, x_temp), axis = 0) 312 | y = np.append(y, y_temp) 313 | del x_temp, y_temp 314 | 315 | x_temp, y_temp = load_file(prefix + 'mqtt_bruteforce.csv', 316 | args.mode, 317 | 1, 4, 318 | folder_name, 319 | slice_number, 320 | args.verbose) 321 | 322 | x = np.concatenate((x, x_temp), axis = 0) 323 | y = np.append(y, y_temp) 324 | del x_temp, y_temp 325 | 326 | x_train, x_test, y_train, y_test = train_test_split(x, y, 327 | test_size = 0.25, 328 | random_state = 42) 329 | 330 | classify(random_state, x_train, y_train, x_test, y_test, 331 | folder_name, "slice_{}_no_cross_validation".format(slice_number), args.verbose) 332 | 333 | kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0) 334 | 335 | counter = 0 336 | for train, test in kfold.split(x, y): 337 | classify(random_state, x[train], y[train], x[test], y[test], 338 | folder_name, "slice_{}_k_{}".format(slice_number, counter), args.verbose) 339 | counter += 1 340 | 341 | del x 342 | del y 343 | del x_train 344 | del x_test 345 | del y_train 346 | del y_test 347 | -------------------------------------------------------------------------------- /pcap_packet_features.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Aug 26 14:22:32 2019 5 | 6 | @author: hananhindy 7 | """ 8 | 9 | import pyshark 10 | import csv 11 | import argparse 12 | import traceback 13 | import os 14 | 15 | def str2bool(v): 16 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 17 | return True 18 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 19 | return False 20 | else: 21 | raise argparse.ArgumentTypeError('Boolean value expected.') 22 | 23 | validation_attributes = ['timestamp', 24 | 'src_ip', 'dst_ip' 25 | ] 26 | 27 | attributes = ['protocol', 28 | 
'ttl', 'ip_len', 29 | 'ip_flags', 30 | 'ip_flag_df', 'ip_flag_mf', 'ip_flag_rb', 31 | 'src_port', 'dst_port', 32 | 'tcp_flags', 33 | 'tcp_flag_res', 'tcp_flag_ns', 'tcp_flag_cwr', 'tcp_flag_ecn', 'tcp_flag_urg', 'tcp_flag_ack', 'tcp_flag_push', 'tcp_flag_reset', 'tcp_flag_syn', 'tcp_flag_fin', 34 | 'mqtt_messagetype', 'mqtt_messagelength', 35 | 'mqtt_flags', 36 | 'mqtt_flag_uname', 'mqtt_flag_passwd', 'mqtt_flag_retain', 'mqtt_flag_qos', 'mqtt_flag_willflag', 'mqtt_flag_clean', 'mqtt_flag_reserved', 37 | 'is_attack' 38 | ] 39 | 40 | 41 | def extract_attributes(src, dst, attacker_ip, split_flags = False, include_validation_attributes = False): 42 | pcap = pyshark.FileCapture(src_file_name, keep_packets = False) 43 | 44 | first = True 45 | with open(dst_file_name, "a") as csv_file: 46 | for packet in pcap: 47 | entry = {} 48 | if include_validation_attributes: 49 | for key in validation_attributes: 50 | entry[key] = '' 51 | 52 | for key in attributes: 53 | if 'flag_' in key and split_flags == False: 54 | continue 55 | entry[key] = '' 56 | 57 | try: 58 | entry['is_attack'] = 0 59 | if include_validation_attributes: 60 | entry['timestamp'] = packet.sniff_time.strftime('%m/%d/%Y, %H:%M:%S:%f') 61 | 62 | entry['protocol'] = packet.highest_layer 63 | 64 | if 'ip' in packet: 65 | if include_validation_attributes: 66 | entry['src_ip'] = packet.ip.src 67 | entry['dst_ip'] = packet.ip.dst 68 | if packet.ip.src == attacker_ip or packet.ip.dst == attacker_ip: 69 | entry['is_attack'] = 1 70 | 71 | entry['ttl'] = packet.ip.ttl 72 | entry['ip_len'] = packet.ip.len 73 | 74 | if split_flags: 75 | entry['ip_flag_df'] = packet.ip.flags_df 76 | entry['ip_flag_mf'] = packet.ip.flags_mf 77 | entry['ip_flag_rb'] = packet.ip.flags_rb 78 | else: 79 | entry['ip_flags'] = packet.ip.flags 80 | 81 | if 'udp' in packet: 82 | entry['src_port'] = packet.udp.srcport 83 | entry['dst_port'] = packet.udp.dstport 84 | 85 | elif 'tcp' in packet: 86 | entry['src_port'] = packet.tcp.srcport 87 | entry['dst_port'] = packet.tcp.dstport 88 | 89 | if split_flags: 90 | entry['tcp_flag_res'] = packet.tcp.flags_res 91 | entry['tcp_flag_ns'] = packet.tcp.flags_ns 92 | entry['tcp_flag_cwr'] = packet.tcp.flags_cwr 93 | entry['tcp_flag_ecn'] = packet.tcp.flags_ecn 94 | entry['tcp_flag_urg'] = packet.tcp.flags_urg 95 | entry['tcp_flag_ack'] = packet.tcp.flags_ack 96 | entry['tcp_flag_push'] = packet.tcp.flags_push 97 | entry['tcp_flag_reset'] = packet.tcp.flags_reset 98 | entry['tcp_flag_syn'] = packet.tcp.flags_syn 99 | entry['tcp_flag_fin'] = packet.tcp.flags_fin 100 | else: 101 | entry['tcp_flags'] = packet.tcp.flags 102 | else: 103 | continue 104 | 105 | if 'mqtt' in packet: 106 | entry['mqtt_messagetype'] = packet.mqtt.msgtype 107 | entry['mqtt_messagelength'] = packet.mqtt.len 108 | 109 | if 'conflags' in packet.mqtt.field_names: 110 | if split_flags: 111 | entry['mqtt_flag_uname'] = packet.mqtt.conflag_uname 112 | entry['mqtt_flag_passwd'] = packet.mqtt.conflag_passwd 113 | entry['mqtt_flag_retain'] = packet.mqtt.conflag_retain 114 | entry['mqtt_flag_qos'] = packet.mqtt.conflag_qos 115 | entry['mqtt_flag_willflag'] = packet.mqtt.conflag_willflag 116 | entry['mqtt_flag_clean'] = packet.mqtt.conflag_cleansess 117 | entry['mqtt_flag_reserved'] = packet.mqtt.conflag_reserved 118 | else: 119 | entry['mqtt_flags'] = packet.mqtt.conflags 120 | 121 | 122 | writer = csv.DictWriter(csv_file, list(entry.keys()), delimiter=',') 123 | if first: 124 | writer.writeheader() 125 | first = False 126 | 127 | writer.writerow(entry) 128 | 129 | except 
Exception: 130 | traceback.print_exc() 131 | break 132 | 133 | pcap.close() 134 | 135 | 136 | 137 | 138 | if __name__ == "__main__": 139 | parser = argparse.ArgumentParser() 140 | parser.add_argument('--root', default = './') 141 | parser.add_argument('--split_flags', default = True, type = str2bool) 142 | parser.add_argument('--attacker_ip', default = '192.168.2.5') 143 | parser.add_argument('--include_validation_attributes', default = True, type = str2bool) 144 | 145 | args = parser.parse_args() 146 | root = args.root 147 | split_flags = args.split_flags 148 | attacker_ip = args.attacker_ip 149 | include_validation_attributes = args.include_validation_attributes 150 | 151 | for file in os.listdir(root): 152 | if file.endswith('.pcap'): 153 | 154 | src_file_name = os.path.join(root, file) 155 | dst_file_name = src_file_name.replace('.pcap', '.csv') 156 | if os.path.isfile(dst_file_name) == False: 157 | print('Start processing: {}'.format(file)) 158 | extract_attributes(src_file_name, dst_file_name, attacker_ip, split_flags, include_validation_attributes) 159 | print('End processing: {}'.format(file)) 160 | -------------------------------------------------------------------------------- /pcap_parser.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import dpkt 4 | from print_packets import * 5 | import time 6 | import sys 7 | import datetime 8 | 9 | start_time = time.time() 10 | 11 | output_uniflows_separately = True 12 | 13 | pkt_num_list = [] 14 | time_list = [] 15 | ip_src_list = [] 16 | ip_dst_list = [] 17 | ip_len_list = [] 18 | proto_list = [] 19 | prt_src_list = [] 20 | prt_dst_list = [] 21 | tcp_psh_flag_list = [] 22 | tcp_rst_flag_list = [] 23 | tcp_urg_flag_list = [] 24 | 25 | def get_mean(l): 26 | if len(l) == 0: 27 | return 0 28 | elif len(l) == 1: 29 | return l[0] 30 | else: 31 | return np.absolute(np.diff(np.sort(l))).mean() 32 | 33 | packet_dict = {'pkt_num': pkt_num_list, 34 | 'time': time_list, 35 | 'ip_src': ip_src_list, 36 | 'ip_dst': ip_dst_list, 37 | 'ip_len': ip_len_list, 38 | 'proto': proto_list, 39 | 'prt_src': prt_src_list, 40 | 'prt_dst': prt_dst_list, 41 | 'tcp_psh': tcp_psh_flag_list, 42 | 'tcp_rst': tcp_rst_flag_list, 43 | 'tcp_urg': tcp_urg_flag_list} 44 | 45 | 46 | 47 | # All traffic is either TCP or UDP 48 | #f = open('nmap_scan_all_10x_network_sU_Scan.pcap', 'rb') 49 | #f = open('normal_operation.pcap', 'rb') 50 | sliding_window = False 51 | 52 | if len(sys.argv) > 1: 53 | print(sys.argv[1]) 54 | f = open(sys.argv[1], 'rb') 55 | output_file = sys.argv[1].replace(".pcap", "_WithWindowing.csv") 56 | else: 57 | f = open('bruteforce.pcap', 'rb') 58 | output_file = 'bruteforce.csv' 59 | 60 | if len(sys.argv) > 2 and sys.argv[2] == "0": 61 | output_file = sys.argv[1].replace(".pcap", ".csv") 62 | sliding_window = False 63 | 64 | pcap = dpkt.pcap.Reader(f) 65 | 66 | 67 | count = 1 68 | l2count = 0 69 | icmpcount = 0 70 | igmpcount = 0 71 | udpcount=0 72 | tcpcount=0 73 | unknown_transport_layer = 0 74 | 75 | for ts, buf in pcap: 76 | 77 | if count == 1: 78 | global_t0 = datetime.datetime.utcfromtimestamp(ts) 79 | 80 | if (count > 0): 81 | 82 | eth = dpkt.ethernet.Ethernet(buf) 83 | if not isinstance(eth.data, dpkt.ip.IP): 84 | #print('Non IP Packet type not supported %s\n' % eth.data.__class__.__name__) 85 | l2count+=1 86 | continue 87 | l3 = eth.data 88 | if isinstance(l3.data, dpkt.icmp.ICMP): 89 | icmpcount+=1 90 | #print("ICMP Packet disarded") 91 | continue 92 | 93 | if 
isinstance(l3.data, dpkt.igmp.IGMP): 94 | igmpcount+=1 95 | continue 96 | 97 | ###### If packet is icmp then continue 98 | 99 | 100 | 101 | #print(l3) 102 | l4 = l3.data 103 | 104 | if not isinstance(l4, dpkt.tcp.TCP) and not isinstance(l4, dpkt.udp.UDP): 105 | unknown_transport_layer += 1 106 | continue 107 | 108 | pkt_num_list.append(count) 109 | time_list.append(ts) 110 | ip_src_list.append(inet_to_str(l3.src)) 111 | ip_dst_list.append(inet_to_str(l3.dst)) 112 | ip_len_list.append(len(eth.data)) 113 | #ip_tos_list.append(l3.tos) 114 | 115 | if isinstance(l4, dpkt.tcp.TCP): 116 | tcpcount+=1 117 | proto_list.append('TCP') 118 | prt_src_list.append(l4.sport) 119 | prt_dst_list.append(l4.dport) 120 | #syn_flag = ( l4.flags & dpkt.tcp.TH_SYN ) != 0 121 | rst_flag = ( l4.flags & dpkt.tcp.TH_RST ) != 0 122 | psh_flag = ( l4.flags & dpkt.tcp.TH_PUSH) != 0 123 | #ack_flag = ( l4.flags & dpkt.tcp.TH_ACK ) != 0 124 | urg_flag = ( l4.flags & dpkt.tcp.TH_URG ) != 0 125 | tcp_psh_flag_list.append(psh_flag) 126 | tcp_rst_flag_list.append(rst_flag) 127 | tcp_urg_flag_list.append(urg_flag) 128 | 129 | 130 | 131 | if isinstance(l4, dpkt.udp.UDP): 132 | udpcount+=1 133 | proto_list.append('UDP') 134 | prt_src_list.append(l4.sport) 135 | prt_dst_list.append(l4.dport) 136 | # Need to add a value to these to maintain consistent rows across lists - will add zeros 137 | tcp_psh_flag_list.append(False) 138 | tcp_rst_flag_list.append(False) 139 | tcp_urg_flag_list.append(False) 140 | count+=1 141 | 142 | print("L2 packets dicarded = ", l2count) 143 | print("ICMP packets dicarded = ", icmpcount) 144 | print("IGMP packets dicarded = ", igmpcount) 145 | print("Unknown Trnsport Layer packets dicarded = ", unknown_transport_layer) 146 | print("UDP packets = ", udpcount) 147 | print("TCP packets = ", tcpcount) 148 | 149 | 150 | 151 | 152 | packet_df = pd.DataFrame(packet_dict) 153 | packet_df.set_index('pkt_num', inplace=True) 154 | 155 | 156 | # ************Create a list of tuples that identify each indepent flow 157 | 158 | tuplist_flowid = {} 159 | flow_count = 0 160 | 161 | flow_list_dict = {} 162 | tcpflowcount = 0 163 | udpflowcount = 0 164 | 165 | for index in range(len(pkt_num_list)): 166 | mytup = (ip_src_list[index], ip_dst_list[index], prt_src_list[index], prt_dst_list[index], proto_list[index]) 167 | 168 | str_temp = "_".join(str(v) for v in mytup) 169 | if str_temp not in tuplist_flowid: 170 | tuplist_flowid[str_temp] = flow_count 171 | flow_list_dict[flow_count] = [] 172 | flow_count += 1 173 | 174 | current_flow_id = tuplist_flowid[str_temp] 175 | flow_tup = ( 176 | ip_src_list[index], ip_dst_list[index], prt_src_list[index], prt_dst_list[index], proto_list[index], 177 | pkt_num_list[index], time_list[index], ip_len_list[index], tcp_psh_flag_list[index], tcp_rst_flag_list[index], 178 | tcp_urg_flag_list[index], current_flow_id) 179 | 180 | flow_list_dict[current_flow_id].append(flow_tup) 181 | 182 | if len(flow_list_dict[current_flow_id]) == 1: 183 | if flow_list_dict[current_flow_id][0][4] == 'TCP': 184 | tcpflowcount+=1 185 | if flow_list_dict[current_flow_id][0][4] == 'UDP': 186 | udpflowcount+=1 187 | 188 | del tuplist_flowid 189 | 190 | print("\nNumber of flows = ", flow_count) 191 | 192 | packet_dict = {'pkt_num': pkt_num_list, 193 | 'time': time_list, 194 | 'ip_src': ip_src_list, 195 | 'ip_dst': ip_dst_list, 196 | 'ip_len': ip_len_list, 197 | 'proto': proto_list, 198 | 'prt_src': prt_src_list, 199 | 'prt_dst': prt_dst_list, 200 | 'tcp_psh': tcp_psh_flag_list, 201 | 'tcp_rst': tcp_rst_flag_list, 
202 | 'tcp_urg': tcp_urg_flag_list} 203 | 204 | 205 | 206 | 207 | 208 | print("\nUnique flows = ", len(flow_list_dict)) 209 | 210 | print("\nflow list list element = ", flow_list_dict[0][0]) 211 | if len(flow_list_dict[0]) > 1: 212 | print("\nflow list list element = ", flow_list_dict[0][1]) 213 | if len(flow_list_dict[0]) > 2: 214 | print("\nflow list list element = ", flow_list_dict[0][2]) 215 | 216 | print("UDP flows = ", udpflowcount) 217 | print("TCP flows = ", tcpflowcount) 218 | 219 | class uniFlow: 220 | def __init__(self, ip_src, ip_dst, prt_src, prt_dst, proto, num_pkts, 221 | mean_iat, std_iat, min_iat, max_iat, mean_offset, mean_pkt_len, 222 | std_pkt_len, min_pkt_len, max_pkt_len, num_bytes, num_psh_flags, 223 | num_rst_flags, num_urg_flags): 224 | self.ip_src = ip_src 225 | self.ip_dst = ip_dst 226 | self.prt_src = prt_src 227 | self.prt_dst = prt_dst 228 | self.proto = proto 229 | self.num_pkts = num_pkts # num pkts in this flow 230 | self.mean_iat = mean_iat # ave interarrival time 231 | self.std_iat = std_iat # std dev of IAT (jitter-ish) 232 | self.min_iat = min_iat 233 | self.max_iat = max_iat 234 | self.mean_offset = mean_offset 235 | self.mean_pkt_len = mean_pkt_len # ave pckt len per flow 236 | self.std_pkt_len = std_pkt_len # std deviation of packet lengths 237 | self.max_pkt_len = max_pkt_len 238 | self.min_pkt_len = min_pkt_len 239 | self.num_bytes = num_bytes 240 | self.num_psh_flags = num_psh_flags 241 | self.num_rst_flags = num_rst_flags 242 | self.num_urg_flags = num_urg_flags 243 | self.processed = False 244 | 245 | meta_list = [] 246 | meta_list_time_0 = [] 247 | f_count = 0 248 | for key in flow_list_dict: 249 | flow_list = flow_list_dict[key] 250 | pkt = flow_list[0] # get first pkt in the flow 251 | 252 | 253 | #0 is ip_src 254 | #1 is ip_dst 255 | #2 is prt_src 256 | #3 is prt_dst 257 | #4 is proto 258 | #5 is pkt_num 259 | #6 is time 260 | #7 is ip_len 261 | #8 is tcp_psh_flag 262 | #9 is tcp_rst_flag 263 | #10 is tcp_urg_flag 264 | #11 is flow_id 265 | 266 | 267 | ip_src = pkt[0] 268 | ip_dst = pkt[1] 269 | prt_src = pkt[2] 270 | prt_dst = pkt[3] 271 | proto = pkt[4] 272 | if proto == 'TCP': 273 | proto = 6 274 | elif proto == 'UDP': 275 | proto = 17 276 | num_pkts = len(flow_list) 277 | # need to calc inter-arrival time and ave pkt length 278 | length_list = [] 279 | time_list = [] 280 | psh_list = [] 281 | rst_list = [] 282 | urg_list = [] 283 | for p in flow_list: 284 | length_list.append(p[7]) 285 | time_list.append(p[6]) 286 | psh_list.append(p[8]) 287 | rst_list.append(p[9]) 288 | urg_list.append(p[10]) 289 | mean_pkt_len = sum(length_list) / num_pkts 290 | pkt_len_arry = np.array(length_list) 291 | std_pkt_len = float(np.std(pkt_len_arry)) 292 | min_pkt_len = float(min(pkt_len_arry)) 293 | max_pkt_len = float(max(pkt_len_arry)) 294 | num_bytes = sum(length_list) 295 | num_psh_flags = sum(psh_list) 296 | num_rst_flags = sum(rst_list) 297 | num_urg_flags = sum(urg_list) 298 | if num_pkts > 1: 299 | time_list.sort(reverse = True) # put times in descending order 300 | t_diff = abs(np.diff(time_list)) 301 | mean_iat = sum(t_diff) / (num_pkts - 1) 302 | std_iat = np.std(t_diff) # std dev of IAT 303 | min_iat = min(t_diff) 304 | max_iat = max(t_diff) 305 | # Kenzi's apparently good feature is the mean time between the first 306 | # packet and each sucessive packet: (t2-t1) + (t3-t1) + (t4-t1) / n 307 | time_list.sort() # sort into ascending order now 308 | t0 = time_list[0] 309 | time_total = 0.0 310 | for f in range(1, num_pkts): 311 | time_total += 
abs(t0 - time_list[f]) 312 | mean_offset = time_total / (num_pkts - 1) 313 | 314 | else: 315 | mean_iat = 0.0 316 | std_iat = 0.0 317 | min_iat = 0.0 318 | max_iat = 0.0 319 | mean_offset = 0.0 320 | uniflow = uniFlow(ip_src, ip_dst, prt_src, prt_dst, proto, num_pkts, mean_iat, 321 | std_iat, min_iat, max_iat, mean_offset, mean_pkt_len, std_pkt_len, 322 | min_pkt_len, max_pkt_len, num_bytes, num_psh_flags, 323 | num_rst_flags, num_urg_flags) 324 | meta_list.append(uniflow) 325 | meta_list_time_0.append((datetime.datetime.utcfromtimestamp(time_list[0]) - global_t0).seconds // 60) 326 | f_count +=1 327 | 328 | 329 | def uniFlow2df(uniflow): 330 | df = pd.DataFrame(columns=['ip_src', 'ip_dst', 'prt_src', 'prt_dst', 'proto', 'num_pkts', 331 | 'mean_iat', 'std_iat', 'min_iat', 'max_iat', 'mean_offset', 'mean_pkt_len', 332 | 'std_pkt_len', 'min_pkt_len', 'max_pkt_len', 'num_bytes', 333 | 'num_psh_flags', 'num_rst_flags', 'num_urg_flags']) 334 | df.loc[0,'ip_src'] = str(uniflow.ip_src) 335 | df.loc[0,'ip_dst'] = str(uniflow.ip_dst) 336 | df.loc[0,'prt_src'] = int(uniflow.prt_src) 337 | df.loc[0,'prt_dst'] = int(uniflow.prt_dst) 338 | df.loc[0,'proto'] = int(uniflow.proto) 339 | df.loc[0,'num_pkts'] = int(uniflow.num_pkts) 340 | df.loc[0,'mean_iat'] = float(uniflow.mean_iat) 341 | df.loc[0,'std_iat'] = float(uniflow.std_iat) 342 | df.loc[0,'min_iat'] = float(uniflow.min_iat) 343 | df.loc[0,'max_iat'] = float(uniflow.max_iat) 344 | df.loc[0,'mean_offset'] = float(uniflow.mean_offset) 345 | df.loc[0,'mean_pkt_len'] = float(uniflow.mean_pkt_len) 346 | df.loc[0,'std_pkt_len'] = float(uniflow.std_pkt_len) 347 | df.loc[0,'min_pkt_len'] = float(uniflow.min_pkt_len) 348 | df.loc[0,'max_pkt_len'] = float(uniflow.max_pkt_len) 349 | df.loc[0,'num_bytes'] = int(uniflow.num_bytes) 350 | df.loc[0,'num_psh_flags'] = int(uniflow.num_psh_flags) 351 | df.loc[0,'num_rst_flags'] = int(uniflow.num_rst_flags) 352 | df.loc[0,'num_urg_flags'] = int(uniflow.num_urg_flags) 353 | return df 354 | 355 | 356 | if output_uniflows_separately: 357 | #feature_df = pd.DataFrame() 358 | feature_df = pd.DataFrame(columns=['ip_src', 'ip_dst', 'prt_src', 'prt_dst', 'proto', 359 | 'num_pkts', 'mean_iat', 'std_iat', 'min_iat', 360 | 'max_iat', 'mean_offset', 'mean_pkt_len', 'num_bytes', 'num_psh_flags', 361 | 'num_rst_flags', 'num_urg_flags']) 362 | 363 | for flow in meta_list: 364 | flow_df = uniFlow2df(flow) 365 | feature_df = feature_df.append(flow_df, ignore_index=True, sort=False) 366 | 367 | 368 | #feature_df.to_csv('robert_stealth.csv', sep=',') 369 | feature_df.to_csv('uniflow_' + output_file, sep=',') 370 | 371 | print('\nAll uniflows processed') 372 | 373 | # No convert uniflows into biflows 374 | #ßfor uniflow in feature_df: 375 | 376 | ################################## 377 | # Combine uniflows into biflows 378 | 379 | df_biflow = pd.DataFrame(columns=['ip_src', 'ip_dst', 'prt_src', 'prt_dst', 'proto', 'fwd_num_pkts', 'bwd_num_pkts', 380 | 'fwd_mean_iat', 'bwd_mean_iat', 'fwd_std_iat', 'bwd_std_iat', 'fwd_min_iat', 'bwd_min_iat', 381 | 'fwd_max_iat', 'bwd_max_iat','fwd_mean_offset', 'bwd_mean_offset', 'fwd_mean_pkt_len', 'bwd_mean_pkt_len', 382 | 'fwd_std_pkt_len', 'bwd_std_pkt_len', 'fwd_min_pkt_len', 'bwd_min_pkt_len', 383 | 'fwd_max_pkt_len', 'bwd_max_pkt_len', 'fwd_num_bytes', 'bwd_num_bytes', 384 | 'fwd_num_psh_flags', 'bwd_num_psh_flags', 385 | 'fwd_num_rst_flags', 'bwd_num_rst_flags', 'fwd_num_urg_flags', 'bwd_num_urg_flags']) 386 | 387 | #feature_df['processed'] = False 388 | 389 | #feature_row = 
feature_df.iloc[0,:].copy() 390 | # process the TCP flows 391 | print('\nProcessing TCP flows') 392 | sibilings_counts = {} 393 | delta_avg = {} 394 | bi_flow_time = [] 395 | 396 | num_uniflows = len(meta_list) 397 | for row_num in range(num_uniflows): 398 | current = meta_list[row_num] 399 | current_time = meta_list_time_0[row_num] 400 | if (current.processed == False): 401 | ip_src=current.ip_src 402 | ip_dst=current.ip_dst 403 | prt_src=current.prt_src 404 | prt_dst = current.prt_dst 405 | proto = current.proto 406 | # Get reverse tuple values 407 | rev_ip_src = ip_dst 408 | rev_ip_dst = ip_src 409 | rev_prt_src = prt_dst 410 | rev_prt_dst = prt_src 411 | for inner_row in range(row_num, num_uniflows): 412 | if (current.processed == True): 413 | continue; 414 | 415 | inner = meta_list[inner_row] 416 | inner_ip_src=inner.ip_src 417 | inner_ip_dst=inner.ip_dst 418 | inner_prt_src=inner.prt_src 419 | inner_prt_dst = inner.prt_dst 420 | inner_proto = inner.proto 421 | 422 | if (rev_ip_src == inner_ip_src) and (rev_ip_dst == inner_ip_dst) and (rev_prt_src == inner_prt_src) and (rev_prt_dst == inner_prt_dst) and (proto == inner_proto): 423 | # matching flow found! 424 | meta_list[row_num].processed = True 425 | meta_list[inner_row].processed = True 426 | 427 | biflowlist = [str(current_time)+'_'+current.ip_src, current.ip_src, current.ip_dst, current.prt_src, current.prt_dst, current.proto, 428 | current.num_pkts, inner.num_pkts, current.mean_iat, inner.mean_iat, current.std_iat, 429 | inner.std_iat, current.min_iat, inner.min_iat, current.max_iat, inner.max_iat,current.mean_offset, inner.mean_offset, 430 | current.mean_pkt_len, inner.mean_pkt_len, current.std_pkt_len, inner.std_pkt_len, 431 | current.min_pkt_len, inner.min_pkt_len, current.max_pkt_len, inner.max_pkt_len, 432 | current.num_bytes, inner.num_bytes, current.num_psh_flags, inner.num_psh_flags, 433 | current.num_rst_flags, inner.num_rst_flags, current.num_urg_flags, inner.num_urg_flags] 434 | columns_list=['sec_ip_src', 'ip_src', 'ip_dst', 'prt_src', 'prt_dst', 435 | 'proto', 'fwd_num_pkts', 'bwd_num_pkts', 436 | 'fwd_mean_iat', 'bwd_mean_iat', 'fwd_std_iat', 437 | 'bwd_std_iat', 'fwd_min_iat', 'bwd_min_iat', 438 | 'fwd_max_iat', 'bwd_max_iat', 'fwd_mean_offset', 'bwd_mean_offset', 'fwd_mean_pkt_len', 439 | 'bwd_mean_pkt_len', 'fwd_std_pkt_len', 'bwd_std_pkt_len', 440 | 'fwd_min_pkt_len', 'bwd_min_pkt_len', 441 | 'fwd_max_pkt_len', 'bwd_max_pkt_len', 'fwd_num_bytes', 442 | 'bwd_num_bytes', 'fwd_num_psh_flags', 'bwd_num_psh_flags', 443 | 'fwd_num_rst_flags', 'bwd_num_rst_flags', 'fwd_num_urg_flags', 444 | 'bwd_num_urg_flags'] 445 | 446 | df_biflow = df_biflow.append(pd.DataFrame([biflowlist], columns = columns_list), ignore_index=True, sort=False) 447 | else: 448 | continue 449 | else: 450 | continue 451 | 452 | 453 | 454 | print('\nProcessing UDP flows') 455 | # Process the UDP flows 456 | for row_num in range(num_uniflows): 457 | current = meta_list[row_num] 458 | current_time = meta_list_time_0[row_num] 459 | if (current.processed == False): 460 | ip_src=current.ip_src 461 | ip_dst=current.ip_dst 462 | prt_src=current.prt_src 463 | prt_dst = current.prt_dst 464 | proto = current.proto 465 | # Get reverse tuple values 466 | rev_ip_src = ip_dst 467 | rev_ip_dst = ip_src 468 | rev_prt_src = prt_dst 469 | rev_prt_dst = prt_src 470 | if proto == 17: 471 | meta_list[row_num].processed = True 472 | # UDP flows have no reverse direction so i have filled the redundant fields with 473 | # dupicate forward direction data 474 | biflowlist = 
[str(current_time)+'_'+current.ip_src,current.ip_src, current.ip_dst, current.prt_src, current.prt_dst, current.proto, 475 | current.num_pkts, current.num_pkts, current.mean_iat, current.mean_iat, current.std_iat, 476 | current.std_iat, current.min_iat, current.min_iat, current.max_iat, current.max_iat, current.mean_offset, current.mean_offset, 477 | current.mean_pkt_len, current.mean_pkt_len, current.std_pkt_len, current.std_pkt_len, 478 | current.min_pkt_len, current.min_pkt_len, current.max_pkt_len, current.max_pkt_len, 479 | current.num_bytes, current.num_bytes, current.num_psh_flags, current.num_psh_flags, 480 | current.num_rst_flags, current.num_rst_flags, current.num_urg_flags, current.num_urg_flags] 481 | columns_list=['sec_ip_src','ip_src', 'ip_dst', 'prt_src', 'prt_dst', 482 | 'proto', 'fwd_num_pkts', 'bwd_num_pkts', 483 | 'fwd_mean_iat', 'bwd_mean_iat', 'fwd_std_iat', 484 | 'bwd_std_iat', 'fwd_min_iat', 'bwd_min_iat', 485 | 'fwd_max_iat', 'bwd_max_iat','fwd_mean_offset', 'bwd_mean_offset','fwd_mean_pkt_len', 486 | 'bwd_mean_pkt_len', 'fwd_std_pkt_len', 'bwd_std_pkt_len', 487 | 'fwd_min_pkt_len', 'bwd_min_pkt_len', 488 | 'fwd_max_pkt_len', 'bwd_max_pkt_len', 'fwd_num_bytes', 489 | 'bwd_num_bytes', 'fwd_num_psh_flags', 'bwd_num_psh_flags', 490 | 'fwd_num_rst_flags', 'bwd_num_rst_flags', 'fwd_num_urg_flags', 491 | 'bwd_num_urg_flags'] 492 | 493 | df_biflow = df_biflow.append(pd.DataFrame([biflowlist], columns = columns_list), ignore_index=True, sort=False) 494 | else: 495 | continue 496 | 497 | 498 | 499 | del pkt_num_list 500 | del proto_list 501 | del prt_dst_list 502 | del prt_src_list 503 | del tcp_psh_flag_list 504 | del tcp_rst_flag_list 505 | del time_list 506 | del ip_dst_list 507 | del ip_src_list 508 | del ip_len_list 509 | del tcp_urg_flag_list 510 | del packet_df 511 | del packet_dict 512 | del meta_list 513 | del flow_list 514 | del flow_list_dict 515 | if 'feature_df' in globals(): 516 | del feature_df 517 | 518 | 519 | # Now add flow-bundle data 520 | # Add the numbe of flowws from each IP address and measure of the 521 | # variability of destination port numbers that packets are sent to 522 | # we will sort the port numbers in order then take the mean difference 523 | # a value of 1 should indicate an incremental port scanner 524 | 525 | print('Number of bi flows = {}'.format(np.size(df_biflow, axis = 0))) 526 | 527 | df_biflow['num_src_flows'] = 0 528 | df_biflow['src_ip_dst_prt_delta'] = 0 529 | 530 | biflow_column = 'sec_ip_src' 531 | 532 | if sliding_window == False: 533 | biflow_column = 'ip_src' 534 | 535 | addr_dict = dict(df_biflow[biflow_column].value_counts()) 536 | print(addr_dict) 537 | print('-------------') 538 | print( dict(df_biflow['ip_src'].value_counts())) 539 | print('\nComputing number of flows per source') 540 | for key, value in addr_dict.items(): 541 | df_biflow.loc[df_biflow[biflow_column] == key, 'num_src_flows'] = value 542 | print('\nComputing number of port destinations per source') 543 | for key, value in addr_dict.items(): 544 | rows = df_biflow[df_biflow[biflow_column] == key]['prt_dst'] 545 | l = list(rows) 546 | l.sort() 547 | ave_diff = 0 548 | if len(l) == 1: 549 | ave_diff = l[0] 550 | elif len(l) > 0: 551 | ave_diff = np.absolute(np.diff(l)).mean() 552 | df_biflow.loc[df_biflow[biflow_column] == key, 'src_ip_dst_prt_delta']= ave_diff 553 | 554 | 555 | 556 | df_biflow.to_csv('biflow_' + output_file, sep=',') 557 | 558 | 559 | 560 | # normal.pcap has 3305 packets and 1719 unique flows 561 | 562 | print('Parsing the file took {} 
seconds'.format(time.time() - start_time)) -------------------------------------------------------------------------------- /print_packets.py: -------------------------------------------------------------------------------- 1 | import dpkt 2 | from dpkt.compat import compat_ord 3 | import socket 4 | import datetime 5 | 6 | 7 | 8 | def mac_addr(address): 9 | """Convert a MAC address to a readable/printable string 10 | Args: 11 | address (str): a MAC address in hex form (e.g. '\x01\x02\x03\x04\x05\x06') 12 | Returns: 13 | str: Printable/readable MAC address 14 | """ 15 | return ':'.join('%02x' % compat_ord(b) for b in address) 16 | 17 | 18 | def inet_to_str(inet): 19 | """Convert inet object to a string 20 | Args: 21 | inet (inet struct): inet network address 22 | Returns: 23 | str: Printable/readable IP address 24 | """ 25 | # First try ipv4 and then ipv6 26 | try: 27 | return socket.inet_ntop(socket.AF_INET, inet) 28 | except ValueError: 29 | return socket.inet_ntop(socket.AF_INET6, inet) 30 | 31 | def print_packets(pcap): 32 | """Print out information about each packet in a pcap 33 | Args: 34 | pcap: dpkt pcap reader object (dpkt.pcap.Reader) 35 | """ 36 | # For each packet in the pcap process the contents 37 | for timestamp, buf in pcap: 38 | 39 | # Print out the timestamp in UTC 40 | print('Timestamp: ', str(datetime.datetime.utcfromtimestamp(timestamp))) 41 | 42 | # Unpack the Ethernet frame (mac src/dst, ethertype) 43 | eth = dpkt.ethernet.Ethernet(buf) 44 | print('Ethernet Frame: ', mac_addr(eth.src), mac_addr(eth.dst), eth.type) 45 | 46 | # Make sure the Ethernet data contains an IP packet 47 | if not isinstance(eth.data, dpkt.ip.IP): 48 | print('Non IP Packet type not supported %s\n' % eth.data.__class__.__name__) 49 | continue 50 | 51 | # Now unpack the data within the Ethernet frame (the IP packet) 52 | # Pulling out src, dst, length, fragment info, TTL, and Protocol 53 | ip = eth.data 54 | 55 | # Pull out fragment information (flags and offset all packed into off field, so use bitmasks) 56 | do_not_fragment = bool(ip.off & dpkt.ip.IP_DF) 57 | more_fragments = bool(ip.off & dpkt.ip.IP_MF) 58 | fragment_offset = ip.off & dpkt.ip.IP_OFFMASK 59 | 60 | # Print out the info 61 | print('IP: %s -> %s (len=%d ttl=%d DF=%d MF=%d offset=%d)\n' % \ 62 | (inet_to_str(ip.src), inet_to_str(ip.dst), ip.len, ip.ttl, do_not_fragment, more_fragments, fragment_offset)) 63 | 64 | 65 | def test(): 66 | """Open up a test pcap file and print out the packets""" 67 | with open('data/http.pcap', 'rb') as f: 68 | pcap = dpkt.pcap.Reader(f) 69 | print_packets(pcap) 70 | 71 | 72 | --------------------------------------------------------------------------------
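As a quick sanity check of the pipeline above, the following is a minimal, self-contained sketch (not a file in this repository) of the kind of train/evaluate step `classification.py` performs on the flow CSVs. It assumes two of the labelled bidirectional-flow files that `classification.py` expects (`biflow_normal.csv` and `biflow_sparta.csv`) are present in the working directory; the exact column handling here is illustrative only.

```python
# Minimal sketch: fit one of the classifiers listed in the README on a pair of
# labelled biflow CSVs and print per-class precision/recall/F1.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

normal = pd.read_csv('biflow_normal.csv')   # file names as loaded by classification.py
attack = pd.read_csv('biflow_sparta.csv')
df = pd.concat([normal, attack], ignore_index=True)

y = df['is_attack']                          # used only as the label here
# Drop the same identifier columns classification.py removes, plus the label itself
X = df.drop(columns=['is_attack', 'proto', 'ip_src', 'ip_dst']).fillna(-1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
clf = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))
```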