├── README.md ├── classifier ├── audi.py ├── base.py ├── big_data.py ├── byteiot.py ├── iot_sense.py └── tmc.py ├── dataset ├── private.py ├── unsw.py ├── utils.py └── yourthings.py └── main.py /README.md:
--------------------------------------------------------------------------------
1 | # IoTClassifier
2 | This repository contains re-implementations of several IoT device classification (i.e., passive device fingerprinting) systems.
3 | 
4 | The implemented algorithms process and analyze traffic data offline. TShark is used to extract the raw per-packet features and export them in CSV format; an example invocation is sketched in the Open Datasets section below.
5 | 
6 | 
7 | ## Paper References
8 | - [1] Marchal, Samuel, et al. "AuDI: Toward autonomous IoT device-type identification using periodic communication." IEEE Journal on Selected Areas in Communications 37.6 (2019): 1402-1412.
9 | - [2] Shahid, Mustafizur R., et al. "IoT devices recognition through network traffic analysis." 2018 IEEE International Conference on Big Data (Big Data). IEEE, 2018.
10 | - [3] Bezawada, B., Bachani, M., Peterson, J., Shirazi, H., Ray, I., & Ray, I. (2018). "IoTSense: Behavioral fingerprinting of IoT devices." arXiv preprint arXiv:1804.03852.
11 | - [4] Sivanathan, Arunan, et al. "Classifying IoT devices in smart environments using network traffic characteristics." IEEE Transactions on Mobile Computing 18.8 (2018): 1745-1759.
12 | - [5] Duan, Chenxin, et al. "ByteIoT: A practical IoT device identification system based on packet length distribution." IEEE Transactions on Network and Service Management (2021). doi: 10.1109/TNSM.2021.3130312.
13 | 
14 | ## Other Related Works
15 | - [6] Meidan, Yair, et al. "ProfilIoT: A machine learning approach for IoT device identification based on network traffic analysis." Proceedings of the Symposium on Applied Computing. 2017.
16 | - [7] Miettinen, Markus, et al. "IoT Sentinel: Automated device-type identification for security enforcement in IoT." 2017 IEEE 37th International Conference on Distributed Computing Systems (ICDCS). IEEE, 2017.
17 | - [8] Meidan, Yair, et al. "Detection of unauthorized IoT devices using machine learning techniques." arXiv preprint arXiv:1709.04647 (2017).
18 | - [9] Lopez-Martin, Manuel, et al. "Network traffic classifier with convolutional and recurrent neural networks for Internet of Things." IEEE Access 5 (2017): 18042-18050.
19 | - [10] Ortiz, J., C. Crawford, and F. Le. "DeviceMien: Network device behavior modeling for identifying unknown IoT devices." Proceedings of the International Conference on Internet of Things Design and Implementation (IoTDI '19). New York, NY, USA: ACM, 2019, pp. 106-117.
20 | - [11] Bremler-Barr, Anat, Haim Levy, and Zohar Yakhini. "IoT or NoT: Identifying IoT devices in a short time scale." NOMS 2020 - 2020 IEEE/IFIP Network Operations and Management Symposium. IEEE, 2020.
21 | - [12] Kolcun, Roman, et al. "The case for retraining of ML models for IoT device identification at the edge." arXiv preprint arXiv:2011.08605 (2020).
22 | - [13] Pinheiro, Antônio J., et al. "Identifying IoT devices and events based on packet length from encrypted traffic." Computer Communications 144 (2019): 8-17.
23 | 
24 | ## Open Datasets
25 | Several open datasets provide traffic traces collected from different IoT devices; they can be used to validate and reproduce the results reported in the papers above.
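A downloaded trace can be converted into the CSV layout expected by the `dataset` modules with a TShark command mirroring `run_tshark()` in `dataset/unsw.py` (a sketch; the paths and file name are illustrative):

```sh
tshark -r UNSWData/pcap-raw/16-09-23.pcap -T fields -E separator=$ \
    -e frame.number -e frame.time_epoch -e frame.len -e eth.src -e eth.dst \
    -e eth.type -e ip.src -e ip.dst -e ip.proto -e ip.opt.padding -e ip.opt.ra \
    -e tcp.srcport -e tcp.dstport -e tcp.stream -e tcp.window_size -e tcp.len \
    -e ssl.handshake.ciphersuite -e udp.srcport -e udp.dstport -e udp.stream \
    -e dns.qry.name -e http -e ntp > UNSWData/features/16-09-23.csv
```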
26 | 27 | - UNSW Dataset: https://iotanalytics.unsw.edu.au/iottraces.html 28 | - YourThings Dataset: https://yourthings.info/data/ 29 | -------------------------------------------------------------------------------- /classifier/audi.py: -------------------------------------------------------------------------------- 1 | from classifier.base import Classifier 2 | 3 | from scipy.fftpack import fft 4 | from sklearn.neighbors import KNeighborsClassifier 5 | from sklearn.preprocessing import MinMaxScaler 6 | import numpy as np 7 | 8 | import math 9 | 10 | PERIODIC_FLOWS_COUNTER = 0 # f1 11 | PERIODIC_FLOWS_LAYER = 1 # f2 12 | MEAN_PERIOD = 2 # f3 13 | SD_PERIOD = 3 # f4 14 | FLOWS_ONLY_PERIOD = 4 # f5 15 | FLOWS_MULTI_PERIOD = 5 # f6 16 | FLOWS_STATIC_SRC_PORT = 6 # f7 17 | FLOWS_MEAN_PORT_CHANGE = 7 # f8 18 | FLOWS_SD_PORT_CHANGE = 8 # f9 19 | PERIODS_IN_ALL_SUB_CAPTURES = 9 # f10 20 | MEAN_PERIOD_INFER_SUCCESS = 10 # f11 21 | SD_PERIOD_INFER_SUCCESS = 11 # f12 22 | PERIODS_5_29 = 12 # f13 23 | PERIODS_30_59 = 13 # f14 24 | PERIODS_60_119 = 14 # f15 25 | PERIODS_120_600 = 15 # f16 26 | MEAN_R_02_07 = 16 # f17 27 | MEAN_R_07_1 = 17 # f18 28 | MEAN_R_1_2 = 18 # f19 29 | MEAN_R_2 = 19 # f20 30 | SD_R_0_002 = 20 # f21 31 | SD_R_002_01 = 21 # f22 32 | SD_R_01 = 22 # f23 33 | MEAN_RN_02_07 = 23 # f24 34 | MEAN_RN_07_1 = 24 # f25 35 | MEAN_RN_1_2 = 25 # f26 36 | MEAN_RN_2 = 26 # f27 37 | SD_RN_0_002 = 27 # f28 38 | SD_RN_002_01 = 28 # f29 39 | SD_RN_01 = 29 # f30 40 | MEAN_RN_R_0_002 = 30 # f31 41 | MEAN_RN_R_002_01 = 31 # f32 42 | MEAN_RN_R_01 = 32 # f33 43 | 44 | 45 | class AuDIClassifier(Classifier): 46 | ''' 47 | Marchal, Samuel, et al. "Audi: Toward autonomous iot device-type 48 | identification using periodic communication." IEEE Journal on Selected 49 | Areas in Communications 37.6 (2019): 1402-1412. 
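The classifier buckets traffic into fixed-length windows (1800 s by default), encodes each flow as a per-second binary activity signal, infers candidate periods from FFT peaks validated by autocorrelation, and summarizes the result into 33 features (f1-f33) fed to a k-NN model.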
50 | '''
51 | 
52 | def __init__(self, interval=1800):
53 | super(AuDIClassifier, self).__init__()
54 | self.tag = 'audi'
55 | self.interval = interval
56 | self.selected_features = ['timestamp', 'address_src', 'address_dst', 'eth_type', 'ip_proto', 'tcp_srcport',
57 | 'tcp_dstport', 'udp_srcport', 'udp_dstport']
58 | 
59 | def get_dataset(self, raw_dataset, generator):
60 | counter = {address: {} for address in raw_dataset.iot_list.values()}
61 | dataset = {address: [] for address in raw_dataset.iot_list.values()}
62 | instance_index = 0
63 | for t, addr_src, addr_dst, eth_type, ip_proto, tcp_srcport, tcp_dstport, udp_srcport, udp_dstport in generator:
64 | t = int(float(t))
65 | if instance_index and t // self.interval != instance_index:
66 | for address, c in counter.items():
67 | if c:
68 | feature = self.get_feature(c)
69 | dataset[address].append(feature)
70 | instance_index = t // self.interval
71 | counter = {address: {} for address in raw_dataset.iot_list.values()}
72 | if not instance_index:
73 | instance_index = t // self.interval
74 | if addr_src in counter.keys():
75 | if '2-' + eth_type in counter[addr_src]:
76 | counter[addr_src]['2-' + eth_type]['s'][t - instance_index * self.interval] = 1
77 | else:
78 | counter[addr_src]['2-' + eth_type] = {}
79 | counter[addr_src]['2-' + eth_type]['s'] = [0] * self.interval
80 | counter[addr_src]['2-' + eth_type]['s'][t - instance_index * self.interval] = 1
81 | if ip_proto:
82 | if '3-' + ip_proto in counter[addr_src]:
83 | counter[addr_src]['3-' + ip_proto]['s'][t - instance_index * self.interval] = 1
84 | else:
85 | counter[addr_src]['3-' + ip_proto] = {}
86 | counter[addr_src]['3-' + ip_proto]['s'] = [0] * self.interval
87 | counter[addr_src]['3-' + ip_proto]['s'][t - instance_index * self.interval] = 1
88 | if tcp_dstport:
89 | if '4-t-' + tcp_dstport in counter[addr_src]:
90 | counter[addr_src]['4-t-' + tcp_dstport]['s'][t - instance_index * self.interval] = 1
91 | if tcp_srcport != counter[addr_src]['4-t-' + tcp_dstport]['last_port_src']:
92 | counter[addr_src]['4-t-' + tcp_dstport]['port_src_change_interval'].append(
93 | t - counter[addr_src]['4-t-' + tcp_dstport]['last_port_src_time'])
94 | counter[addr_src]['4-t-' + tcp_dstport]['last_port_src'] = tcp_srcport
95 | counter[addr_src]['4-t-' + tcp_dstport]['last_port_src_time'] = t
96 | 
97 | else:
98 | counter[addr_src]['4-t-' + tcp_dstport] = {}
99 | counter[addr_src]['4-t-' + tcp_dstport]['s'] = [0] * self.interval
100 | counter[addr_src]['4-t-' + tcp_dstport]['s'][t - instance_index * self.interval] = 1
101 | counter[addr_src]['4-t-' + tcp_dstport]['last_port_src'] = tcp_srcport
102 | counter[addr_src]['4-t-' + tcp_dstport]['last_port_src_time'] = t
103 | counter[addr_src]['4-t-' + tcp_dstport]['port_src_change_interval'] = []
104 | if udp_dstport:
105 | if '4-u-' + udp_dstport in counter[addr_src]:
106 | counter[addr_src]['4-u-' + udp_dstport]['s'][t - instance_index * self.interval] = 1
107 | if udp_srcport != counter[addr_src]['4-u-' + udp_dstport]['last_port_src']:
108 | counter[addr_src]['4-u-' + udp_dstport]['port_src_change_interval'].append(
109 | t - counter[addr_src]['4-u-' + udp_dstport]['last_port_src_time'])
110 | 
111 | counter[addr_src]['4-u-' + udp_dstport]['last_port_src'] = udp_srcport
112 | counter[addr_src]['4-u-' + udp_dstport]['last_port_src_time'] = t
113 | else:
114 | counter[addr_src]['4-u-' + udp_dstport] = {}
115 | counter[addr_src]['4-u-' + udp_dstport]['s'] = [0] * self.interval
116 | counter[addr_src]['4-u-' + udp_dstport]['s'][t - instance_index * self.interval] = 1
117 | counter[addr_src]['4-u-' + udp_dstport]['last_port_src'] = udp_srcport
118 | counter[addr_src]['4-u-' + udp_dstport]['last_port_src_time'] = t
119 | counter[addr_src]['4-u-' + udp_dstport]['port_src_change_interval'] = []
120 | for address, c in counter.items():
121 | if c:
122 | feature = self.get_feature(c)
123 | dataset[address].append(feature)
124 | return dataset
125 | 
126 | def train_model(self, dataset, training_set_archive):
127 | x_train, y_train = self.get_training_dataset(dataset, training_set_archive)
128 | scaler = MinMaxScaler()
129 | scaler.fit(x_train)
130 | x_train = scaler.transform(x_train)
131 | k_nn = KNeighborsClassifier()
132 | k_nn.fit(x_train, y_train)
133 | self.model = k_nn
134 | self.preprocessor = scaler.transform
135 | 
136 | @staticmethod
137 | def _f_period(x):
138 | y = fft(x)
139 | amplitudes = abs(y)
140 | t_amplitude = amplitudes.max() * 0.1
141 | candidate_period = []
142 | for i in range(1, len(amplitudes) - 1):
143 | if amplitudes[i] >= t_amplitude and amplitudes[i] > amplitudes[i - 1] and amplitudes[i] > amplitudes[i + 1]:
144 | candidate_period.append(i)
145 | candidate_period_t = []
146 | for i in range(0, len(candidate_period)):
147 | t = len(x) / candidate_period[i]
148 | t_upper_bound = int((1.1 * t))
149 | t_lower_bound = math.ceil(0.9 * t)
150 | for j in range(t_lower_bound, t_upper_bound):
151 | candidate_period_t.append(j)
152 | candidate_period_t = list(set(candidate_period_t))
153 | return candidate_period_t
154 | 
155 | @staticmethod
156 | def _r_rn(x, i):
157 | n = len(x)
158 | if i >= (n - 1) or i < 1:
159 | return []
160 | r_yy_i = np.dot(x[i:], x[:n - i])
161 | r_yy_i_l1 = np.dot(x[i - 1:], x[:n - i + 1])
162 | r_yy_i_u1 = np.dot(x[i + 1:], x[:n - i - 1])
163 | if r_yy_i <= r_yy_i_l1 or r_yy_i <= r_yy_i_u1:
164 | return []
165 | r = i * r_yy_i / n
166 | r_n = i * (r_yy_i + r_yy_i_l1 + r_yy_i_u1) / n
167 | return [r, r_n]
168 | 
169 | def flow_data_process(self, x):
170 | n = len(x)
171 | result = {}
172 | 
173 | # total flow
174 | total_result = {}
175 | candidate_period = self._f_period(x)
176 | for i in candidate_period:
177 | r_rn_result = self._r_rn(x, i)
178 | if r_rn_result:
179 | total_result[i] = r_rn_result
180 | # top_half_flow
181 | top_half_flow = x[:n // 2]
182 | candidate_period = self._f_period(top_half_flow)
183 | top_half_result = {}
184 | for i in candidate_period:
185 | r_rn_result = self._r_rn(top_half_flow, i)
186 | if r_rn_result:
187 | top_half_result[i] = r_rn_result
188 | 
189 | # middle_half_flow
190 | middle_half_flow = x[n // 4:n * 3 // 4]
191 | candidate_period = self._f_period(middle_half_flow)
192 | middle_half_result = {}
193 | for i in candidate_period:
194 | r_rn_result = self._r_rn(middle_half_flow, i)
195 | if r_rn_result:
196 | middle_half_result[i] = r_rn_result
197 | 
198 | # latter_half_flow
199 | latter_half_flow = x[n // 2:]
200 | candidate_period = self._f_period(latter_half_flow)
201 | latter_half_result = {}
202 | for i in candidate_period:
203 | r_rn_result = self._r_rn(latter_half_flow, i)
204 | if r_rn_result:
205 | latter_half_result[i] = r_rn_result
206 | 
207 | result['total'] = total_result
208 | result['top_half'] = top_half_result
209 | result['middle_half'] = middle_half_result
210 | result['latter_half'] = latter_half_result
211 | return result
212 | 
213 | def get_feature(self, c):
214 | feature = [0] * 33
215 | all_period,
all_period_candidate_match = [], [] 216 | all_r_mean, all_r_sd, all_rn_mean, all_rn_sd = [], [], [], [] 217 | src_port_counter = [] 218 | for flow_key, flow in c.items(): 219 | flow_result = self.flow_data_process(flow['s']) 220 | r_list = [] 221 | rn_list = [] 222 | period_set = set() 223 | if 'port_src_change_interval' in flow: 224 | src_port_counter.append(len(flow['port_src_change_interval'])) 225 | for index, v in flow_result['total'].items(): 226 | temporary_counter = 0 227 | if index in flow_result['top_half']: 228 | temporary_counter += 1 229 | if index in flow_result['middle_half']: 230 | temporary_counter += 1 231 | if index in flow_result['latter_half']: 232 | temporary_counter += 1 233 | if temporary_counter >= 2: 234 | all_period_candidate_match.append(temporary_counter) 235 | period_set.add(index) 236 | all_period.append(index) 237 | r_list.append(v[0]) 238 | rn_list.append(v[1]) 239 | if temporary_counter == 3: 240 | feature[PERIODS_IN_ALL_SUB_CAPTURES] += 1 241 | if period_set: 242 | feature[PERIODIC_FLOWS_COUNTER] += 1 243 | feature[PERIODIC_FLOWS_LAYER] = max(feature[PERIODIC_FLOWS_LAYER], int(flow_key[0])) 244 | if len(period_set) == 1: 245 | feature[FLOWS_ONLY_PERIOD] += 1 246 | else: 247 | feature[FLOWS_MULTI_PERIOD] += 1 248 | if len(r_list) != 0: 249 | all_r_mean.append(np.mean(r_list)) 250 | all_rn_mean.append(np.mean(rn_list)) 251 | if len(r_list) > 1: 252 | all_r_sd.append(np.std(r_list, ddof=1)) 253 | all_rn_sd.append(np.std(rn_list, ddof=1)) 254 | else: 255 | all_r_sd.append(0) 256 | all_rn_sd.append(0) 257 | if all_period: 258 | feature[MEAN_PERIOD] = np.mean(all_period) 259 | feature[FLOWS_STATIC_SRC_PORT] = src_port_counter.count(0) 260 | if src_port_counter: 261 | feature[FLOWS_MEAN_PORT_CHANGE] = np.mean(src_port_counter) 262 | if len(src_port_counter) >= 2: 263 | feature[FLOWS_SD_PORT_CHANGE] = np.std(src_port_counter, ddof=1) 264 | if len(all_period) >= 2: 265 | feature[SD_PERIOD] = np.std(all_period, ddof=1) 266 | if all_period_candidate_match: 267 | feature[MEAN_PERIOD_INFER_SUCCESS] = np.mean(all_period_candidate_match) 268 | if len(all_period_candidate_match) >= 2: 269 | feature[SD_PERIOD_INFER_SUCCESS] = np.std(all_period_candidate_match, ddof=1) 270 | for i in all_period: 271 | if i < 5 or i > 600: 272 | continue 273 | elif i < 30: 274 | feature[PERIODS_5_29] += 1 275 | elif i < 60: 276 | feature[PERIODS_30_59] += 1 277 | elif i < 120: 278 | feature[PERIODS_60_119] += 1 279 | else: 280 | feature[PERIODS_120_600] += 1 281 | for i in all_r_mean: 282 | if i < 0.2: 283 | continue 284 | if i < 0.7: 285 | feature[MEAN_R_02_07] += 1 286 | elif i < 1: 287 | feature[MEAN_R_07_1] += 1 288 | elif i < 2: 289 | feature[MEAN_R_1_2] += 1 290 | else: 291 | feature[MEAN_R_2] += 1 292 | for i in all_r_sd: 293 | if i < 0.02: 294 | feature[SD_R_0_002] += 1 295 | elif i < 0.1: 296 | feature[SD_R_002_01] += 1 297 | else: 298 | feature[SD_R_01] += 1 299 | for i in all_rn_mean: 300 | if i < 0.2: 301 | continue 302 | if i < 0.7: 303 | feature[MEAN_RN_02_07] += 1 304 | elif i < 1: 305 | feature[MEAN_RN_07_1] += 1 306 | elif i < 2: 307 | feature[MEAN_RN_1_2] += 1 308 | else: 309 | feature[MEAN_RN_2] += 1 310 | for i in all_rn_sd: 311 | if i < 0.02: 312 | feature[SD_RN_0_002] += 1 313 | elif i < 0.1: 314 | feature[SD_RN_002_01] += 1 315 | else: 316 | feature[SD_RN_01] += 1 317 | for i in range(0, len(all_r_mean)): 318 | t = all_rn_mean[i] - all_r_mean[i] 319 | if t < 0.02: 320 | feature[MEAN_RN_R_0_002] += 1 321 | elif t < 0.1: 322 | feature[MEAN_RN_R_002_01] += 1 323 | else: 
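# f33: flows whose mean aggregated autocorrelation (r_n) exceeds the mean raw autocorrelation (r) by at least 0.1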
324 | feature[MEAN_RN_R_01] += 1 325 | return feature 326 | -------------------------------------------------------------------------------- /classifier/base.py: -------------------------------------------------------------------------------- 1 | from dataset import unsw, yourthings, private 2 | 3 | import pickle 4 | 5 | import numpy as np 6 | 7 | 8 | class Classifier(object): 9 | def __init__(self): 10 | self._dataset = { 11 | 'UNSW': unsw.UNSWDataset(), 12 | 'Yourthings': yourthings.YourthingsDataset(), 13 | 'Private': private.PrivateDataset() 14 | } 15 | self.selected_features = [] 16 | self.tag = 'base' 17 | self.model = None 18 | self.preprocessor = None 19 | 20 | def get_dataset(self, raw_dataset, generator): 21 | raise NotImplementedError 22 | 23 | def train_model(self, dataset, training_set_archive): 24 | raise NotImplementedError 25 | 26 | @staticmethod 27 | def get_training_dataset(dataset, training_set_archive): 28 | with open(training_set_archive, 'rb') as f: 29 | train_set = pickle.load(f) 30 | x, y = [], [] 31 | for address, features in train_set.items(): 32 | for feature in features: 33 | x.append(feature) 34 | y.append(dataset.label_map[address]) 35 | x_train, y_train = np.array(x), np.array(y) 36 | # print(x_train.shape, y_train.shape) 37 | return x_train, y_train 38 | 39 | def get_archived_dataset(self, dataset_tag, train_range=None, test_range=None): 40 | if dataset_tag not in self._dataset: 41 | raise ValueError("Unsupported Dataset") 42 | raw_dataset = self._dataset[dataset_tag] 43 | if not train_range: 44 | train_range = raw_dataset.default_training_range['train'] 45 | generator = raw_dataset.data_generator(**train_range, features=self.selected_features) 46 | training_set = self.get_dataset(raw_dataset, generator) 47 | with open(self.tag + '-train.pkl', 'wb') as f: 48 | pickle.dump(training_set, f) 49 | 50 | if not test_range: 51 | test_range = raw_dataset.default_training_range['test'] 52 | generator = raw_dataset.data_generator(**test_range, features=self.selected_features) 53 | test_set = self.get_dataset(raw_dataset, generator) 54 | with open(self.tag + '-test.pkl', 'wb') as f: 55 | pickle.dump(test_set, f) 56 | 57 | def train_on_unsw_dataset(self): 58 | training_set = self.tag + '-train.pkl' 59 | self.train_model(self._dataset['UNSW'], training_set) 60 | 61 | def train_on_yourthings_dataset(self): 62 | training_set = self.tag + '-train.pkl' 63 | self.train_model(self._dataset['Yourthings'], training_set) 64 | 65 | def train_on_private_dataset(self): 66 | training_set = self.tag + '-train.pkl' 67 | self.train_model(self._dataset['Private'], training_set) 68 | 69 | def test(self, dataset, test_set_archive=None): 70 | true_count, false_count = 0, 0 71 | with open(test_set_archive, 'rb') as f: 72 | test_set = pickle.load(f) 73 | for address, features in test_set.items(): 74 | for feature in features: 75 | x_test = np.array([feature]) 76 | if self.preprocessor: 77 | x_test = self.preprocessor(x_test) 78 | y_predict = self.model.predict(x_test) 79 | if y_predict[0] == dataset.label_map[address]: 80 | true_count += 1 81 | else: 82 | false_count += 1 83 | print(true_count, true_count + false_count, true_count / (true_count + false_count)) 84 | accuracy = true_count / (true_count + false_count) 85 | print(accuracy) 86 | return accuracy 87 | 88 | 89 | -------------------------------------------------------------------------------- /classifier/big_data.py: -------------------------------------------------------------------------------- 1 | from classifier.base import 
Classifier 2 | 3 | from sklearn.ensemble import RandomForestClassifier 4 | 5 | 6 | class BigDataClassifier(Classifier): 7 | def __init__(self, interval=1800, n=10): 8 | ''' 9 | Shahid, Mustafizur R., et al. "IoT devices recognition through network traffic 10 | analysis." 2018 IEEE international conference on big data (big data). IEEE, 2018. 11 | ''' 12 | super(BigDataClassifier, self).__init__() 13 | self.interval = interval 14 | self.n = n 15 | self.tag = 'big-data' 16 | self.selected_features = ['timestamp', 'size', 'address_src', 'address_dst', 'tcp_stream'] 17 | 18 | @staticmethod 19 | def _get_sample(stream_packet, dataset, n, n1, n2): 20 | for address, stream in stream_packet.items(): 21 | features = [] 22 | for _, s_series in stream.items(): 23 | feature = [] 24 | if len(s_series['s-out']) < n: 25 | feature.extend(s_series['s-out']) 26 | feature.extend([0] * (n - len(s_series['s-out']))) 27 | for t_i in range(1, len(s_series['t-out'])): 28 | feature.append(s_series['t-out'][t_i] - s_series['t-out'][t_i - 1]) 29 | feature.extend([0] * (n1 - len(feature))) 30 | else: 31 | feature.extend(s_series['s-out'][:n]) 32 | for t_i in range(1, n): 33 | feature.append(s_series['t-out'][t_i] - s_series['t-out'][t_i - 1]) 34 | if len(s_series['s-in']) < n: 35 | feature.extend(s_series['s-in']) 36 | feature.extend([0] * (n - len(s_series['s-in']))) 37 | for t_i in range(1, len(s_series['t-in'])): 38 | feature.append(s_series['t-in'][t_i] - s_series['t-in'][t_i - 1]) 39 | feature.extend([0] * (n2 - len(feature))) 40 | else: 41 | feature.extend(s_series['s-in'][:n]) 42 | for t_i in range(1, n): 43 | feature.append(s_series['t-in'][t_i] - s_series['t-in'][t_i - 1]) 44 | if feature: 45 | features.append(feature) 46 | if features: 47 | dataset[address].extend(features) 48 | 49 | def get_dataset(self, raw_dataset, generator): 50 | instance_index = 0 51 | n1, n2 = 2*self.n - 1, 4*self.n - 2 52 | dataset = {address: [] for address in raw_dataset.iot_list.values()} 53 | stream_packet = {address: {} for address in raw_dataset.iot_list.values()} 54 | for t, size, address_src, address_dst, tcp_stream in generator: 55 | t, size = float(t), int(size) 56 | if instance_index and int(t) // self.interval != instance_index: 57 | self._get_sample(stream_packet, dataset, self.n, n1, n2) 58 | instance_index = int(t) // self.interval 59 | stream_packet = {address: {} for address in raw_dataset.iot_list.values()} 60 | if not instance_index: 61 | instance_index = int(t) // self.interval 62 | if tcp_stream: 63 | if address_src in raw_dataset.iot_list.values(): 64 | if not stream_packet[address_src].get(int(tcp_stream), None): 65 | stream_packet[address_src][int(tcp_stream)] = {'s-in': [], 's-out': [], 't-in': [], 't-out': []} 66 | stream_packet[address_src][int(tcp_stream)]['s-out'].append(size) 67 | stream_packet[address_src][int(tcp_stream)]['t-out'].append(t) 68 | if address_dst in raw_dataset.iot_list.values(): 69 | if not stream_packet[address_dst].get(int(tcp_stream), None): 70 | stream_packet[address_dst][int(tcp_stream)] = {'s-in': [], 's-out': [], 't-in': [], 't-out': []} 71 | stream_packet[address_dst][int(tcp_stream)]['s-in'].append(size) 72 | stream_packet[address_dst][int(tcp_stream)]['t-in'].append(t) 73 | self._get_sample(stream_packet, dataset, self.n, n1, n2) 74 | return dataset 75 | 76 | def train_model(self, dataset, training_set_archive): 77 | x_train, y_train = self.get_training_dataset(dataset, training_set_archive) 78 | rf = RandomForestClassifier(n_estimators=100) 79 | rf.fit(x_train, y_train) 80 | 
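# no scaling preprocessor is set: random forests are insensitive to feature scaling, so base.test() consumes the raw vectors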
self.model = rf 81 | 82 | -------------------------------------------------------------------------------- /classifier/byteiot.py: -------------------------------------------------------------------------------- 1 | from classifier.base import Classifier 2 | 3 | import math 4 | import pickle 5 | 6 | 7 | class ByteIoTClassifier(Classifier): 8 | def __init__(self, k=1, interval=1800, bidirectional=True, metric='hellinger'): 9 | super(ByteIoTClassifier, self).__init__() 10 | self.k = k 11 | self.interval = interval 12 | self.bidirectional = bidirectional 13 | self.metric = { 14 | 'hellinger': self.calculate_hellinger_distance, 15 | 'total-variation': self.calculate_total_variation_distance 16 | }.get(metric) 17 | self.selected_features = ['timestamp', 'size', 'address_src', 'address_dst', 'ip_src', 'ip_dst', 'ip_proto'] 18 | self.tag = 'byteiot' 19 | self._sample_set = None 20 | 21 | @staticmethod 22 | def _get_sample(counter, dataset, half_dataset): 23 | for addr, c in counter.items(): 24 | if c: 25 | total = sum((v for v in c.values())) 26 | half_c = {k: v for k, v in c.items() if k[1] == 0x01} 27 | half_total = sum((v for v in half_c.values())) 28 | for k in c.keys(): 29 | c[k] = c[k] / total 30 | for k in half_c.keys(): 31 | half_c[k] = half_c[k] / half_total 32 | dataset[addr].append(c) 33 | if half_c: 34 | half_dataset[addr].append(half_c) 35 | 36 | def get_dataset(self, raw_dataset, generator): 37 | counter = {addr: {} for addr in raw_dataset.iot_list.values()} 38 | dataset = {addr: [] for addr in raw_dataset.iot_list.values()} 39 | half_dataset = {addr: [] for addr in raw_dataset.iot_list.values()} 40 | instance_index = 0 41 | for t, size, addr_src, addr_dst, _ip_src, _ip_dst, _ip_proto in generator: 42 | t = int(float(t)) 43 | size = int(size) 44 | if instance_index and t // self.interval != instance_index: 45 | self._get_sample(counter, dataset, half_dataset) 46 | instance_index = t // self.interval 47 | counter = {addr: {} for addr in raw_dataset.iot_list.values()} 48 | if not instance_index: 49 | instance_index = t // self.interval 50 | if addr_src in counter.keys(): 51 | counter[addr_src][(size, 0x01)] = counter[addr_src].get((size, 0x01), 0) + 1 52 | if addr_dst in counter.keys(): 53 | counter[addr_dst][(size, 0x00)] = counter[addr_dst].get((size, 0x00), 0) + 1 54 | self._get_sample(counter, dataset, half_dataset) 55 | if self.bidirectional: 56 | return dataset 57 | else: 58 | return half_dataset 59 | 60 | def train_model(self, dataset, training_set_archive): 61 | with open(training_set_archive, 'rb') as f: 62 | train_set = pickle.load(f) 63 | self._sample_set = train_set 64 | 65 | def test(self, dataset, test_set_archive=None): 66 | true_count, false_count = 0, 0 67 | with open(test_set_archive, 'rb') as f: 68 | test_set = pickle.load(f) 69 | for address, sample in test_set.items(): 70 | for s in sample: 71 | d, result, nearest_neighbors = 1.0, '', [] 72 | for train_addr, train_samples in self._sample_set.items(): 73 | for train_s in train_samples: 74 | distance = self.metric(s, train_s) 75 | if len(nearest_neighbors) < self.k: 76 | nearest_neighbors.append((distance, train_addr)) 77 | else: 78 | max_distance = max([nn[0] for nn in nearest_neighbors]) 79 | if distance < max_distance: 80 | nearest_neighbors = [nn for nn in nearest_neighbors if nn[0] < max_distance] 81 | nearest_neighbors.append((distance, train_addr)) 82 | counter, min_distance, max_count = {}, {}, 0 83 | for nn in nearest_neighbors: 84 | counter[nn[1]] = counter.get(nn[1], 0) + 1 85 | if nn[0] < 
min_distance.get(nn[1], 1.0):
86 | min_distance[nn[1]] = nn[0]
87 | for nn in nearest_neighbors:
88 | if counter[nn[1]] > max_count or (counter[nn[1]] == max_count and min_distance[nn[1]] < d):
89 | max_count, d, result = counter[nn[1]], min_distance[nn[1]], nn[1]
90 | if result == address:
91 | true_count += 1
92 | else:
93 | false_count += 1
94 | print(true_count, true_count + false_count, true_count / (true_count + false_count))
95 | accuracy = true_count / (true_count + false_count)
96 | print(accuracy)
97 | return accuracy
98 | 
99 | @staticmethod
100 | def calculate_total_variation_distance(d1, d2):
101 | s1 = set(d1.keys())
102 | s2 = set(d2.keys())
103 | s = s1 | s2
104 | d = 0.0
105 | for packet_header in s:
106 | d += abs(d1.get(packet_header, 0.0) - d2.get(packet_header, 0.0))
107 | d = d / 2
108 | return d
109 | 
110 | @staticmethod
111 | def calculate_hellinger_distance(d1, d2):
112 | s1 = set(d1.keys())
113 | s2 = set(d2.keys())
114 | s = s1 | s2
115 | d = 0.0
116 | for packet_header in s:
117 | p1 = d1.get(packet_header, 0.0)
118 | p2 = d2.get(packet_header, 0.0)
119 | d += (math.sqrt(p1) - math.sqrt(p2)) ** 2
120 | d = math.sqrt(d) / math.sqrt(2)
121 | return d
122 | 
-------------------------------------------------------------------------------- /classifier/iot_sense.py: --------------------------------------------------------------------------------
1 | from classifier.base import Classifier
2 | 
3 | import pickle
4 | 
5 | from sklearn.ensemble import GradientBoostingClassifier
6 | import numpy as np
7 | 
8 | IP = 0
9 | ICMP = 1
10 | ICMPv6 = 2
11 | EAPoL = 3
12 | TCP = 4
13 | UDP = 5
14 | HTTP = 6
15 | HTTPS = 7
16 | DHCP = 8
17 | BOOTP = 9
18 | SSDP = 10
19 | DNS = 11
20 | MDNS = 12
21 | NTP = 13
22 | IP_OPTION_PADDING = 14
23 | IP_OPTION_RA = 15
24 | ENTROPY = 16
25 | TCP_WINDOW_SIZE = 17
26 | TCP_PAYLOAD_LENGTH = 18
27 | 
28 | 
29 | class IoTSenseClassifier(Classifier):
30 | '''
31 | Bezawada, B., Bachani, M., Peterson, J., Shirazi, H., Ray, I., & Ray, I. (2018).
32 | IoTSense: Behavioral fingerprinting of IoT devices.
33 | arXiv preprint arXiv:1804.03852.
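Each packet is mapped to a 19-entry protocol/entropy vector; five consecutive vectors are concatenated into one behavioral fingerprint, and per-device one-vs-rest gradient boosting models vote on the final label.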
34 | '''
35 | 
36 | def __init__(self, interval, entropy_feature_archive):
37 | super(IoTSenseClassifier, self).__init__()
38 | self.tag = 'iot-sense'
39 | self.interval = interval
40 | self._entropy_feature_archive = entropy_feature_archive
41 | self.selected_features = ['timestamp', 'address_src', 'address_dst', 'ip_proto', 'eth_type', 'ip_opt_padding',
42 | 'ip_opt_ra', 'tcp_srcport', 'tcp_dstport', 'tcp_window_size', 'tcp_len',
43 | 'udp_srcport', 'udp_dstport', 'http', 'ntp'
44 | ]
45 | 
46 | @staticmethod
47 | def _get_sample(packet_series, dataset):
48 | for addr, packet_vectors in packet_series.items():
49 | interval_vector = []
50 | for i in range(0, len(packet_vectors), 5):
51 | feature_vector = []
52 | if i + 5 <= len(packet_vectors):
53 | for j in range(i, i + 5):
54 | feature_vector.extend(packet_vectors[j])
55 | interval_vector.append(feature_vector)
56 | if interval_vector:
57 | dataset[addr].append(interval_vector)
58 | 
59 | def get_dataset(self, raw_dataset, generator):
60 | instance_index = 0
61 | dataset = {addr: [] for addr in raw_dataset.iot_list.values()}
62 | packet_series = {addr: [] for addr in raw_dataset.iot_list.values()}
63 | entropy_feature = open(self._entropy_feature_archive, 'r')
64 | for t, addr_src, addr_dst, ip_proto, eth_type, ip_option_padding, ip_option_ra, tcp_srcport, tcp_dstport, \
65 | tcp_window_size, tcp_len, udp_srcport, udp_dstport, http, ntp in generator:
66 | t = int(float(t))
67 | entropy = entropy_feature.readline().split(',')[-1].strip()
68 | entropy = float(entropy) if entropy else 0
69 | if instance_index and t // self.interval != instance_index:
70 | self._get_sample(packet_series, dataset)
71 | instance_index = t // self.interval
72 | packet_series = {mac: [] for mac in raw_dataset.iot_list.values()}
73 | if not instance_index:
74 | instance_index = t // self.interval
75 | packet_vector = [0] * 19
76 | if eth_type == '0x0000888e':  # EAPoL (802.1X)
77 | packet_vector[EAPoL] = 1
78 | elif eth_type == '0x00000800' or eth_type == '0x000086dd':
79 | packet_vector[IP] = 1
80 | packet_vector[ENTROPY] = entropy
81 | if ip_option_padding:
82 | packet_vector[IP_OPTION_PADDING] = 1
83 | if ip_option_ra:
84 | packet_vector[IP_OPTION_RA] = 1
85 | if ip_proto == '6':
86 | packet_vector[TCP] = 1
87 | packet_vector[TCP_WINDOW_SIZE] = int(tcp_window_size) if tcp_window_size else 0
88 | packet_vector[TCP_PAYLOAD_LENGTH] = int(tcp_len) if tcp_len else 0
89 | if http:
90 | packet_vector[HTTP] = 1
91 | if tcp_srcport == '443' or tcp_dstport == '443':
92 | packet_vector[HTTPS] = 1
93 | elif ip_proto == '17':
94 | packet_vector[UDP] = 1
95 | if ntp:
96 | packet_vector[NTP] = 1
97 | elif udp_srcport == '53' or udp_dstport == '53':
98 | packet_vector[DNS] = 1
99 | elif udp_srcport == '5353' or udp_dstport == '5353':
100 | packet_vector[MDNS] = 1
101 | elif udp_srcport in ['67', '68'] or udp_dstport in ['67', '68']:
102 | packet_vector[DHCP] = 1
103 | packet_vector[BOOTP] = 1
104 | elif udp_srcport == '1900' or udp_dstport == '1900':
105 | packet_vector[SSDP] = 1
106 | elif ip_proto == '1':
107 | packet_vector[ICMP] = 1
108 | else:
109 | continue
110 | if addr_src in raw_dataset.iot_list.values():
111 | packet_series[addr_src].append(packet_vector)
112 | if addr_dst in raw_dataset.iot_list.values():
113 | packet_series[addr_dst].append(packet_vector)
114 | self._get_sample(packet_series, dataset)
115 | return dataset
116 | 
117 | def train_model(self, dataset, training_set_archive):
118 | with open(training_set_archive, 'rb') as f:
119 | train_set = pickle.load(f)
120 | models = 
{addr: None for addr in dataset.iot_list.values()}
121 | for addr in dataset.iot_list.values():
122 | x, y = [], []
123 | for sample_addr, features in train_set.items():
124 | for feature in features:
125 | x.extend(feature)  # each sample is a window of 5-packet vectors; flatten to individual vectors
126 | if sample_addr == addr:
127 | y.extend([1] * len(feature))
128 | else:
129 | y.extend([2] * len(feature))
130 | x_train, y_train = np.array(x), np.array(y)
131 | gbdt = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1)
132 | gbdt.fit(x_train, y_train)
133 | models[addr] = gbdt
134 | final_models = {k: v for k, v in models.items() if v is not None}
135 | self.model = final_models
136 | 
137 | def test(self, dataset, test_set_archive=None):
138 | with open(test_set_archive, 'rb') as f:
139 | test_set = pickle.load(f)
140 | true_count, false_count = 0, 0
141 | for addr, features in test_set.items():
142 | for feature in features:
143 | y_counter = {}
144 | for F in feature:
145 | x_test = np.array([F])
146 | result, p = '', 0.0
147 | for test_addr, model in self.model.items():
148 | y_predict = model.predict_proba(x_test)
149 | if y_predict[0][0] > y_predict[0][1] and y_predict[0][0] > p:
150 | p = y_predict[0][0]
151 | result = test_addr
152 | if result:
153 | y_counter[result] = y_counter.get(result, 0) + 1
154 | result, max_count = -1, 0
155 | for k, v in y_counter.items():
156 | if v > max_count:
157 | max_count = v
158 | result = k
159 | if result == addr:
160 | true_count += 1
161 | else:
162 | false_count += 1
163 | print(true_count, true_count + false_count, true_count / (true_count + false_count))
164 | accuracy = true_count / (true_count + false_count)
165 | print(accuracy)
166 | return accuracy
167 | 
-------------------------------------------------------------------------------- /classifier/tmc.py: --------------------------------------------------------------------------------
1 | from classifier.base import Classifier
2 | 
3 | from sklearn.naive_bayes import MultinomialNB
4 | from sklearn.ensemble import RandomForestClassifier
5 | import numpy as np
6 | 
7 | 
8 | class TMCClassifier(Classifier):
9 | '''
10 | Sivanathan, Arunan, et al. "Classifying IoT devices in smart environments
11 | using network traffic characteristics." IEEE Transactions on Mobile
12 | Computing 18.8 (2018): 1745-1759.
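Per-window flow statistics are combined with the outputs of stage-0 Multinomial Naive Bayes classifiers over DNS names, remote ports, and cipher suites; the combined vector is then classified by a Random Forest.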
13 | ''' 14 | 15 | def __init__(self, interval=1800): 16 | super(TMCClassifier, self).__init__() 17 | self.interval = interval 18 | self.tag = 'TMC' 19 | self.selected_features = ['timestamp', 'size', 'address_src', 'address_dst', 'tcp_dstport', 'ssl_ciphersuite', 20 | 'udp_dstport', 'dns_query_name', 'ntp'] 21 | self._mnb1, self._mnb2, self._mnb3 = None, None, None 22 | self._reverse_c, self._reverse_r, self._reverse_d = None, None, None 23 | 24 | @staticmethod 25 | def _get_sample(counter, dataset): 26 | for addr, c in counter.items(): 27 | if c['v']: 28 | feature = { 29 | 'dns': c['dns'], 30 | 'rp': c['rp'], 31 | 'cs': c['cs'], 32 | 'flow_volume': c['v'], 33 | 'flow_duration': c['last-packet'] - c['first-packet'], 34 | 'sleep_time': c['t'] 35 | } 36 | if feature['flow_duration']: 37 | feature['flow_radio'] = feature['flow_volume'] / feature['flow_duration'] 38 | else: 39 | feature['flow_radio'] = 0 40 | if len(c['dt']) == 0: 41 | feature['dns_interval'] = -1 42 | elif len(c['dt']) == 1: 43 | feature['dns_interval'] = 0 44 | else: 45 | s = 0 46 | for i in range(1, len(c['dt'])): 47 | s += c['dt'][i] - c['dt'][i - 1] 48 | feature['dns_interval'] = s / (len(c['dt']) - 1) 49 | if len(c['nt']) == 0: 50 | feature['ntp_interval'] = -1 51 | elif len(c['nt']) == 1: 52 | feature['ntp_interval'] = 0 53 | else: 54 | s = 0 55 | for i in range(1, len(c['nt'])): 56 | s += c['nt'][i] - c['nt'][i - 1] 57 | feature['ntp_interval'] = s / (len(c['nt']) - 1) 58 | dataset[addr].append(feature) 59 | 60 | def get_dataset(self, raw_dataset, generator): 61 | def _get_feature(v, address, packet_time, packet_size): 62 | if 'first-packet' not in v[address]: 63 | v[address]['first-packet'] = packet_time 64 | if v[address]['last-packet'] and packet_time - v[address]['last-packet'] > v[address]['t']: 65 | v[address]['t'] = packet_time - v[address]['last-packet'] 66 | v[address]['last-packet'] = packet_time 67 | v[address]['v'] += packet_size 68 | 69 | counter = { 70 | address: {'dns': [], 'rp': [], 'cs': [], 'v': 0, 't': 0, 'dt': [], 'nt': [], 'last-packet': 0} 71 | for address in raw_dataset.device_list.values() 72 | } 73 | dataset = {address: [] for address in raw_dataset.device_list.values()} 74 | instance_index = 0 75 | for t, size, addr_src, addr_dst, tcp_dstport, ciphersuite, udp_dstport, dns, ntp in generator: 76 | t, size = float(t), int(size) 77 | if instance_index and t // self.interval != instance_index: 78 | self._get_sample(counter, dataset) 79 | instance_index = t // self.interval 80 | counter = { 81 | address: {'dns': [], 'rp': [], 'cs': [], 'v': 0, 't': 0, 'dt': [], 'nt': [], 'last-packet': 0} 82 | for address in raw_dataset.device_list.values() 83 | } 84 | if not instance_index: 85 | instance_index = t // self.interval 86 | if addr_src in counter.keys(): 87 | _get_feature(counter, addr_src, t, size) 88 | if dns: 89 | for dns_i in dns.split(','): 90 | counter[addr_src]['dns'].append(dns_i) 91 | counter[addr_src]['dt'].append(t) 92 | if ntp: 93 | counter[addr_src]['nt'].append(t) 94 | if ciphersuite: 95 | cipher_suite = ciphersuite.split(',') 96 | for cs in cipher_suite: 97 | for cs_i in cs.split(';'): 98 | counter[addr_src]['cs'].append(int(cs_i)) 99 | if tcp_dstport: 100 | counter[addr_src]['rp'].append(int(tcp_dstport)) 101 | if udp_dstport: 102 | counter[addr_src]['rp'].append(int(udp_dstport)) 103 | if addr_dst in counter.keys(): 104 | _get_feature(counter, addr_dst, t, size) 105 | self._get_sample(counter, dataset) 106 | bidirectional_non_iot = [] 107 | for mac in 
raw_dataset.non_iot_list.values():
108 | bidirectional_non_iot.extend(dataset[mac])
109 | dataset.pop(mac)
110 | dataset['non-iot'] = bidirectional_non_iot
111 | if not self._mnb1:
112 | self._train_preprocessor(dataset, raw_dataset)
113 | return self._get_final_dataset(dataset, raw_dataset)
114 | 
115 | def _get_frequency_features(self, raw_feature):
116 | x_dns, x_rp, x_cs = [0] * len(self._reverse_d), [0] * len(self._reverse_r), [0] * len(self._reverse_c)
117 | for domain_name in raw_feature['dns']:
118 | if domain_name in self._reverse_d:
119 | x_dns[self._reverse_d[domain_name]] += 1
120 | for port in raw_feature['rp']:
121 | if port in self._reverse_r:
122 | x_rp[self._reverse_r[port]] += 1
123 | for cipher_suite in raw_feature['cs']:
124 | if cipher_suite in self._reverse_c:
125 | x_cs[self._reverse_c[cipher_suite]] += 1
126 | return x_dns, x_rp, x_cs
127 | 
128 | def _train_preprocessor(self, train_set, dataset):
129 | dns_dictionary = set()
130 | rp_dictionary = set()
131 | cs_dictionary = set()
132 | for addr, features in train_set.items():
133 | for feature in features:
134 | for dns in feature['dns']:
135 | dns_dictionary.add(dns)
136 | for rp in feature['rp']:
137 | rp_dictionary.add(rp)
138 | for cs in feature['cs']:
139 | cs_dictionary.add(cs)
140 | d_d = {i: v for i, v in enumerate(dns_dictionary)}
141 | r_d = {i: v for i, v in enumerate(rp_dictionary)}
142 | c_d = {i: v for i, v in enumerate(cs_dictionary)}
143 | self._reverse_d = {v: k for k, v in d_d.items()}
144 | self._reverse_r = {v: k for k, v in r_d.items()}
145 | self._reverse_c = {v: k for k, v in c_d.items()}
146 | x_d, x_r, x_c, y_d, y_r, y_c = [], [], [], [], [], []
147 | 
148 | for addr, features in train_set.items():
149 | for feature in features:
150 | x_d_i, x_r_i, x_c_i = self._get_frequency_features(feature)
151 | x_d.append(x_d_i)
152 | x_c.append(x_c_i)
153 | x_r.append(x_r_i)
154 | y_d.append(dataset.label_map[addr])
155 | y_r.append(dataset.label_map[addr])
156 | y_c.append(dataset.label_map[addr])
157 | x_d, x_r, x_c = np.array(x_d), np.array(x_r), np.array(x_c)
158 | y_d, y_c, y_r = np.array(y_d), np.array(y_c), np.array(y_r)
159 | mnb1, mnb2, mnb3 = MultinomialNB(), MultinomialNB(), MultinomialNB()
160 | mnb1.fit(x_d, y_d)
161 | mnb2.fit(x_r, y_r)
162 | if x_c.size:  # x_c may be empty when no cipher suites were observed; .size avoids NumPy's ambiguous truth value
163 | mnb3.fit(x_c, y_c)
164 | else:
165 | mnb3 = None
166 | self._mnb1, self._mnb2, self._mnb3 = mnb1, mnb2, mnb3
167 | 
168 | def _get_final_dataset(self, original_dataset, raw_dataset):
169 | final_features = {addr: [] for addr in raw_dataset.addr_device_map.keys()}
170 | for addr, features in original_dataset.items():
171 | for feature in features:
172 | feature_vector = [feature['flow_volume'], feature['flow_duration'], feature['flow_radio'],
173 | feature['sleep_time'], feature['dns_interval'], feature['ntp_interval']]
174 | x_d_i, x_r_i, x_c_i = self._get_frequency_features(feature)
175 | y_d_i = self._mnb1.predict(np.array([x_d_i]))
176 | y_d_i_p = self._mnb1.predict_proba(np.array([x_d_i]))
177 | y_r_i = self._mnb2.predict(np.array([x_r_i]))
178 | y_r_i_p = self._mnb2.predict_proba(np.array([x_r_i]))
179 | 
180 | feature_vector.append(y_d_i[0])
181 | feature_vector.extend(list(y_d_i_p[0]))
182 | feature_vector.append(y_r_i[0])
183 | feature_vector.extend(list(y_r_i_p[0]))
184 | 
185 | if self._mnb3:
186 | y_c_i = self._mnb3.predict(np.array([x_c_i]))
187 | y_c_i_p = self._mnb3.predict_proba(np.array([x_c_i]))
188 | feature_vector.append(y_c_i[0])
189 | feature_vector.extend(list(y_c_i_p[0]))
190 | 
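# each final sample: six flow statistics followed by the stage-0 predicted labels and class-probability vectors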
final_features[addr].append(feature_vector) 191 | return final_features 192 | 193 | def train_model(self, dataset, training_set_archive): 194 | x_train, y_train = self.get_training_dataset(dataset, training_set_archive) 195 | rf = RandomForestClassifier(n_estimators=100) 196 | rf.fit(x_train, y_train) 197 | self.model = rf 198 | 199 | -------------------------------------------------------------------------------- /dataset/private.py: -------------------------------------------------------------------------------- 1 | from dataset.utils import get_entropy_feature, generate_feature 2 | 3 | import os 4 | 5 | 6 | class PrivateDataset(object): 7 | DEVICE_IOT_LIST = ''' 8 | YeeLinkLight 50:ec:50:7a:b3:d0 9 | MijiaCamera 5c:e5:0c:a9:71:fa 10 | MiDoorbell2L d4:d2:d6:4e:9f:d5 11 | ChuangMiPlug 44:23:7c:57:c9:94 12 | DMakerFan 64:90:c1:b5:76:bb 13 | DreamerHumidifier 64:90:c1:db:a9:f3 14 | MIAISoundBox ec:fa:5c:0a:f8:a3 15 | ZHIMI-AirPurifier 5c:e5:0c:ba:c4:ff 16 | HuaweiSmartPlug cc:50:e3:dc:f8:d7 17 | TP-LinkCamera 80:ea:07:aa:40:3b 18 | HuaweiSmartCamera 48:46:c1:51:b6:46 19 | WizLight a8:bb:50:22:d2:55 20 | BloodPressureMeasure d0:49:00:47:5e:90 21 | AISpeakerMini 48:3f:e9:8d:4d:e5 22 | DataFrame 00:e0:4c:8e:c9:52 23 | TuyaSmartPlug c4:4f:33:99:ab:a2 24 | AquraGateway 54:ef:44:cb:1e:38 25 | '''.split() 26 | 27 | # WaterLoggingSensor 28 | # BodySensor 29 | # Temperature&HumiditySensor 30 | # SmokeAlarm 31 | # VibrationSensor 32 | # Door&WindowSensor 33 | # CubeController 34 | 35 | def __init__(self): 36 | self.iot_list = {} 37 | for i in range(0, len(PrivateDataset.DEVICE_IOT_LIST), 2): 38 | self.iot_list[PrivateDataset.DEVICE_IOT_LIST[i]] = PrivateDataset.DEVICE_IOT_LIST[i + 1] 39 | self.non_iot_list = {} 40 | self.device_list = self.iot_list 41 | self.addr_device_map = {v: k for k, v in self.iot_list.items()} 42 | self.label_map = {addr: i for i, addr in enumerate(self.addr_device_map.keys())} 43 | self._feature_map = {'address_src': 'eth_src', 'address_dst': 'eth_dst'} 44 | self.month = [11] * 13 + [12] * 31 45 | self.date = [17, 18] + list(range(20, 31)) + list(range(1, 32)) 46 | self.default_training_range = { 47 | 'train': {'month': self.month[:22], 'date': self.date[:22]}, 48 | 'test': {'month': self.month[22:], 'date': self.date[22:]} 49 | } 50 | self.feature_list = ['index', 'timestamp', 'size', 'eth_src', 'eth_dst', 51 | 'eth_type', 'ip_src', 'ip_dst', 'ip_proto', 'ip_opt_padding', 52 | 'ip_opt_ra', 'tcp_srcport', 'tcp_dstport', 'tcp_stream', 'tcp_window_size', 'tcp_len', 53 | 'ssl_ciphersuite', 'udp_srcport', 'udp_dstport', 'udp_stream', 'dns_query_name', 'http', 54 | 'ntp'] 55 | 56 | def run_tshark(self): 57 | base_dir = os.getcwd() 58 | command = 'tshark -r {}/silent-test/pcap/{}.pcap -T fields -E separator=$ -e frame.number -e frame.time_epoch '\ 59 | '-e frame.len -e eth.src -e eth.dst ' \ 60 | '-e eth.type -e ip.src -e ip.dst -e ip.proto -e ip.opt.padding -e ip.opt.ra -e tcp.srcport -e ' \ 61 | 'tcp.dstport -e tcp.stream -e tcp.window_size -e tcp.len -e ssl.handshake.ciphersuite -e ' \ 62 | 'udp.srcport -e udp.dstport -e udp.stream -e dns.qry.name -e http -e ntp >{}/silent-test/csv/{}.csv' 63 | for m, d in zip(self.month, self.date): 64 | file_name = '2020{:02d}{:02d}'.format(m, d) 65 | print(command.format(base_dir, file_name, base_dir, file_name)) 66 | os.system(command.format(base_dir, file_name, base_dir, file_name)) 67 | 68 | def get_entropy_feature(self): 69 | for m, d in zip(self.month, self.date): 70 | file_name = '2020{:02d}{:02d}'.format(m, d) 71 | pcap_file = 
'./silent-test/pcap/{}.pcap'.format(file_name) 72 | output_file = open('./silent-test/entropy/{}.csv'.format(file_name), 'w') 73 | get_entropy_feature(pcap_file, output_file) 74 | 75 | def data_generator(self, month, date, features): 76 | if len(month) != len(date): 77 | raise ValueError("invalid parameter: len(month) != len(date)") 78 | for m, d in zip(month, date): 79 | feature_path = './silent-test/csv/2020{:02d}{:02d}.csv'.format(m, d) 80 | f = open(feature_path, 'r') 81 | yield from generate_feature(f, self.feature_list, features, self._feature_map) 82 | print('finish reading {}-{}'.format(m, d)) 83 | 84 | 85 | -------------------------------------------------------------------------------- /dataset/unsw.py: -------------------------------------------------------------------------------- 1 | from dataset.utils import get_entropy_feature, generate_feature 2 | 3 | import os 4 | 5 | 6 | class UNSWDataset(object): 7 | DEVICE_IOT_LIST = ''' 8 | SmartThings d0:52:a8:00:67:5e 9 | AmazonEcho 44:65:0d:56:cc:d3 10 | NetatmoWelcome 70:ee:50:18:34:43 11 | TP-LinkDayNightCloudCamera f4:f2:6d:93:51:f1 12 | SamsungSmartCam 00:16:6c:ab:6b:88 13 | Dropcam 30:8c:fb:2f:e4:b2 14 | InsteonCamera 00:62:6e:51:27:2e 15 | Unknown e8:ab:fa:19:de:4f 16 | WithingsSmartBabyMonitor 00:24:e4:11:18:a8 17 | BelkinWemoSwitch ec:1a:59:79:f4:89 18 | TP-LinkSmartPlug 50:c7:bf:00:56:39 19 | iHome 74:c6:3b:29:d7:1d 20 | BelkinWemoMotionSensor ec:1a:59:83:28:11 21 | NESTProtectSmokeAlarm 18:b4:30:25:be:e4 22 | NetatmoWeatherStation 70:ee:50:03:b8:ac 23 | WithingsSmartScale 00:24:e4:1b:6f:96 24 | BlipcareBloodPressureMeter 74:6a:89:00:2e:25 25 | WithingsAuraSmartSleepSensor 00:24:e4:20:28:c6 26 | LightBulbsLiFXSmartBulb d0:73:d5:01:83:08 27 | TribySpeaker 18:b7:9e:02:20:44 28 | PIX-STARPhoto-frame e0:76:d0:33:bb:85 29 | HPPrinter 70:5a:0f:e4:9b:c0 30 | '''.split() 31 | 32 | DEVICE_NONIOT_LIST = ''' 33 | SamsungGalaxyTab 08:21:ef:3b:fc:e3 34 | AndroidPhone 40:f3:08:ff:1e:da 35 | Laptop 74:2f:68:81:69:42 36 | MacBook ac:bc:32:d4:6f:2f 37 | AndroidPhone b4:ce:f6:a7:a3:c2 38 | IPhone d0:a6:37:df:a1:e1 39 | MacBook/Iphone f4:5c:89:93:cc:85 40 | '''.split() 41 | 42 | # TPLinkRouterBridgeLAN(Gateway) 14:cc:20:51:33:ea 43 | # NestDropcam 30:8c:fb:b6:ea:45 44 | 45 | def __init__(self): 46 | self.device_list, self.iot_list, self.non_iot_list = {}, {}, {} 47 | for i in range(0, len(UNSWDataset.DEVICE_IOT_LIST), 2): 48 | self.iot_list[UNSWDataset.DEVICE_IOT_LIST[i]] = UNSWDataset.DEVICE_IOT_LIST[i + 1] 49 | self.device_list[UNSWDataset.DEVICE_IOT_LIST[i]] = UNSWDataset.DEVICE_IOT_LIST[i + 1] 50 | for i in range(0, len(UNSWDataset.DEVICE_NONIOT_LIST), 2): 51 | self.non_iot_list[UNSWDataset.DEVICE_NONIOT_LIST[i]] = \ 52 | UNSWDataset.DEVICE_NONIOT_LIST[i + 1] 53 | self.device_list[UNSWDataset.DEVICE_NONIOT_LIST[i]] = \ 54 | UNSWDataset.DEVICE_NONIOT_LIST[i + 1] 55 | self.addr_device_map = {v: k for k, v in self.iot_list.items()} 56 | self.addr_device_map['non-iot'] = 'non-iot' 57 | self.label_map = {addr: i for i, addr in enumerate(self.addr_device_map.keys())} 58 | self.month = [9] * 8 + [10] * 12 59 | self.date = list(range(23, 31)) + list(range(1, 13)) 60 | self.feature_list = ['index', 'timestamp', 'size', 'eth_src', 'eth_dst', 61 | 'eth_type', 'ip_src', 'ip_dst', 'ip_proto', 'ip_opt_padding', 62 | 'ip_opt_ra', 'tcp_srcport', 'tcp_dstport', 'tcp_stream', 'tcp_window_size', 'tcp_len', 63 | 'ssl_ciphersuite', 'udp_srcport', 'udp_dstport', 'udp_stream', 'dns_query_name', 'http', 64 | 'ntp'] 65 | self._feature_map = {'address_src': 
'eth_src', 'address_dst': 'eth_dst'}
66 | self.default_training_range = {
67 | 'train': {'month': self.month[:10], 'date': self.date[:10]},
68 | 'test': {'month': self.month[10:], 'date': self.date[10:]}
69 | }
70 | 
71 | def run_tshark(self):
72 | base_dir = os.getcwd()
73 | command = 'tshark -r {}/UNSWData/pcap-raw/{}.pcap -T fields -E separator=$ -e frame.number '\
74 | '-e frame.time_epoch -e frame.len -e eth.src -e eth.dst ' \
75 | '-e eth.type -e ip.src -e ip.dst -e ip.proto -e ip.opt.padding -e ip.opt.ra -e tcp.srcport -e ' \
76 | 'tcp.dstport -e tcp.stream -e tcp.window_size -e tcp.len -e ssl.handshake.ciphersuite -e ' \
77 | 'udp.srcport -e udp.dstport -e udp.stream -e dns.qry.name -e http -e ntp >{}/UNSWData/features/{}.csv'
78 | for m, d in zip(self.month, self.date):
79 | file_name = '16-{:02d}-{:02d}'.format(m, d)
80 | os.system(command.format(base_dir, file_name, base_dir, file_name))
81 | 
82 | def get_entropy_feature(self):
83 | for m, d in zip(self.month, self.date):
84 | file_name = '16-{:02d}-{:02d}'.format(m, d)
85 | pcap_file = './pcap-raw/{}.pcap'.format(file_name)
86 | output_file = open('./entropy/{}.csv'.format(file_name), 'w')
87 | get_entropy_feature(pcap_file, output_file)
88 | 
89 | def data_generator(self, month, date, features):
90 | if len(month) != len(date):
91 | raise ValueError("invalid parameter: len(month) != len(date)")
92 | for m, d in zip(month, date):
93 | feature_path = './UNSWData/features/16-{:02d}-{:02d}.csv'.format(m, d)
94 | f = open(feature_path, 'r')
95 | yield from generate_feature(f, self.feature_list, features, self._feature_map)
96 | print('finish reading {}-{}'.format(m, d))
97 | 
-------------------------------------------------------------------------------- /dataset/utils.py: --------------------------------------------------------------------------------
1 | from scapy.all import *
2 | from scapy.utils import PcapReader
3 | import math
4 | 
5 | def get_entropy_feature(pcap_file, output_file):
6 | def calculate_entropy(packet_payload):
7 | byte_map = {}
8 | for byte in packet_payload:
9 | byte_map[byte] = byte_map.get(byte, 0) + 1
10 | p = {k: v / len(packet_payload) for k, v in byte_map.items()}
11 | e = 0
12 | for p_i in p.values():
13 | if p_i:
14 | e += -p_i * math.log(p_i, 256)
15 | return e
16 | 
17 | with PcapReader(pcap_file) as pcapfile:
18 | for index, packet in enumerate(pcapfile):
19 | if IP in packet:
20 | if TCP in packet[IP]:
21 | payload = bytes(packet[IP][TCP].payload)
22 | entropy = calculate_entropy(payload)
23 | output_file.write('{},{}\n'.format(index, entropy))
24 | elif UDP in packet[IP]:
25 | payload = bytes(packet[IP][UDP].payload)
26 | entropy = calculate_entropy(payload)
27 | output_file.write('{},{}\n'.format(index, entropy))
28 | else:
29 | output_file.write('{},\n'.format(index))
30 | else:
31 | output_file.write('{},\n'.format(index))
32 | output_file.close()
33 | 
34 | 
35 | def generate_feature(file, feature_list, selected_feature, alias):
36 | line = file.readline()
37 | while line:
38 | fields = line.rstrip('\n').split('$')  # strip the newline so the last field (e.g. ntp) is '' when absent
39 | feature_vector = tuple(
40 | fields[feature_list.index(alias.get(feature_name, feature_name))] for feature_name in selected_feature)
41 | yield feature_vector
42 | line = file.readline()
43 | file.close()
44 | 
-------------------------------------------------------------------------------- /dataset/yourthings.py: --------------------------------------------------------------------------------
1 | import os
2 | 
3 | from dataset.utils import generate_feature
4 | 
5 | 
6 | class YourthingsDataset(object):
7 | 
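# Device name -> LAN IP address: the Yourthings capture identifies devices by IP rather than MAC, so _feature_map below points address_src/address_dst at ip_src/ip_dst.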
DEVICE_IOT_LIST = ''' 8 | GoogleOnHub 192.168.0.2 9 | SamsungSmartThingsHub 192.168.0.4 10 | PhilipsHUEHub 192.168.0.5 11 | InsteonHub 192.168.0.6 12 | Sonos 192.168.0.7 13 | SecurifiAlmond 192.168.0.8 14 | NestCamera 192.168.0.10 15 | BelkinWeMoMotionSensor 192.168.0.12 16 | LIFXVirtualBulb 192.168.0.13 17 | BelkinWeMoSwitch 192.168.0.14 18 | AmazonEcho 192.168.0.15 19 | WinkHub 192.168.0.16 20 | BelkinNetcam 192.168.0.18 21 | RingDoorbell 192.168.0.19 22 | RokuTV 192.168.0.21 23 | Roku4 192.168.0.22 24 | AmazonFireTV 192.168.0.23 25 | nVidiaShield 192.168.0.24 26 | AppleTV(4thGen) 192.168.0.25 27 | BelkinWeMoLink 192.168.0.26 28 | NetgearArloCamera 192.168.0.27 29 | D-LinkDCS-5009LCamera 192.168.0.28 30 | LogitechLogiCircle 192.168.0.29 31 | Canary 192.168.0.30 32 | PiperNV 192.168.0.31 33 | WithingsHome 192.168.0.32 34 | WeMoCrockpot 192.168.0.33 35 | MiCasaVerdeVeraLite 192.168.0.34 36 | ChineseWebcam 192.168.0.35 37 | AugustDoorbellCam 192.168.0.36 38 | TP-LinkWiFiPlug 192.168.0.37 39 | ChamberlainmyQGarageOpener 192.168.0.38 40 | LogitechHarmonyHub 192.168.0.39 41 | CasetaWirelessHub 192.168.0.41 42 | GoogleHomeMini 192.168.0.42 43 | GoogleHome 192.168.0.43 44 | BoseSoundTouch10 192.168.0.44 45 | HarmonKardonInvoke 192.168.0.45 46 | AppleHomePod 192.168.0.47 47 | Roomba 192.168.0.48 48 | SamsungSmartTV 192.168.0.49 49 | KoogeekLightbulb 192.168.0.50 50 | TP-LinkSmartWiFiLEDBulb 192.168.0.51 51 | Wink2Hub 192.168.0.52 52 | NestCamIQ 192.168.0.53 53 | NestGuard 192.168.0.54 54 | '''.split() 55 | 56 | def __init__(self): 57 | self.iot_list = {} 58 | for i in range(0, len(YourthingsDataset.DEVICE_IOT_LIST), 2): 59 | self.iot_list[YourthingsDataset.DEVICE_IOT_LIST[i]] = YourthingsDataset.DEVICE_IOT_LIST[i + 1] 60 | self.addr_device_map = {v: k for k, v in self.iot_list.items()} 61 | self.non_iot_list = {} 62 | self.device_list = self.iot_list 63 | self.label_map = {addr: i for i, addr in enumerate(self.addr_device_map.keys())} 64 | self.dates = list(range(10, 20)) 65 | self.feature_list = ['index', 'timestamp', 'size', 'eth_type', 'ip_src', 'ip_dst', 'ip_proto', 'ip_opt_padding', 66 | 'ip_opt_ra', 'tcp_srcport', 'tcp_dstport', 'tcp_stream', 'tcp_window_size', 'tcp_len', 67 | 'ssl_ciphersuite', 'udp_srcport', 'udp_dstport', 'udp_stream', 'dns_query_name', 'http', 68 | 'ntp'] 69 | self._feature_map = {'address_src': 'ip_src', 'address_dst': 'ip_dst'} 70 | 71 | def run_tshark(self): 72 | base_dir = os.getcwd() 73 | command = 'tshark -r {}/Yourthings/pcap-raw/{}.pcap -T fields -E separator=$ -e frame.number -e ' \ 74 | 'frame.time_epoch -e frame.len ' \ 75 | '-e eth.type -e ip.src -e ip.dst -e ip.proto -e ip.opt.padding -e ip.opt.ra -e tcp.srcport -e ' \ 76 | 'tcp.dstport -e tcp.stream -e tcp.window_size -e tcp.len -e ssl.handshake.ciphersuite -e ' \ 77 | 'udp.srcport -e udp.dstport -e udp.stream -e dns.qry.name -e http -e ntp ' \ 78 | '>{}/Yourthings/features/10-{}.csv ' 79 | for d in self.dates: 80 | file_name = '{:02}'.format(d) 81 | os.system(command.format(base_dir, file_name, base_dir, file_name)) 82 | 83 | def data_generator(self, dates, features): 84 | file_path = './Yourthings/features/10-{}.csv' 85 | for date in dates: 86 | file_name = file_path.format(date) 87 | csv_file = open(file_name, 'r') 88 | yield from generate_feature(csv_file, self.feature_list, features, self._feature_map) 89 | print('finish reading {}'.format(file_name)) 90 | 91 | -------------------------------------------------------------------------------- /main.py: 
--------------------------------------------------------------------------------
1 | from dataset.private import PrivateDataset
2 | from classifier import byteiot, tmc, big_data, audi
3 | 
4 | 
5 | def preprocess_private_dataset():
6 | private_dataset = PrivateDataset()
7 | private_dataset.run_tshark()
8 | private_dataset.get_entropy_feature()
9 | 
10 | 
11 | if __name__ == '__main__':
12 | preprocess_private_dataset()
13 | byteiot_classifier = byteiot.ByteIoTClassifier()
14 | byteiot_classifier.get_archived_dataset('Private')
15 | byteiot_classifier.train_on_private_dataset()
16 | print('training phase completed')
17 | 
18 | byteiot_classifier.test(PrivateDataset(), './byteiot-test.pkl')
19 | print('test phase completed')
20 | 
--------------------------------------------------------------------------------
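The same flow applies to the open datasets. A minimal sketch for UNSW (assuming the pcap files have been placed under `./UNSWData/pcap-raw/`; the variable names are illustrative):

```python
# Sketch, not part of the repository: run the TMC pipeline on the UNSW dataset.
from dataset.unsw import UNSWDataset
from classifier import tmc

unsw_dataset = UNSWDataset()
unsw_dataset.run_tshark()  # converts the pcaps to ./UNSWData/features/16-MM-DD.csv

tmc_classifier = tmc.TMCClassifier()
tmc_classifier.get_archived_dataset('UNSW')  # archives TMC-train.pkl / TMC-test.pkl
tmc_classifier.train_on_unsw_dataset()
tmc_classifier.test(UNSWDataset(), './TMC-test.pkl')
```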