├── Architecture and Preprocessing ├── CBS-Architecture.svg └── CBS-Preprocessing.svg ├── Bi-LSTM_Traffic_classification .py ├── Bi-LSTM_build_model.py ├── Break_CSV_File.py ├── Break_Data_File.py ├── Dataset ├── ISCX VPN-NonVPN 2016.md └── dataset-more description1.md ├── FC-traffic-classification.py ├── ISCX-Analysis ├── README.md └── session features.py ├── README.md ├── SAE_Traffic_classification.py ├── SAE_build_model.py ├── cnn_Traffic_classification.py ├── cnn_build_model.py ├── compare-accuracy-code.py ├── defin_1D-CNN_model_params.py ├── define_Bi-LSTM_model_params.py ├── define_FC_model_params.py ├── define_GAN_model.py ├── define_SAE_model_params.py ├── define_autoencoder.py ├── extract_header_payload_packets.py ├── fc_build_model.py ├── gausian-compare-accuracy-code.py ├── gausian-validation-train-acc.py ├── gausian-validation-training-loss.py ├── histogram_Dataset.py ├── ip_masking.py ├── load_pcap_datatype.py ├── main.py ├── memory usage-execution time.py ├── metric-evaluation.py ├── network_parameters_initializer.py ├── packet_normalization.py ├── packet_zero_pading.py ├── plot_heatmap_result.py ├── pmf_Dataset.py ├── preprocessing-traffic-label.py ├── print_summary.py ├── process_metadata_pcap.py ├── process_pcap.py ├── read_pcap_files.py ├── transform_pcap_to_dataframe.py └── var_function_initializer.py /Bi-LSTM_Traffic_classification .py: -------------------------------------------------------------------------------- 1 | # in this function we build CNN for Traffic Classification 2 | from memory usage-execution time import measure_execution_memory 3 | from BiLSTM_build_model import BiLSTM_build_model 4 | @measure_execution_memory 5 | def Bi-LSTM_Traffic_classification(root_normalized_dir,net_parameters,model_params): 6 | df_normalized = pd.DataFrame(columns=['packet_normalized_data', 'class_label']) 7 | df_train = pd.DataFrame(columns=['packet_normalized_data']) 8 | binary = "{0:08b}".format(int("1a", 16)) 9 | col_list = ['packet_normalized_data', 'class_label'] 10 | 11 | # list out keys and values separately 12 | key_list = list(net_parameters.keys()) 13 | val_list = list(net_parameters.values()) 14 | 15 | # list out keys and values separately 16 | key_list1 = list(model_params.keys()) 17 | val_list1 = list(model_params.values()) 18 | DENSE_LAYER = val_list1[key_list1.index("dense_neurons")] 19 | # network parameters 20 | BATCH_SIZE = val_list[key_list.index("BATCH_SIZE")] 21 | EPOCH = val_list[key_list.index("EPOCH")] 22 | VERBOSE = val_list[key_list.index("VERBOSE")] 23 | #OPTIMIZER = tf.keras.optimizers.Adam() 24 | VALIDATION_SPLIT = val_list[key_list.index("VALIDATION_SPLIT")] 25 | #NUM_CLASSES = val_list[key_list.index("NUM_CLASSES")] 26 | OPTIMIZER = val_list[key_list.index("OPTIMIZER")] 27 | LOSS_FUNCTION = val_list[key_list.index("LOSS_FUNCTION")] 28 | METRICS = val_list[key_list.index("METRICS")[0], key_list.index("METRICS")[1](),key_list.index("METRICS")[2](),key_list.index("METRICS")[3]()] 29 | DROPOUT = val_list[key_list.index("DROPOUT")] 30 | 31 | for path in os.listdir(root_normalized_dir): 32 | full_path = os.path.join(root_normalized_dir, path) 33 | df = pd.read_csv(full_path, usecols=col_list) 34 | model = Bi-LSTM_build_model(model_params) 35 | model.compile(loss=LOSS_FUNCTION, 36 | optimizer=OPTIMIZER, 37 | metrics=METRICS) 38 | model.summary() 39 | print("this is running Bi-LSTM model: ") 40 | 41 | # train on model 42 | X = df.iloc[:,0:1] # Data 43 | Y = df.iloc[:,1:2] # Label 44 | X_train, X_test, y_train, y_test = train_test_split(X, Y, 
test_size=0.2, random_state=42) 45 | # prepare label of packet for deep NN 46 | train_label_data_list = [] 47 | test_label_data_list = [] 48 | pkt_train_label_data = np.zeros([len(y_train), 1]) 49 | pkt_test_label_data = np.zeros([len(y_test), 1]) 50 | for i in range(len(y_train)): 51 | pkt_train_label_data[i,0] = y_train.iloc[i,0] 52 | #train_label_data_list.append(pkt_train_label_data) 53 | train_label = np.array(pkt_train_label_data) 54 | train_label = train_label[:,0] 55 | train_label = train_label.astype(np.int) 56 | for i in range(len(y_test)): 57 | pkt_test_label_data[i,0] = y_test.iloc[i,0] 58 | #test_label_data_list.append(pkt_test_label_data) 59 | test_label = np.array(pkt_test_label_data) 60 | test_label = test_label[:, 0] 61 | test_label = test_label.astype(np.int) 62 | 63 | #y_train = y_train.to_numpy() 64 | #y_train = y_train.T 65 | # To create a x-by-y-by-z 3D list with initial values: 66 | 67 | data_list = [] 68 | test_list = [] 69 | 70 | pkt_data = np.zeros([len(X_train.iloc[0, 0].split(',')), 1]) 71 | for i in range(len(X_train)): 72 | print("trian preparing data i {}".format(i)) 73 | pkt_train_data = np.zeros([len(X_train.iloc[0, 0].split(',')), 1]) 74 | temp_train_list = X_train.iloc[i, 0].split(',')[:] 75 | for j in range(len(temp_train_list)): 76 | #print("test preparing data j {}".format(j)) 77 | pkt_train_data[j,0] = float(temp_train_list[j]) 78 | data_list.append(pkt_train_data) 79 | 80 | train_data = np.array(data_list) 81 | for i in range(len(X_test)): 82 | print("test preparing data i {}".format(i)) 83 | pkt_test_data = np.zeros([len(X_test.iloc[0, 0].split(',')), 1]) 84 | temp_test_list = X_test.iloc[i, 0].split(',')[:] 85 | for j in range(len(temp_test_list)): 86 | #print("test preparing data j {}".format(j)) 87 | pkt_test_data[j,0] = float(temp_test_list[j]) 88 | test_list.append(pkt_test_data) 89 | test_data = np.array(test_list) 90 | # convert class vectors to binary class matrices 91 | train_label = tf.keras.utils.to_categorical(train_label, NUM_CLASSES) 92 | test_label = tf.keras.utils.to_categorical(test_label, NUM_CLASSES) 93 | 94 | train_data = train_data.reshape((len(train_data), len(X_train.iloc[0,0].split(',')), 1, 1)) 95 | test_data = test_data.reshape((len(test_data), len(X_test.iloc[0, 0].split(',')), 1, 1)) 96 | #model = create_1dcnn_model() 97 | 98 | 99 | model.fit(train_data, train_label, batch_size=BATCH_SIZE, 100 | epochs=EPOCH,verbose=VERBOSE,validation_split= VALIDATION_SPLIT ) 101 | #score = model.evaluate(test_data, test_label, 102 | # batch_size=BATCH_SIZE) 103 | (loss, accuracy, f1_score, precision, recall) = model.evaluate(test_data, test_label, 104 | batch_size=BATCH_SIZE) 105 | score = [] 106 | score[0] = loss 107 | score[1]= accuracy 108 | score[2] = f1_score 109 | score[3] = precision 110 | score[4] = recall 111 | 112 | print("\nTest loss:", score[0]) 113 | print('Test accuracy:', score[1]) 114 | print('Test f1_score:', score[2]) 115 | print('Test precision:', score[3]) 116 | print('Test recall:', score[4]) 117 | saved_models = [] 118 | saved_weights = [] 119 | save_model_weights_dir = 'media/mehdi/linux/normalized_data/' 120 | # save model architecture 121 | model.save(save_model_weights_dir + 'model_architecture_bilstm.h5') 122 | saved_models.append('model_architecture_bilstm.h5') 123 | # save model weights 124 | model.save_weights(save_model_weights_dir + 'model_weights_bilstm.h5') 125 | saved_weights.append('model_weights_bilstm.h5') 126 | # Get the output of the last connected layer 127 | last_dense_output = 
model.layers[-len(DENSE_LAYER)].output 128 | return last_dense_output,saved_models,saved_weights,save_model_weights_dir 129 | 130 | -------------------------------------------------------------------------------- /Bi-LSTM_build_model.py: -------------------------------------------------------------------------------- 1 | def BiLSTM_build_model(params): 2 | 3 | 4 | # Define the model architecture 5 | model = tf.keras.Sequential() 6 | 7 | # Add the Bi-LSTM layers 8 | model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(params['input_shape'][1], return_sequences=True), input_shape=params['input_shape'])) 9 | model.add(tf.keras.layers.BatchNormalization()) 10 | model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(params['input_shape'][1], return_sequences=True))) 11 | model.add(tf.keras.layers.BatchNormalization()) 12 | model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(params['input_shape'][1], return_sequences=True))) 13 | model.add(tf.keras.layers.BatchNormalization()) 14 | 15 | # Add the attention layer 16 | model.add(tf.keras.layers.Attention()) 17 | 18 | # Add the fully connected layers (MLP) 19 | model.add(tf.keras.layers.Dense(params['dense_neurons'][0], activation=params['activation'])) 20 | model.add(tf.keras.layers.BatchNormalization()) 21 | model.add(tf.keras.layers.Dense(params['dense_neurons'][1], activation=params['activation'])) 22 | model.add(tf.keras.layers.BatchNormalization()) 23 | model.add(tf.keras.layers.Dense(params['dense_neurons'][2], activation=params['activation'])) 24 | model.add(tf.keras.layers.BatchNormalization()) 25 | model.add(tf.keras.layers.Dense(params['dense_neurons'][3], activation=params['activation'])) 26 | model.add(tf.keras.layers.BatchNormalization()) 27 | # Save the model 28 | model.save('bilstm_model.h5') 29 | 30 | return model 31 | -------------------------------------------------------------------------------- /Break_CSV_File.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | The code you have provided is a function called Break_CSV_File(). This function breaks a large CSV file into smaller files of a specified size. 4 | 5 | The function first creates a list to store the number of rows in each chunk. 6 | 7 | The function then iterates over the CSV file in chunks of the specified size. For each chunk, the function adds the number of rows in the chunk to the list. 8 | 9 | If the number of chunks is greater than or equal to 2, then the function breaks the CSV file into smaller files. The function creates a new file for each chunk and writes the chunk to the file. 10 | 11 | The function also removes the original CSV file. 12 | 13 | The first line defines the chunks_list variable to store the number of rows in each chunk. 14 | 15 | The next line defines the normalized_dir variable as the directory where the broken CSV files will be saved. 16 | 17 | The next line iterates over the CSV file in chunks of the specified size. For each chunk, the function adds the number of rows in the chunk to the chunks_list variable. 18 | 19 | The next line checks if the number of chunks is greater than or equal to 2. If it is, then the function breaks the CSV file into smaller files. 20 | 21 | The next few lines create a new file for each chunk and writes the chunk to the file. 22 | 23 | The last line removes the original CSV file. 
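A typical call, with purely illustrative file names (the normalized_dir value mirrors the path used elsewhere in this repository), would be:

    Break_CSV_File('media/mehdi/linux/normalized_data/email1.csv', chunk_size=50000, normalized_dir='media/mehdi/linux/normalized_data/')

If email1.csv contains more than 50,000 rows, this writes email1.csv_chunk0, email1.csv_chunk1, ... into normalized_dir and then deletes the copy of email1.csv found there, if any.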
24 | """ 25 | 26 | def Break_CSV_File(filename,chunk_size,normalized_dir): 27 | chunks_list = [] 28 | #normalized_dir = 'media/mehdi/linux/normalized_data/' 29 | for chunk in pd.read_csv(filename, iterator=True, chunksize=chunk_size): 30 | chunks_list.append(len(chunk)) 31 | if(len(chunks_list) >=2): 32 | base_filename = os.path.basename(filename) 33 | for i, chunk in enumerate(pd.read_csv(filename, chunksize=chunk_size)): 34 | chunk.to_csv(normalized_dir + base_filename +'_chunk' + '{}'.format(i), index=False) 35 | if os.path.exists(normalized_dir+os.path.basename(filename)): 36 | os.remove(normalized_dir+os.path.basename(filename)) 37 | print("file with name {} has been chunked and rermoved ".format(base_filename)) 38 | return -------------------------------------------------------------------------------- /Break_Data_File.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a function called Break_Data_File(). This function breaks a large pcap file into smaller files of a specified size. 3 | 4 | The function first creates a directory with the same name as the pcap file. If the directory already exists, then the function deletes all the files in the directory. 5 | 6 | The function then creates a tcpdump script that breaks the pcap file into smaller files of the specified size. The tcpdump script is executed using the os.system() function. 7 | 8 | The function also creates an editcap script that can be used to further break the pcap files into smaller files. However, the editcap script is not executed by the function. 9 | 10 | The first line defines the directory variable as the name of the directory that will be created to store the broken pcap files. The parent_dir variable is the parent directory of the pcap file. The path variable is the full path to the directory that will be created. 11 | 12 | The next four lines check if the directory directory exists. If it does not exist, then the function creates the directory. If the directory does exist, then the function deletes all the files in the directory. 13 | 14 | The next line defines the tcpdump_script variable as the tcpdump script that will be used to break the pcap file into smaller files. The editcap_script variable is defined as the editcap script that can be used to further break the pcap files into smaller files. 15 | 16 | The next line defines the new_filename variable as the name of the broken pcap file. The saved_break_file variable is the full path to the broken pcap file. 17 | 18 | The next line executes the tcpdump_script using the os.system() function. 19 | 20 | The last line commented out executes the editcap_script using the os.system() function. 
21 | """ 22 | 23 | def Break_Data_File(filename,chunk_size): 24 | # create a floder as name as file for breaked files 25 | # Directory 26 | directory = os.path.splitext(os.path.basename(filename))[0] 27 | # Parent Directory path 28 | parent_dir = os.path.dirname(filename) 29 | # Path 30 | path = os.path.join(parent_dir, directory) 31 | # Create the directory 32 | if(os.path.isdir(path) == False): 33 | os.mkdir(path) 34 | else: 35 | for f in os.listdir(path): 36 | os.remove(os.path.join(path, f)) 37 | print("Directory '% s' created" % directory) 38 | # in this section large file greater than chunk size will be breake 39 | tcpdump_script = "" 40 | editcap_script = "" 41 | #new_dir_filename = os.path.dirname(filename) 42 | new_filename = "breaked_" + os.path.splitext(os.path.basename(filename))[0] 43 | #saved_break_file = new_dir_filename + "/" + new_filename 44 | saved_break_file = path + "/" + new_filename 45 | tcpdump_script = "tcpdump -r " + filename + " -w " + saved_break_file + " -C " + str(chunk_size) 46 | editcap_script = "editcap -c 100000 " + filename + " " + saved_break_file 47 | os.system(tcpdump_script) 48 | # os.system(editcap_script) 49 | 50 | -------------------------------------------------------------------------------- /Dataset/ISCX VPN-NonVPN 2016.md: -------------------------------------------------------------------------------- 1 | # ISCX VPN-nonVPN Dataset (ISCXVPN2016) 2 | 3 | This dataset is sourced from the [UNB Cybersecurity Research Group](https://www.unb.ca/cic/) and contains network traffic data for research purposes. The information below is based on the dataset's official website: [ISCX VPN-nonVPN dataset](http://205.174.165.80/CICDataset/ISCX-VPN-NonVPN-2016/). 4 | 5 | ## Dataset Overview 6 | 7 | The ISCX VPN-nonVPN dataset was created to represent real-world network traffic diversity and quantity. The dataset includes accounts for users named Alice and Bob, allowing for the use of various services like Skype, Facebook, and more. It captures both regular and VPN sessions, resulting in a total of 14 traffic categories, including VOIP, P2P, and more. 8 | 9 | ## Traffic Types and Applications 10 | 11 | Here is a list of different types of traffic and applications considered in the dataset: 12 | 13 | 1. **Browsing**: HTTPS traffic generated while users browse the web or perform tasks involving a browser. 14 | 15 | 2. **Email**: Traffic samples generated using a Thunderbird client, configured for mail delivery through SMTP/S and receipt through POP3/SSL or IMAP/SSL. 16 | 17 | 3. **Chat**: Instant messaging applications, including Facebook, Hangouts (via web browsers), Skype, IAM, and ICQ (using the Pidgin application). 18 | 19 | 4. **Streaming**: Multimedia applications requiring a continuous stream of data, including YouTube and Vimeo services using Chrome and Firefox. 20 | 21 | 5. **File Transfer**: Traffic applications designed for sending or receiving files and documents. This includes Skype file transfers, FTP over SSH (SFTP), and FTP over SSL (FTPS) traffic sessions. 22 | 23 | 6. **VoIP**: Voice over IP traffic, encompassing voice calls using Facebook, Hangouts, and Skype. 24 | 25 | 7. **TraP2P**: Identification of file-sharing protocols like BitTorrent, captured by downloading .torrent files from a public repository and using uTorrent and Transmission applications. 26 | 27 | ## Dataset Details 28 | 29 | - **Data Size**: The dataset includes 28GB of captured traffic data. 
30 | 31 | - **VPN Usage**: An external VPN service provider was used for VPN sessions, connected via OpenVPN (UDP mode). 32 | 33 | - **SFTP and FTPS**: For SFTP and FTPS traffic, external service providers and Filezilla as a client were used. 34 | 35 | - **Filtering**: To simplify the labeling process, unnecessary services and applications were closed during traffic capture. Only the objective application (e.g., Skype voice call, SFTP file transfer) was active. 36 | 37 | - **Filtering by IP**: A filter was applied to capture only packets with source or destination IP addresses matching the local client's address (Alice or Bob). 38 | 39 | ## Data Processing 40 | 41 | Scapy is used to read the pcap files and create CSV files based on selected features. 42 | 43 | ## Dataset Availability 44 | 45 | The UNB ISCX Network Traffic (VPN-nonVPN) dataset is available for research purposes. It includes labeled network traffic, full packet data in pcap format, and CSV files (flows generated by ISCXFlowMeter). Researchers can access the dataset through the following link: 46 | 47 | [Dataset Download Link](http://205.174.165.80/CICDataset/ISCX-VPN-NonVPN-2016/) 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /Dataset/dataset-more description1.md: -------------------------------------------------------------------------------- 1 | # ISCX VPN Non-VPN 2016 Dataset 2 | 3 | The "ISCX VPN Non-VPN 2016 dataset" is a network traffic dataset used for research and analysis in the field of network security and intrusion detection. This dataset provides valuable insights into VPN (Virtual Private Network) and non-VPN network traffic. 4 | 5 | ## Origin and Source 6 | 7 | - The dataset was collected as part of the ISCX (Information Security Centre of Excellence) project. 8 | - It contains network traffic data captured in a controlled environment for research purposes. 9 | 10 | ## Purpose 11 | 12 | - The main purpose of this dataset is to facilitate research in the detection of VPN (Virtual Private Network) and non-VPN network traffic. 13 | - Researchers often use it to develop and evaluate intrusion detection systems and network traffic analysis techniques. 14 | 15 | ## Dataset Contents 16 | 17 | - The dataset typically consists of network traffic captures in PCAP (Packet Capture) format. 18 | - It is divided into two main categories: VPN traffic and non-VPN traffic. 19 | - Each category contains network traffic data for various network activities. 20 | 21 | ### VPN Traffic 22 | 23 | - This category includes network traffic generated by VPN connections. VPNs are commonly used for secure and private communication over the internet. 24 | - The dataset may contain VPN traffic for different VPN protocols, such as OpenVPN, PPTP, L2TP, etc. 25 | - VPN traffic often includes encrypted communication, making it challenging to analyze for security purposes. 26 | 27 | ### Non-VPN Traffic 28 | 29 | - This category includes regular network traffic that does not involve VPN connections. 30 | - It can include various types of network activities, such as web browsing, email communication, file transfers, and more. 31 | - Non-VPN traffic is often used as a baseline for comparison when detecting unusual or potentially malicious network behavior. 32 | 33 | ## Use Cases 34 | 35 | Researchers and cybersecurity professionals can use this dataset to: 36 | 37 | - Develop and evaluate intrusion detection systems (IDS) to identify VPN-based attacks or anomalies. 
38 | - Study and analyze network traffic patterns for both VPN and non-VPN scenarios. 39 | - Improve network security by identifying potentially malicious VPN traffic. 40 | 41 | ## Challenges 42 | 43 | - Analyzing VPN traffic can be challenging due to encryption, making it difficult to inspect the content of packets. 44 | - Distinguishing between legitimate VPN usage and malicious activities is a common challenge in network security research. 45 | 46 | ## Privacy and Ethics 47 | 48 | - When using network traffic datasets, it's important to consider privacy and ethical concerns. Care should be taken to ensure that personally identifiable information (PII) or sensitive data is not exposed. 49 | 50 | ## Availability 51 | 52 | - The dataset is publicly available for research purposes, often through academic or research institutions or cybersecurity organizations. 53 | 54 | ## PCAP Files 55 | 56 | The PCAP files in the ISCX VPN-NonVPN 2016 dataset contain the following information for each packet: 57 | 58 | - Timestamp 59 | - Source IP address 60 | - Destination IP address 61 | - Source port 62 | - Destination port 63 | - Protocol 64 | - Packet length 65 | - Packet payload 66 | -------------------------------------------------------------------------------- /FC-traffic-classification.py: -------------------------------------------------------------------------------- 1 | from memory usage-execution time import measure_execution_memory 2 | from fc_build_model import fc_build_model 3 | @measure_execution_memory 4 | def FC-traffic-classification(root_normalized_dir,net_params,model_params, 1d-cnn_path,1d-cnn_model,bi-lstm_path,bi-lstm_model,sae_path,sae_model,sae_directory_features): 5 | 6 | df_normalized = pd.DataFrame(columns=['packet_normalized_data', 'class_label']) 7 | df_train = pd.DataFrame(columns=['packet_normalized_data']) 8 | binary = "{0:08b}".format(int("1a", 16)) 9 | col_list = ['packet_normalized_data', 'class_label'] 10 | 11 | # list out keys and values separately 12 | key_list = list(net_parameters.keys()) 13 | val_list = list(net_parameters.values()) 14 | 15 | # list out keys and values separately 16 | key_list1 = list(model_params.keys()) 17 | val_list1 = list(model_params.values()) 18 | DENSE_LAYER = val_list1[key_list1.index("dense_neurons")] 19 | # network parameters 20 | BATCH_SIZE = val_list[key_list.index("BATCH_SIZE")] 21 | EPOCH = val_list[key_list.index("EPOCH")] 22 | VERBOSE = val_list[key_list.index("VERBOSE")] 23 | #OPTIMIZER = tf.keras.optimizers.Adam() 24 | VALIDATION_SPLIT = val_list[key_list.index("VALIDATION_SPLIT")] 25 | NUM_CLASSES = val_list[key_list.index("NUM_CLASSES")] 26 | OPTIMIZER = val_list[key_list.index("OPTIMIZER")] 27 | LOSS_FUNCTION = val_list[key_list.index("LOSS_FUNCTION")] 28 | METRICS = val_list[key_list.index("METRICS")[0], key_list.index("METRICS")[1](),key_list.index("METRICS")[2](),key_list.index("METRICS")[3]()] 29 | DROPOUT = val_list[key_list.index("DROPOUT")] 30 | #load models weights 31 | 1d-cnn_model.load_weights(1d-cnn_path) 32 | bi-lstm_model.load_weights(bi-lstm_path) 33 | sae_model.load_weights(sae_path) 34 | train_data_list = [] 35 | train_label_list = [] 36 | test_data_list = [] 37 | test_lable_list = [] 38 | sae_train_data_list[] 39 | sae_train_label_list =[] 40 | sae_test_data_list =[] 41 | sae_test_label_list = [] 42 | for path in os.listdir(root_normalized_dir): 43 | full_path = os.path.join(root_normalized_dir, path) 44 | df = pd.read_csv(full_path, usecols=col_list) 45 | 46 | 47 | # train on model 48 | X = df.iloc[:,0:1] 
# Data 49 | Y = df.iloc[:,1:2] # Label 50 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42) 51 | # prepare label of packet for deep NN 52 | train_label_data_list = [] 53 | test_label_data_list = [] 54 | pkt_train_label_data = np.zeros([len(y_train), 1]) 55 | pkt_test_label_data = np.zeros([len(y_test), 1]) 56 | for i in range(len(y_train)): 57 | pkt_train_label_data[i,0] = y_train.iloc[i,0] 58 | #train_label_data_list.append(pkt_train_label_data) 59 | train_label = np.array(pkt_train_label_data) 60 | train_label = train_label[:,0] 61 | train_label = train_label.astype(np.int) 62 | for i in range(len(y_test)): 63 | pkt_test_label_data[i,0] = y_test.iloc[i,0] 64 | #test_label_data_list.append(pkt_test_label_data) 65 | test_label = np.array(pkt_test_label_data) 66 | test_label = test_label[:, 0] 67 | test_label = test_label.astype(np.int) 68 | 69 | #y_train = y_train.to_numpy() 70 | #y_train = y_train.T 71 | # To create a x-by-y-by-z 3D list with initial values: 72 | 73 | data_list = [] 74 | test_list = [] 75 | 76 | pkt_data = np.zeros([len(X_train.iloc[0, 0].split(',')), 1]) 77 | for i in range(len(X_train)): 78 | print("trian preparing data i {}".format(i)) 79 | pkt_train_data = np.zeros([len(X_train.iloc[0, 0].split(',')), 1]) 80 | temp_train_list = X_train.iloc[i, 0].split(',')[:] 81 | for j in range(len(temp_train_list)): 82 | #print("test preparing data j {}".format(j)) 83 | pkt_train_data[j,0] = float(temp_train_list[j]) 84 | data_list.append(pkt_train_data) 85 | 86 | train_data = np.array(data_list) 87 | for i in range(len(X_test)): 88 | print("test preparing data i {}".format(i)) 89 | pkt_test_data = np.zeros([len(X_test.iloc[0, 0].split(',')), 1]) 90 | temp_test_list = X_test.iloc[i, 0].split(',')[:] 91 | for j in range(len(temp_test_list)): 92 | #print("test preparing data j {}".format(j)) 93 | pkt_test_data[j,0] = float(temp_test_list[j]) 94 | test_list.append(pkt_test_data) 95 | test_data = np.array(test_list) 96 | # convert class vectors to binary class matrices 97 | train_label = tf.keras.utils.to_categorical(train_label, NUM_CLASSES) 98 | train_label_list.append(train_label) 99 | test_label = tf.keras.utils.to_categorical(test_label, NUM_CLASSES) 100 | test_label_list.append(test_label) 101 | train_data = train_data.reshape((len(train_data), len(X_train.iloc[0,0].split(',')), 1, 1)) 102 | train_data_list.append(train_data) 103 | test_data = test_data.reshape((len(test_data), len(X_test.iloc[0, 0].split(',')), 1, 1)) 104 | test_data_list.append(test_data) 105 | 106 | # read sae features from specified directory 107 | for path in os.listdir(sae_directory_features): 108 | 109 | full_path = os.path.join(sae_feature_dir, path) 110 | df = pd.read_csv(full_path, usecols=col_list) 111 | 112 | # train on model 113 | X = df.iloc[:,0:1] # Data 114 | Y = df.iloc[:,1:2] # Label 115 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42) 116 | # prepare label of packet for deep NN 117 | train_label_data_list = [] 118 | test_label_data_list = [] 119 | train_label_data = np.zeros([len(y_train), 1]) 120 | test_label_data = np.zeros([len(y_test), 1]) 121 | for i in range(len(y_train)): 122 | train_label_data[i,0] = y_train.iloc[i,0] 123 | #train_label_data_list.append(pkt_train_label_data) 124 | train_label = np.array(train_label_data) 125 | train_label = train_label[:,0] 126 | train_label = train_label.astype(np.int) 127 | for i in range(len(y_test)): 128 | test_label_data[i,0] = y_test.iloc[i,0] 129 | 
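# y_train/y_test hold one integer class id per row; these loops copy the ids into plain
# NumPy arrays so they can be one-hot encoded with to_categorical below (for example, with
# NUM_CLASSES = 12, label 3 becomes [0, 0, 0, 1, 0, ..., 0]).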
#test_label_data_list.append(pkt_test_label_data) 130 | test_label = np.array(test_label_data) 131 | test_label = test_label[:, 0] 132 | test_label = test_label.astype(np.int) 133 | # convert class vectors to binary class matrices 134 | train_label = tf.keras.utils.to_categorical(train_label, NUM_CLASSES) 135 | sae_train_label_list.append(train_label) 136 | test_label = tf.keras.utils.to_categorical(test_label, NUM_CLASSES) 137 | sae_test_label_list.append(test_label) 138 | train_data = train_data.reshape((len(train_data), len(X_train.iloc[0,0].split(',')), 1, 1)) 139 | sae_train_data_list.append(train_data) 140 | test_data = test_data.reshape((len(test_data), len(X_test.iloc[0, 0].split(',')), 1, 1)) 141 | sae_test_data_list.append(test_data) 142 | 143 | 144 | #compile and fit final combined model 145 | model = fc_build_model(model_params) 146 | model.compile(loss=LOSS_FUNCTION, 147 | optimizer=OPTIMIZER, 148 | metrics=METRICS) 149 | model.summary() 150 | print("this is running Bi-LSTM model: ") 151 | # create output of 1D-CNN, Bi-LSTM ans SAE 152 | x_combined = np.concatenate((1d-cnn_model.predict(train_data_list), bi-lstm_model.predict(train_data_list),sae_model(sae_train_data_list), axis=1) 153 | 154 | model.fit(x_combined, train_label, batch_size=BATCH_SIZE, 155 | epochs=EPOCH,verbose=VERBOSE,validation_split= VALIDATION_SPLIT ) 156 | x_combined_test = np.concatenate((1d-cnn_model.predict(test_data_list), bi-lstm_model.predict(test_data_list),sae_model(sae_test_data_list), axis=1) 157 | (loss, accuracy, f1_score, precision, recall) = model.evaluate(x_combined_test, test_label, 158 | batch_size=BATCH_SIZE) 159 | score = [] 160 | score[0] = loss 161 | score[1]= accuracy 162 | score[2] = f1_score 163 | score[3] = precision 164 | score[4] = recall 165 | 166 | print("\nTest loss:", score[0]) 167 | print('Test accuracy:', score[1]) 168 | print('Test f1_score:', score[2]) 169 | print('Test precision:', score[3]) 170 | print('Test recall:', score[4]) 171 | -------------------------------------------------------------------------------- /ISCX-Analysis/README.md: -------------------------------------------------------------------------------- 1 | # Network Traffic Session Features Extractor 2 | 3 | The code provided in `session_features.py` extracts statistical features from network traffic sessions in PCAP files and saves them to a CSV file. These features provide valuable insights into network traffic characteristics. The extracted statistical features include: 4 | 5 | - **Interarrival Times**: Minimum, maximum, median, and standard deviation of the time between packets in a session. 6 | - **Packet Lengths**: Minimum, maximum, median, and standard deviation of the packet lengths in a session. 7 | - **Payload Sizes**: Minimum, maximum, median, and standard deviation of the payload sizes in a session. 8 | - **Session Duration**: Total time of the session. 9 | - **Active Time**: Total time that packets were being sent or received in the session. 10 | - **Idle Time**: Total time that there was no packet activity in the session. 11 | - **Packet Truncation**: Number of packets that were truncated in the session. 12 | - **Total Packets**: Total number of packets in the session. 13 | - **Bytes per Second**: Average number of bytes transmitted per second in the session. 14 | - **Packets per Second**: Average number of packets transmitted per second in the session. 15 | 16 | ## How It Works 17 | 18 | 1. **Data Preparation**: The code begins by reading PCAP files and extracting individual packets. 
19 | 20 | 2. **Session Identification**: It groups the packets into sessions based on specific session attributes such as source IP address, destination IP address, source port, and destination port. Each session represents a distinct flow of network traffic. 21 | 22 | 3. **Feature Extraction**: For each session, the code calculates the statistical features listed above. 23 | 24 | 4. **CSV File Output**: The extracted statistical features are saved to a CSV file for further analysis and utilization. 25 | 26 | ## Use Cases 27 | 28 | This code can be used for various network-related purposes, including: 29 | 30 | - **Network Traffic Analysis**: The statistical features enable the identification of patterns and trends in network traffic. This information can be used to improve network performance and security. 31 | 32 | - **Network Intrusion Detection**: The extracted features can serve as inputs to machine learning models for detecting malicious network traffic or anomalies. 33 | 34 | - **Network Performance Monitoring**: By analyzing the statistics, you can monitor the performance of network applications and services, helping with troubleshooting and optimization. 35 | 36 | ## Conclusion 37 | 38 | The `session_features.py` code provides a valuable tool for extracting statistical features from network traffic sessions. These features can be utilized for a wide range of network analysis and monitoring tasks, ultimately contributing to better network performance and security. 39 | 40 | -------------------------------------------------------------------------------- /ISCX-Analysis/session features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from scapy.all import rdpcap 4 | from collections import defaultdict 5 | import statistics 6 | import numpy as np 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.neural_network import MLPClassifier 10 | from sklearn.metrics import accuracy_score 11 | 12 | # Function to calculate statistical features 13 | def calculate_statistics(values): 14 | if not values: 15 | return None, None, None, None # Return None for min, max, median, and standard deviation if the list is empty 16 | return min(values), max(values), statistics.median(values), statistics.stdev(values) 17 | 18 | # Initialize data structures 19 | session_data = defaultdict(list) 20 | 21 | # Replace 'path/to/your/dataset/folder' with the actual path to your dataset folder 22 | dataset_folder = 'path/to/your/dataset/folder' 23 | output_csv = 'session_stats.csv' 24 | 25 | # List of PCAP files in the dataset folder 26 | pcap_files = [file for file in os.listdir(dataset_folder) if file.endswith('.pcap')] 27 | 28 | # Loop through the PCAP files 29 | for pcap_file in pcap_files: 30 | pcap_path = os.path.join(dataset_folder, pcap_file) 31 | packets = rdpcap(pcap_path) 32 | current_session = [] 33 | last_time = None 34 | 35 | # Extract label from file name (assuming filenames are formatted as "label_filename.pcap") 36 | label = int(pcap_file.split('_')[0]) 37 | 38 | # Loop through the packets in the PCAP file 39 | for packet in packets: 40 | if 'IP' in packet and 'TCP' in packet: 41 | session_key = (packet['IP'].src, packet['IP'].dst, packet['TCP'].sport, packet['TCP'].dport) 42 | if not current_session: 43 | current_session.append(packet) 44 | else: 45 | inter_arrival_time = packet.time - last_time 46 | current_session.append(packet) 47 | if 
packet['TCP'].flags & 0x02: # Check if it's a SYN packet (start of a new session) 48 | if session_key not in session_data: 49 | session_data[session_key] = { 50 | 'InterarrivalTimes': [], 51 | 'PacketLengths': [], 52 | 'PayloadSizes': [], 53 | 'SessionDuration': 0, 54 | 'Label': label 55 | } 56 | session_data[session_key]['InterarrivalTimes'].append(inter_arrival_time) 57 | session_data[session_key]['PacketLengths'].append(len(packet)) 58 | if 'Raw' in packet: 59 | session_data[session_key]['PayloadSizes'].append(len(packet['Raw'])) 60 | current_session = [] 61 | last_time = packet.time 62 | 63 | # Initialize lists to store the calculated features 64 | session_features = [] 65 | 66 | # Loop through the sessions and calculate the features 67 | for session_key, session_info in session_data.items(): 68 | interarrival_min, interarrival_max, interarrival_median, interarrival_std = calculate_statistics(session_info['InterarrivalTimes']) 69 | packet_length_min, packet_length_max, packet_length_median, packet_length_std = calculate_statistics(session_info['PacketLengths']) 70 | payload_size_min, payload_size_max, payload_size_median, payload_size_std = calculate_statistics(session_info['PayloadSizes']) 71 | session_duration = sum(session_info['InterarrivalTimes']) 72 | active_time_min, active_time_max, active_time_median, active_time_std = calculate_statistics([session_duration]) 73 | idle_time_min, idle_time_max, idle_time_median, idle_time_std = calculate_statistics([session_duration - sum(session_info['InterarrivalTimes'])]) 74 | total_packets = len(session_info['InterarrivalTimes']) + 1 # Adding 1 to account for the first packet 75 | packet_truncation = total_packets - len(session_info['InterarrivalTimes']) - 1 # Subtracting 1 to account for the last packet 76 | 77 | bytes_per_second = sum(session_info['PacketLengths']) / session_duration 78 | packets_per_second = total_packets / session_duration 79 | 80 | session_features.append({ 81 | 'Min_Interarrival': interarrival_min, 82 | 'Max_Interarrival': interarrival_max, 83 | 'Median_Interarrival': interarrival_median, 84 | 'Std_Interarrival': interarrival_std, 85 | 'Min_Packet_Length': packet_length_min, 86 | 'Max_Packet_Length': packet_length_max, 87 | 'Median_Packet_Length': packet_length_median, 88 | 'Std_Packet_Length': packet_length_std, 89 | 'Min_Payload_Size': payload_size_min, 90 | 'Max_Payload_Size': payload_size_max, 91 | 'Median_Payload_Size': payload_size_median, 92 | 'Std_Payload_Size': payload_size_std, 93 | 'Min_Active_Time': active_time_min, 94 | 'Max_Active_Time': active_time_max, 95 | 'Median_Active_Time': active_time_median, 96 | 'Std_Active_Time': active_time_std, 97 | 'Min_Idle_Time': idle_time_min, 98 | 'Max_Idle_Time': idle_time_max, 99 | 'Median_Idle_Time': idle_time_median, 100 | 'Std_Idle_Time': idle_time_std, 101 | 'Packet_Truncation': packet_truncation, 102 | 'Total_Packets': total_packets, 103 | 'Session_Duration': session_duration, 104 | 'Bytes_Per_Second': bytes_per_second, 105 | 'Packets_Per_Second': packets_per_second, 106 | 'Label': session_info['Label'] 107 | }) 108 | 109 | # Create a DataFrame from the session features 110 | df = pd.DataFrame(session_features) 111 | 112 | # Save the statistical features to a CSV file 113 | df.to_csv(output_csv, index=False) 114 | 115 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CBS - Encrypted Traffic Classification using Deep Learning 2 | 3 | 
## Overview 4 | 5 | CBS is a platform designed and implemented for encrypted traffic classification using deep learning. The goal of this project is to accurately classify network traffic into different traffic types and applications, such as email, P2P, Skype, and more. 6 | 7 | ## General Algorithm 8 | 9 | The general algorithm for performing classification is as follows: 10 | 11 | 1. **Data Extraction:** PCAP files containing packets for the relevant traffic types and applications are extracted from the dataset. 12 | 13 | 2. **Preprocessing:** The extracted PCAP files undergo preprocessing, which includes removing unused packets such as DNS and DHCP traffic and other non-essential data. 14 | 15 | 3. **Feature Extraction:** Important parts of the payload and header of each packet are extracted, and a fixed-length record (e.g., 1,500 bytes) is created. These records form the new dataset used for learning. 16 | 17 | 4. **Data Augmentation:** To address imbalances in the dataset, especially when some traffic types have fewer samples than others, Generative Adversarial Networks (GANs) are used to synthesize new samples. 18 | 19 | 5. **Spatial Feature Extraction:** Spatial features are extracted from the data using a 1D Convolutional Neural Network (1D-CNN). 20 | 21 | 6. **Temporal Feature Extraction:** Temporal features are extracted using an attention-based Bidirectional Long Short-Term Memory (Bi-LSTM) network. 22 | 23 | 7. **Statistical Feature Extraction:** Statistical features are extracted through a Stacked Autoencoder (SAE). 24 | 25 | 8. **Feature Aggregation:** The outputs of the 1D-CNN, attention Bi-LSTM, and SAE are aggregated and fed into a fully connected neural network (a minimal sketch of this fusion step follows below). 26 | 27 | 9. **Classification:** The fully connected network learns from the aggregated features and performs the final classification into 12 traffic types (e.g., email, P2P) and 17 applications (e.g., Skype, Vimeo).
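To make steps 8 and 9 concrete, here is a minimal, self-contained Keras sketch of the fusion classifier. The feature dimensions and layer widths below are illustrative assumptions loosely inspired by the parameter files in this repository (e.g., `define_FC_model_params.py`); they are not the exact configuration used by `FC-traffic-classification.py`.

```python
import numpy as np
import tensorflow as tf

# Illustrative sizes only; the real dimensions come from the trained
# 1D-CNN, attention Bi-LSTM, and SAE models in this repository.
CNN_DIM, LSTM_DIM, SAE_DIM, NUM_CLASSES = 200, 74, 64, 12

def build_fusion_classifier():
    cnn_in = tf.keras.Input(shape=(CNN_DIM,), name="cnn_features")
    lstm_in = tf.keras.Input(shape=(LSTM_DIM,), name="bilstm_features")
    sae_in = tf.keras.Input(shape=(SAE_DIM,), name="sae_features")

    # Step 8: aggregate the three feature vectors into one representation.
    x = tf.keras.layers.Concatenate()([cnn_in, lstm_in, sae_in])

    # Step 9: fully connected layers followed by a softmax classifier.
    x = tf.keras.layers.Dense(512, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.4)(x)
    x = tf.keras.layers.Dense(256, activation="relu")(x)
    out = tf.keras.layers.Dense(NUM_CLASSES, activation="softmax")(x)

    model = tf.keras.Model([cnn_in, lstm_in, sae_in], out)
    model.compile(optimizer="adam", loss="categorical_crossentropy",
                  metrics=["accuracy"])
    return model

if __name__ == "__main__":
    model = build_fusion_classifier()
    # Random stand-in features, just to check the shapes end to end.
    feats = [np.random.rand(8, d).astype("float32")
             for d in (CNN_DIM, LSTM_DIM, SAE_DIM)]
    labels = tf.keras.utils.to_categorical(
        np.random.randint(0, NUM_CLASSES, 8), NUM_CLASSES)
    model.fit(feats, labels, epochs=1, verbose=0)
```

In the actual pipeline, the three input vectors would come from the saved 1D-CNN, attention Bi-LSTM, and SAE models rather than from random data.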
28 | -------------------------------------------------------------------------------- /SAE_Traffic_classification.py: -------------------------------------------------------------------------------- 1 | # in this function we build CNN for Traffic Classification 2 | from memory usage-execution time import measure_execution_memory 3 | from SAE_build_model import SAE_build_model 4 | @measure_execution_memory 5 | def SAE_Traffic_classification(sae_feature_dir,net_parameters,model_params): 6 | 7 | df_normalized = pd.DataFrame(columns=['sae_normalized_features', 'class_label']) 8 | df_train = pd.DataFrame(columns=['sae_normalized_features']) 9 | binary = "{0:08b}".format(int("1a", 16)) 10 | col_list = ['sae_normalized_features', 'class_label'] 11 | 12 | 13 | 14 | 15 | for path in os.listdir(sae_feature_dir): 16 | 17 | full_path = os.path.join(sae_feature_dir, path) 18 | df = pd.read_csv(full_path, usecols=col_list) 19 | 20 | # train on model 21 | X = df.iloc[:,0:1] # Data 22 | Y = df.iloc[:,1:2] # Label 23 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42) 24 | # prepare label of packet for deep NN 25 | train_label_data_list = [] 26 | test_label_data_list = [] 27 | train_label_data = np.zeros([len(y_train), 1]) 28 | test_label_data = np.zeros([len(y_test), 1]) 29 | for i in range(len(y_train)): 30 | train_label_data[i,0] = y_train.iloc[i,0] 31 | #train_label_data_list.append(pkt_train_label_data) 32 | train_label = np.array(train_label_data) 33 | train_label = train_label[:,0] 34 | train_label = train_label.astype(np.int) 35 | for i in range(len(y_test)): 36 | test_label_data[i,0] = y_test.iloc[i,0] 37 | #test_label_data_list.append(pkt_test_label_data) 38 | test_label = np.array(test_label_data) 39 | test_label = test_label[:, 0] 40 | test_label = test_label.astype(np.int) 41 | # convert class vectors to binary class matrices 42 | train_label = tf.keras.utils.to_categorical(train_label, NUM_CLASSES) 43 | test_label = tf.keras.utils.to_categorical(test_label, NUM_CLASSES) 44 | 45 | train_data = train_data.reshape((len(train_data), len(X_train.iloc[0,0].split(',')), 1, 1)) 46 | test_data = test_data.reshape((len(test_data), len(X_test.iloc[0, 0].split(',')), 1, 1)) 47 | stack_autoencoder_list,code_layer_output = SAE_build_model(model_params,net_parameters,train_data,test_data,train_label,test_label,LOSS_FUNCTION,OPTIMIZER,METRICS) 48 | 49 | 50 | 51 | 52 | 53 | saved_models = [] 54 | saved_weights = [] 55 | save_model_weights_dir = 'media/mehdi/linux/normalized_data/' 56 | for i, model in enumerate(stack_autoencoder_list): 57 | # save model architecture 58 | tf.keras.models.save_model(model, save_model_weights_dir + f'model_architecture_sae{i}.h5') 59 | saved_models.append(f'model_architecture_sae{i}.h5') 60 | tf.keras.models.save_weights(model, save_model_weights_dir + f'model_weights_sae{i}.h5') 61 | saved_weights.append(f'model_weights_sae{i}.h5') 62 | # save model weights 63 | return code_layer_output,save_models,saved_weights, save_model_weights_dir 64 | -------------------------------------------------------------------------------- /SAE_build_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | import pandas as pd 4 | 5 | def SAE_build_moel(params,net_parameters,train_data,test_data,train_label,test_label,LOSS_FUNCTION,OPTIMIZER,METRIC): 6 | 7 | encoder_neurons_sae = params['encoder_neurons_sae'] 8 | code_neurons_sae = 
params['code_neurons_sae'] 9 | decoder_neurons_sae = params['decoder_neurons_sae'] 10 | output_neurons_sae = params['output_neurons_sae'] 11 | autoencoders = [] # Create an empty list to store the output of define_autoencoder 12 | # list out keys and values separately 13 | key_list = list(net_parameters.keys()) 14 | val_list = list(net_parameters.values()) 15 | # network parameters 16 | BATCH_SIZE = val_list[key_list.index("BATCH_SIZE")] 17 | EPOCH = val_list[key_list.index("EPOCH")] 18 | VERBOSE = val_list[key_list.index("VERBOSE")] 19 | #OPTIMIZER = tf.keras.optimizers.Adam() 20 | VALIDATION_SPLIT = val_list[key_list.index("VALIDATION_SPLIT")] 21 | #NUM_CLASSES = val_list[key_list.index("NUM_CLASSES")] 22 | OPTIMIZER = val_list[key_list.index("OPTIMIZER")] 23 | LOSS_FUNCTION = val_list[key_list.index("LOSS_FUNCTION")] 24 | METRICS = val_list[key_list.index("METRICS")[0], key_list.index("METRICS")[1](),key_list.index("METRICS")[2](),key_list.index("METRICS")[3]()] 25 | DROPOUT = val_list[key_list.index("DROPOUT")] 26 | stack = [] 27 | 28 | # Process each parameter individually 29 | for i in range(6): 30 | encoder_neurons = encoder_neurons_sae[i] 31 | code_neurons = code_neurons_sae[i] 32 | decoder_neurons = decoder_neurons_sae[i] 33 | output_neurons = output_neurons_sae[i] 34 | # Call the autoencoder function with each set of parameters 35 | autoencoder = define_autoencoder(encoder_neurons, code_neurons, decoder_neurons, output_neurons) 36 | autoencoders.append(autoencoder) # Append the output to the list 37 | for model in autoencoders: 38 | model.compile(loss=LOSS_FUNCTION, 39 | optimizer=OPTIMIZER, 40 | metrics=METRICS) 41 | model.summary() 42 | if(autoencoders.index(model) == 1): 43 | stack.append([autoencoders.index(model)]) = model.fit(train_data, train_data, batch_size=BATCH_SIZE, 44 | epochs=EPOCH,verbose=VERBOSE, validation_data=(test_data, test_data) ) 45 | else: 46 | temp_input = model.predict(train_data) 47 | temp_input = np.concatenate((temp_input , train_data)) 48 | stack.append([autoencoders.index(model)]) = model.fit(temp_input, temp_input, batch_size=BATCH_SIZE, 49 | epochs=EPOCH,verbose=VERBOSE, validation_data=(test_data, test_data) ) 50 | train_data = temp_input 51 | return stack, stack[6].get_layer('code').output) -------------------------------------------------------------------------------- /cnn_Traffic_classification.py: -------------------------------------------------------------------------------- 1 | # in this function we build CNN for Traffic Classification 2 | from memory usage-execution time import measure_execution_memory 3 | from cnn_build_model import cnn_build_model 4 | @measure_execution_memory 5 | def cnn_Traffic_classification(root_normalized_dir,net_parameters,model_parameters): 6 | df_normalized = pd.DataFrame(columns=['packet_normalized_data', 'class_label']) 7 | df_train = pd.DataFrame(columns=['packet_normalized_data']) 8 | binary = "{0:08b}".format(int("1a", 16)) 9 | col_list = ['packet_normalized_data', 'class_label'] 10 | 11 | # list out keys and values separately 12 | key_list = list(net_parameters.keys()) 13 | val_list = list(net_parameters.values()) 14 | 15 | # list out keys and values separately 16 | key_list1 = list(model_parameters.keys()) 17 | val_list1 = list(model_parameters.values()) 18 | DENSE_LAYER = val_list1[key_list1.index("DENSE_LAYER")] 19 | 20 | # network parameters 21 | BATCH_SIZE = val_list[key_list.index("BATCH_SIZE")] 22 | EPOCH = val_list[key_list.index("EPOCH")] 23 | VERBOSE = val_list[key_list.index("VERBOSE")] 24 | 
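# NOTE: the METRICS entry looked up from net_parameters a few lines below is expected to
# resolve to one metric per value unpacked from model.evaluate() later in this function,
# e.g. something like ['accuracy', f1_metric, tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
# where f1_metric stands for whatever custom F1 implementation network_parameters_initializer.py supplies.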
#OPTIMIZER = tf.keras.optimizers.Adam() 25 | VALIDATION_SPLIT = val_list[key_list.index("VALIDATION_SPLIT")] 26 | NUM_CLASSES = val_list[key_list.index("NUM_CLASSES")] 27 | OPTIMIZER = val_list[key_list.index("OPTIMIZER")] 28 | LOSS_FUNCTION = val_list[key_list.index("LOSS_FUNCTION")] 29 | METRICS = val_list[key_list.index("METRICS")[0], key_list.index("METRICS")[1](),key_list.index("METRICS")[2](),key_list.index("METRICS")[3]()] 30 | DROPOUT = val_list[key_list.index("DROPOUT")] 31 | 32 | for path in os.listdir(root_normalized_dir): 33 | full_path = os.path.join(root_normalized_dir, path) 34 | df = pd.read_csv(full_path, usecols=col_list) 35 | model = cnn_build_model(model_parameters) 36 | model.compile(loss=LOSS_FUNCTION, 37 | optimizer=OPTIMIZER, 38 | metrics=METRICS) 39 | model.summary() 40 | print("this is running 1D-CNN model: ") 41 | 42 | # train on model 43 | X = df.iloc[:,0:1] # Data 44 | Y = df.iloc[:,1:2] # Label 45 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42) 46 | # prepare label of packet for deep NN 47 | train_label_data_list = [] 48 | test_label_data_list = [] 49 | pkt_train_label_data = np.zeros([len(y_train), 1]) 50 | pkt_test_label_data = np.zeros([len(y_test), 1]) 51 | for i in range(len(y_train)): 52 | pkt_train_label_data[i,0] = y_train.iloc[i,0] 53 | #train_label_data_list.append(pkt_train_label_data) 54 | train_label = np.array(pkt_train_label_data) 55 | train_label = train_label[:,0] 56 | train_label = train_label.astype(np.int) 57 | for i in range(len(y_test)): 58 | pkt_test_label_data[i,0] = y_test.iloc[i,0] 59 | #test_label_data_list.append(pkt_test_label_data) 60 | test_label = np.array(pkt_test_label_data) 61 | test_label = test_label[:, 0] 62 | test_label = test_label.astype(np.int) 63 | 64 | #y_train = y_train.to_numpy() 65 | #y_train = y_train.T 66 | # To create a x-by-y-by-z 3D list with initial values: 67 | 68 | data_list = [] 69 | test_list = [] 70 | 71 | pkt_data = np.zeros([len(X_train.iloc[0, 0].split(',')), 1]) 72 | for i in range(len(X_train)): 73 | print("trian preparing data i {}".format(i)) 74 | pkt_train_data = np.zeros([len(X_train.iloc[0, 0].split(',')), 1]) 75 | temp_train_list = X_train.iloc[i, 0].split(',')[:] 76 | for j in range(len(temp_train_list)): 77 | #print("test preparing data j {}".format(j)) 78 | pkt_train_data[j,0] = float(temp_train_list[j]) 79 | data_list.append(pkt_train_data) 80 | 81 | train_data = np.array(data_list) 82 | for i in range(len(X_test)): 83 | print("test preparing data i {}".format(i)) 84 | pkt_test_data = np.zeros([len(X_test.iloc[0, 0].split(',')), 1]) 85 | temp_test_list = X_test.iloc[i, 0].split(',')[:] 86 | for j in range(len(temp_test_list)): 87 | #print("test preparing data j {}".format(j)) 88 | pkt_test_data[j,0] = float(temp_test_list[j]) 89 | test_list.append(pkt_test_data) 90 | test_data = np.array(test_list) 91 | # convert class vectors to binary class matrices 92 | train_label = tf.keras.utils.to_categorical(train_label, NUM_CLASSES) 93 | test_label = tf.keras.utils.to_categorical(test_label, NUM_CLASSES) 94 | 95 | train_data = train_data.reshape((len(train_data), len(X_train.iloc[0,0].split(',')), 1, 1)) 96 | test_data = test_data.reshape((len(test_data), len(X_test.iloc[0, 0].split(',')), 1, 1)) 97 | #model = create_1dcnn_model() 98 | 99 | 100 | model.fit(train_data, train_label, batch_size=BATCH_SIZE, 101 | epochs=EPOCH,verbose=VERBOSE,validation_split= VALIDATION_SPLIT ) 102 | #score = model.evaluate(test_data, test_label, 103 | # 
batch_size=BATCH_SIZE) 104 | (loss, accuracy, f1_score, precision, recall) = model.evaluate(test_data, test_label, 105 | batch_size=BATCH_SIZE) 106 | score = [] 107 | score[0] = loss 108 | score[1]= accuracy 109 | score[2] = f1_score 110 | score[3] = precision 111 | score[4] = recall 112 | 113 | print("\nTest loss:", score[0]) 114 | print('Test accuracy:', score[1]) 115 | print('Test f1_score:', score[2]) 116 | print('Test precision:', score[3]) 117 | print('Test recall:', score[4]) 118 | 119 | saved_models = [] 120 | saved_weights = [] 121 | save_model_weights_dir = 'media/mehdi/linux/normalized_data/' 122 | # save model architecture 123 | model.save(save_model_weights_dir + 'model_architecture_cnn.h5') 124 | saved_models.append(save_model_weights_dir +'model_architecture_cnn.h5') 125 | # save model weights 126 | model.save_weights('model_weights_cnn.h5') 127 | saved_weights.append('model_weights_cnn.h5') 128 | # Get the output of the last connected layer 129 | last_dense_output = model.layers[-DENSE_LAYER[0]].output 130 | 131 | print(f"GAN training completed in {execution_time:.2f} seconds ({execution_time_minutes:.2f} minutes).") 132 | return last_dense_output,saved_models,saved_weights,save_model_weights_dir 133 | -------------------------------------------------------------------------------- /cnn_build_model.py: -------------------------------------------------------------------------------- 1 | def cnn_build_model(parameters): 2 | # list out keys and values separately 3 | key_list = list(parameters.keys()) 4 | val_list = list(parameters.values()) 5 | #FILTERS = val_list[key_list.index("FILTERS")] 6 | KERNEL_SIZE = val_list[key_list.index("KERNEL_SIZE")] 7 | STRIDES = val_list[key_list.index("STRIDES")] 8 | PADDING = val_list[key_list.index("PADDING")] 9 | POOL-TYPE = val_list[key_list.index("POOL-TYPE")] 10 | POOL_SIZE = val_list[key_list.index("POOL_SIZE")] 11 | POOL_STRIDE = val_list[key_list.index("POOL_STRIDE")] 12 | HIDEN_ACTIVATION_FUNCTION = val_list[key_list.index("HIDEN_ACTIVATION_FUNCTION")] 13 | OUTPUT_ACTIVATION_FUNCTION = val_list[key_list.index("OUTPUT_ACTIVATION_FUNCTION")] 14 | INPUT_DATA_SHAPE = val_list[key_list.index("INPUT_SHAPE")] 15 | INPUT_SHAPE = (val_list[key_list.index("INPUT_SHAPE")][0],val_list[key_list.index("INPUT_SHAPE")][1],1) 16 | CNN_LAYER_SPEC = val_list[key_list.index("CNN_LAYER_SPEC")] 17 | DENSE_LAYER = val_list[key_list.index("DENSE_LAYER")] 18 | DENSE_LAYER_ACTIVATION_FUNCTION = val_list[key_list.index("DENSE_LAYER_ACTIVATION_FUNCTION")] 19 | SOFTMAX_LAYER = val_list[key_list.index("SOFTMAX_LAYER")] 20 | SOFTMAX_LAYER_ACTIVATION_FUNCTION = val_list[key_list.index("SOFTMAX_LAYER_ACTIVATION_FUNCTION")] 21 | 22 | 23 | model = ks.models.Sequential() 24 | # this CNN has been implemented based on DEEP PACKET Paper 25 | for i in range(CNN_LAYER_SPEC[0]): 26 | if i == 0 : 27 | model.add(ks.layers.Convolution2D(CNN_LAYER_SPEC[i+1], (KERNEL_SIZE[i],KERNEL_SIZE[i+1] ),padding=PADDING, 28 | strides=(STRIDES[i],STRIDES[i+1]),activation = HIDEN_ACTIVATION_FUNCTION, input_shape=INPUT_SHAPE)) 29 | model.add(tf.keras.layers.BatchNormalization()) 30 | else: 31 | 32 | model.add(ks.layers.Convolution2D(CNN_LAYER_SPEC[i+1],(KERNEL_SIZE[2*i],KERNEL_SIZE[2*i+1] ) ,padding = PADDING, 33 | strides=(STRIDES[2*i],STRIDES[2*i+1]),activation=HIDEN_ACTIVATION_FUNCTION)) 34 | model.add(tf.keras.layers.BatchNormalization()) 35 | # Add AveragePooling1D layer based on POOL-TYPE if 'POOL-TYPE' is defined 36 | if 'POOL-TYPE' in parameters and parameters['POOL-TYPE'] == 
AveragePooling1D: 37 | model.add(AveragePooling1D(pool_size= (POOL_SIZE[0],POOL_SIZE[1]), strides= (POOL_STRIDE[0],POOL_STRIDE[1]))) 38 | 39 | 40 | # Flatten => RELU layers 41 | model.add(ks.layers.Flatten()) 42 | # Dense Connected Layer 43 | for i in range(DENSE_LAYER[0]): 44 | model.add(ks.layers.Dense(DENSE_LAYER[i+1], activation=DENSE_LAYER_ACTIVATION_FUNCTION[i])) 45 | model.add(tf.keras.layers.BatchNormalization()) 46 | 47 | 48 | return model 49 | -------------------------------------------------------------------------------- /compare-accuracy-code.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from scipy.signal import savgol_filter 5 | from google.colab import files 6 | 7 | # Upload Excel file 8 | uploaded = files.upload() 9 | 10 | # Read Excel file into a DataFrame 11 | df = pd.read_excel(next(iter(uploaded))) 12 | 13 | # Define line styles and symbols for each column 14 | line_styles = ['-', '--', ':'] 15 | #symbols = ['s', 'o', '^'] 16 | columns = ['CBS', 'CSCNN-[79]', 'Datanet-[19]'] 17 | 18 | # Plotting 19 | plt.figure(figsize=(10, 6)) 20 | 21 | for i, column in enumerate(columns): 22 | x = df['Epoch'] 23 | y = df[column] 24 | 25 | # Apply smoothing filter 26 | #y_smooth = savgol_filter(y, window_length=5, polyorder=2) 27 | # Apply Gaussian smoothing filter 28 | y_smooth = gaussian_filter1d(y, sigma=1) 29 | 30 | # Plot line with specific style and symbol 31 | plt.plot(x, y_smooth, linestyle=line_styles[i],linewidth=3,label=column) 32 | 33 | # Add legend and labels 34 | plt.legend() 35 | plt.xlabel('Epoch') 36 | plt.ylabel('Accuracy [%]') 37 | # Set x-axis spacing to 2 38 | #plt.xticks(np.arange(min(x), max(x)+1, 2)) 39 | # Set x-axis spacing to 2 40 | plt.xticks(np.arange(min(x), 21, 1)) 41 | #plt.title('Line Plot') 42 | # Save the plot as SVG 43 | output_file = input("Please provide the output file name (e.g., plot.svg): ") 44 | plt.savefig(output_file, format='svg') 45 | print("Line plot saved as SVG.") 46 | 47 | # Download the saved SVG file 48 | files.download(output_file) 49 | 50 | # Show the plot 51 | plt.show() -------------------------------------------------------------------------------- /defin_1D-CNN_model_params.py: -------------------------------------------------------------------------------- 1 | 2 | from tensorflow.keras.layers import AveragePooling1D 3 | def defin_1D-CNN_model_params(): 4 | size_list = [4,1,5,1] 5 | stride_list = [3,1,1,1] 6 | parameters_dict = {} 7 | parameters_dict['DROPOUT'] = 0.12 8 | parameters_dict['KERNEL_SIZE'] = [] 9 | parameters_dict['FILTERS'] = 2 10 | parameters_dict['STRIDES'] = [] 11 | parameters_dict['PADDING'] = 'same' 12 | # Add a key name 'POOL-TYPE' to the parameters_dict dictionary and put its value as AveragePooling1D 13 | parameters_dict['POOL-TYPE'] = AveragePooling1D 14 | parameters_dict['POOL_SIZE'] = (2,1) 15 | parameters_dict['POOL_STRIDE'] = (2,1) 16 | parameters_dict['INPUT_SHAPE'] = (1500,1) 17 | parameters_dict['CNN_LAYER_SPEC'] = (2,200,200) 18 | parameters_dict['DENSE_LAYER'] = (2,300,200) 19 | for i in range(parameters_dict['FILTERS']): 20 | parameters_dict['KERNEL_SIZE'].append(size_list[2*i]) 21 | parameters_dict['KERNEL_SIZE'].append(size_list[2*i+1]) 22 | parameters_dict['STRIDES'].append(stride_list[2*i]) 23 | parameters_dict['STRIDES'].append(stride_list[2*i+1]) 24 | return parameters_dict 25 | -------------------------------------------------------------------------------- 
/define_Bi-LSTM_model_params.py:
--------------------------------------------------------------------------------
1 | def define_Bi-LSTM_model_params():
2 |     # Define the model parameters as a dictionary
3 |     params = {
4 |         'input_shape': (1, 1500),  # Shape of input data [1*1500]
5 |         'activation': 'relu',  # Activation function for the fully connected layers
6 |         'dense_neurons': [1024, 512, 256, 74]  # Number of neurons in each dense layer
7 |     }
8 |     return params
--------------------------------------------------------------------------------
/define_FC_model_params.py:
--------------------------------------------------------------------------------
1 | def define_FC_model_params():
2 |     parameters = {
3 |         "input_shape": 1500,
4 |         "first_layer": 1300,
5 |         "second_layer": 1000,
6 |         "third_layer": 512,
7 |         "fourth_layer": 256,
8 |         "num_classes": 12,
9 |         "dropout_rate": 0.4,
10 |     }
11 |     return parameters
--------------------------------------------------------------------------------
/define_GAN_model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import pandas as pd
4 | import tensorflow as tf
5 | from tensorflow.keras import layers
6 | from tensorflow.keras.models import Sequential
7 | 
8 | from memory usage-execution time import measure_execution_memory
9 | 
10 | # Define the Generator model
11 | def build_generator(latent_dim):
12 | 
13 |     generator = tf.keras.Sequential()
14 |     generator.add(layers.Conv1D(256, kernel_size=3, activation=tf.keras.layers.LeakyReLU(), input_shape=(1, 1500)))
15 |     generator.add(layers.BatchNormalization())
16 |     generator.add(layers.AveragePooling1D(pool_size=2, strides=2))
17 |     generator.add(layers.Conv1D(128, kernel_size=3, activation=tf.keras.layers.LeakyReLU()))
18 |     generator.add(layers.BatchNormalization())
19 |     generator.add(layers.AveragePooling1D(pool_size=2, strides=2))
20 |     generator.add(layers.Conv1D(64, kernel_size=3, activation=tf.keras.layers.LeakyReLU()))
21 |     generator.add(layers.BatchNormalization())
22 |     generator.add(layers.AveragePooling1D(pool_size=2, strides=2))
23 |     generator.add(layers.Flatten())
24 |     generator.add(layers.Dense(1500, activation='tanh'))
25 |     generator.add(layers.Reshape((1, 1500)))
26 |     return generator
27 | 
28 | 
29 | # Define the Discriminator model
30 | def build_discriminator():
31 |     discriminator = tf.keras.Sequential()
32 |     discriminator.add(layers.Conv1D(256, kernel_size=3, activation=tf.keras.layers.LeakyReLU(), input_shape=(1, 1500)))
33 |     discriminator.add(layers.BatchNormalization())
34 |     discriminator.add(layers.AveragePooling1D(pool_size=2, strides=2))
35 |     discriminator.add(layers.Conv1D(128, kernel_size=3, activation=tf.keras.layers.LeakyReLU()))
36 |     discriminator.add(layers.BatchNormalization())
37 |     discriminator.add(layers.AveragePooling1D(pool_size=2, strides=2))
38 |     discriminator.add(layers.Conv1D(64, kernel_size=3, activation=tf.keras.layers.LeakyReLU()))
39 |     discriminator.add(layers.BatchNormalization())
40 |     discriminator.add(layers.AveragePooling1D(pool_size=2, strides=2))
41 |     discriminator.add(layers.Flatten())
42 |     discriminator.add(layers.Dense(1, activation='sigmoid'))
43 |     return discriminator
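# --- Illustrative sketch, not part of the original repository -----------------
# build_generator() above expects inputs of shape (1, 1500), while generate_artificial_data()
# further down feeds it noise of shape (num_samples, latent_dim), so the two pieces do not fit
# together as written. A minimal generator that maps a latent vector to one (1, 1500) sample
# could look like this; the layer sizes are assumptions, not values from the original code.
# It relies only on the tensorflow import at the top of this file.
def build_dense_generator(latent_dim):
    generator = tf.keras.Sequential([
        tf.keras.layers.Dense(256, input_shape=(latent_dim,)),
        tf.keras.layers.LeakyReLU(),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(512),
        tf.keras.layers.LeakyReLU(),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(1500, activation='tanh'),   # matches the [-1, 1] scaling of the training data
        tf.keras.layers.Reshape((1, 1500)),
    ])
    return generator
# -------------------------------------------------------------------------------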
44 | 
45 | 
46 | # Combine the Generator and Discriminator into a GAN model
47 | def build_gan(generator, discriminator):
48 |     discriminator.trainable = False
49 |     model = Sequential()
50 |     model.add(generator)
51 |     model.add(discriminator)
52 |     return model
53 | 
54 | 
55 | # Load the CSV files from the directory
56 | def load_csv_files(directory):
57 |     csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]
58 |     dataframes = []
59 |     for file in csv_files:
60 |         dataframe = pd.read_csv(os.path.join(directory, file))
61 |         dataframes.append((file, dataframe))  # keep the file name with its dataframe
62 |     return dataframes
63 | 
64 | 
65 | # Generate artificial data using GAN for specific labels
66 | def generate_artificial_data(generator, num_samples):
67 |     latent_dim = 100
68 |     noise = np.random.normal(0, 1, (num_samples, latent_dim))
69 |     artificial_data = generator.predict(noise)
70 |     return artificial_data
71 | 
72 | 
73 | # GAN model network program
74 | @measure_execution_memory
75 | def process_csv_files():
76 |     directory = './data'  # Directory containing the CSV files
77 |     num_samples = int(input("Enter the number of samples to generate via GAN: "))
78 | 
79 |     # Load CSV files
80 |     dataframes = load_csv_files(directory)
81 | 
82 |     # Process each CSV file
83 |     for file, dataframe in dataframes:
84 |         packet_data = dataframe['packet_normalized_data'].values
85 |         class_labels = dataframe['class_label'].values
86 | 
87 |         # Filter class labels requiring artificial data
88 |         labels_to_generate = [0, 2, 3, 11]
89 |         filtered_indices = [i for i, label in enumerate(class_labels) if label in labels_to_generate]
90 |         filtered_packet_data = packet_data[filtered_indices]
91 |         filtered_class_labels = class_labels[filtered_indices]
92 | 
93 |         # Prepare and train GAN only if there are filtered records
94 |         if len(filtered_indices) > 0:
95 |             # Prepare data for GAN training (Normalize between -1 and 1)
96 |             filtered_packet_data = (filtered_packet_data - np.min(filtered_packet_data)) / (
97 |                 np.max(filtered_packet_data) - np.min(filtered_packet_data))
98 |             filtered_packet_data = 2 * filtered_packet_data - 1
99 |             filtered_packet_data = np.expand_dims(filtered_packet_data, axis=-1)
100 | 
101 |             latent_dim = 100  # Build and compile the models
102 |             generator = build_generator(latent_dim=100)
103 |             discriminator = build_discriminator()
104 |             gan = build_gan(generator, discriminator)
105 | 
106 |             discriminator.compile(loss='binary_crossentropy', optimizer='adam')
107 |             gan.compile(loss='binary_crossentropy', optimizer='adam')
108 | 
109 |             # Train the GAN
110 |             batch_size = 32
111 |             epochs = 100
112 |             num_batches = len(filtered_packet_data) // batch_size
113 | 
114 |             for epoch in range(epochs):
115 |                 for batch in range(num_batches):
116 |                     # Select a random batch of real samples
117 |                     real_samples = filtered_packet_data[batch * batch_size:(batch + 1) * batch_size]
118 | 
119 |                     # Generate a batch of fake samples
120 |                     noise = np.random.normal(0, 1, (batch_size, latent_dim))
121 |                     fake_samples = generator.predict(noise)
122 | 
123 |                     # Train the discriminator
124 |                     discriminator.trainable = True
125 |                     d_loss_real = discriminator.train_on_batch(real_samples, np.ones((batch_size, 1)))
126 |                     d_loss_fake = discriminator.train_on_batch(fake_samples, np.zeros((batch_size, 1)))
127 |                     d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
128 | 
129 |                     # Train the generator
130 |                     discriminator.trainable = False
131 |                     g_loss = gan.train_on_batch(noise, np.ones((batch_size, 1)))
132 | 
133 |                 # Print the progress
134 |                 print(f"Epoch {epoch + 1}/{epochs} - D loss: {d_loss} - G loss: {g_loss}")
135 | 
136 |             # Generate artificial data
137 |             artificial_data = generate_artificial_data(generator, num_samples)
138 | 
139 |             # Add the artificial data to the original dataframe
140 |             dataframe = pd.concat([dataframe, pd.DataFrame({
141 |                 'packet_normalized_data': artificial_data.squeeze(),
142 |                 'class_label': np.random.choice(labels_to_generate, num_samples)
143 |             })], ignore_index=True)
144 | 
145 |             # Save the updated dataframe back to the original CSV file
146 |             dataframe.to_csv(os.path.join(directory, file), index=False)
147 | 
--------------------------------------------------------------------------------
/define_SAE_model_params.py:
--------------------------------------------------------------------------------
1 | def define_SAE_model_params():
2 |     encoder_neurons_sae1 = []
3 |     code_neurons_sae1 = []
4 |     decoder_neurons_sae1 = []
5 |     output_neurons_sae1 = [20, 15, 10, 15, 10, 20]
6 | 
7 |     for _ in range(6):
8 |         encoder_neurons_sae1.append([1024, 512, 256, 128])
9 |         code_neurons_sae1.append(10)
10 |         decoder_neurons_sae1.append([128, 256, 512, 1024])
11 | 
12 |     dictionary = {
13 |         'encoder_neurons_sae': encoder_neurons_sae1,
14 |         'code_neurons_sae': code_neurons_sae1,
15 |         'decoder_neurons_sae': decoder_neurons_sae1,
16 |         'output_neurons_sae': output_neurons_sae1
17 |     }
18 | 
19 |     return dictionary
--------------------------------------------------------------------------------
/define_autoencoder.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.keras import layers
3 | import pandas as pd
4 | 
5 | def define_autoencoder(input_shape, encoder_neurons, code_neurons, decoder_neurons, output_neurons):
6 |     # Encoder
7 |     input_data = tf.keras.Input(shape=input_shape, name='input')
8 |     encoder = input_data
9 |     for neurons in encoder_neurons:
10 |         encoder = layers.Dense(neurons, activation='relu')(encoder)
11 | 
12 |     # Code layer
13 |     code = layers.Dense(code_neurons, activation='relu', name='code')(encoder)
14 | 
15 |     # Decoder
16 |     decoder = code
17 |     for neurons in decoder_neurons:
18 |         decoder = layers.Dense(neurons, activation='relu')(decoder)
19 | 
20 |     # Output layer
21 |     output = layers.Dense(output_neurons, name='output')(decoder)
22 | 
23 |     # Define the model
24 |     model = tf.keras.Model(inputs=input_data, outputs=output)
25 |     return model
--------------------------------------------------------------------------------
/extract_header_payload_packets.py:
--------------------------------------------------------------------------------
1 | """
2 | extract_header_payload_packets() extracts the header and payload bytes of every usable packet in a pcap file and writes them, together with packet metadata and the class label, to a CSV file.
3 | 
4 | The function first defines a few working variables: the default MTU, the number of processed packets to buffer before flushing them to the CSV file, and a flag that records whether the CSV file has been created yet.
5 | 
6 | It then iterates over the packets in the pcap file. For each packet that carries a payload (TLS application data, plain TCP, or UDP), the header and payload are extracted and appended as one row of a DataFrame.
7 | 
8 | If a packet is larger than the MTU, it is broken up into MTU-sized fragments and each fragment is appended to the DataFrame.
9 | 
10 | After all packets have been processed, any rows still buffered in the DataFrame are flushed to the CSV file.
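
A minimal usage sketch (the pcap path and the label value 3 are illustrative):

    packets = rdpcap('/path/to/vpn_skype_chat.pcap')
    out_dir = extract_header_payload_packets(packets, '/path/to/vpn_skype_chat.pcap', 3)

The second argument is the pcap file path (its base name is reused to name the CSV file) and the
third is the numeric class label written into the 'class_label' column; the return value is the
directory that holds the extracted CSV files.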
11 | """ 12 | from ip_masking import ip_masking 13 | 14 | def extract_header_payload_packets(packets,k,v): 15 | print('file name processing: {}'.format(k)) 16 | default_mtu = 1500 17 | # flush counter for flusshing packet to csv file 18 | number_of_pck = len(packets) 19 | flush_counter = 10000 20 | if(number_of_pck <= flush_counter): 21 | flush_counter = number_of_pck 22 | has_flushed = False 23 | # counter for checking how many valid packet has been processed 24 | total_processed_packet = 0 25 | temp_processed_packet = 0 26 | # root address for saving extracket packet data is set here 27 | extracted_packet_root_dir = '/media/mehdi/linux/data/must_be_normalized_data/' 28 | processed_file_name =b'' 29 | # create extracted data packet folder as name extracted_packet_root_dir 30 | if not os.path.exists(extracted_packet_root_dir): 31 | os.makedirs(extracted_packet_root_dir) 32 | # Find All Protocol supported by Scapy 33 | f = io.StringIO() 34 | protocol = [] # list of all Protocol 35 | with redirect_stdout(f): 36 | ls() 37 | out = f.getvalue() 38 | #print("Packet Listing:", out, sep="\n\n") 39 | protocol_list = out.split('\n') 40 | for i in range(len(protocol_list)): 41 | protocol.append(protocol_list[i].split(':')[0].replace(" ", "")) 42 | # Create DataFrame for packet 43 | df = pd.DataFrame(columns=['Source_IP', 'Dest_IP', 'Source_Port', 'Destination_Port','pckt_protocol', 44 | 'src_MAC', 'dst_MAC', 'pckt_ttl','payload','ip_header','packet_lenght','packet_data_lenght','packet_number','class_label']) 45 | 46 | pkt_count = 0 47 | # List to holds srpIPs 48 | srpIP = [] 49 | tls_ip = [field.name for field in TLS().fields_desc] 50 | print(tls_ip) 51 | cnt = 0 52 | pkt_number = 0 53 | pktlst = [] 54 | # Read each packet and appent to srpIP list 55 | for pkt in packets: 56 | pkt_number += 1 57 | 58 | if(pkt_number == 40): 59 | print("893") 60 | 61 | print("packet number :{} has been proccessed".format(pkt_number)) 62 | has_payload = False 63 | if(pkt.haslayer(Raw)): 64 | try: 65 | if (pkt.haslayer(SSLv2)): 66 | print("sslv2 layer") 67 | 68 | pck_load = TLS(pkt.load) 69 | if pck_load.haslayer('TLS'): 70 | #records = pkt['TLS'].records 71 | print("tls layer") 72 | #pck_load.show() 73 | pck_fields = [field.name for field in pck_load.fields_desc] 74 | has_payload = True 75 | 76 | except: 77 | print("Oops!", sys.exc_info()[0], "occurred.") 78 | 79 | 80 | 81 | 82 | if (pkt.haslayer(TLS) == False and has_payload == True and pck_fields.count('type') > 0 and len(pkt) >= 60): 83 | if (len(pck_load.fields.get("msg")) > 0 ): 84 | if(pck_load.msg[0].name == 'TLS Application Data'): 85 | if pkt.haslayer(Ether) and pkt.haslayer(IP) and pkt.haslayer(TCP) and pkt.haslayer(Raw): 86 | if pck_load.type == 23: 87 | # pktlst.append(cnt-1) 88 | src_mac = pkt[Ether].src 89 | dst_mac = pkt[Ether].dst 90 | pckt_ip_dest = pkt[IP].dst 91 | pckt_ip_source = pkt[IP].src 92 | pckt_ttl = pkt[IP].ttl 93 | pckt_protocol = 'TLS' 94 | pckt_dest_port = pkt[TCP].dport 95 | pckt_src_port = pkt[TCP].sport 96 | payload = binascii.hexlify(bytes(pck_load.msg[0].data)) 97 | payload_lenght = int(len(payload) / 2) 98 | packet_lenght = len(pkt) 99 | # zero padding do here 100 | #if (pkt[IP].len) < 1500: 101 | if (len(pkt[IP])) < 1500: 102 | # find tcp header 103 | p = binascii.hexlify(bytes(pkt[IP].payload)[12:13]) 104 | # convert hex to binary 105 | binary_length = "{0:08b}".format(int(p, 16)) 106 | # convert binary to decimal and normalize number 107 | decimal_lenght = int(binary_length, 2) 108 | numbe_of_tcp_header_byte = int(decimal_lenght 
/ 4) 109 | pad_len = (1500 - len(pkt[IP])) + 5 110 | pad = Padding() 111 | pad.load = '\x00' * int(pad_len) 112 | pkt = pkt / pad 113 | tcp_header = binascii.hexlify(bytes(pkt[IP].payload)[:numbe_of_tcp_header_byte]) 114 | pad_payload = binascii.hexlify(bytes(pkt[Raw].payload)) 115 | tcp_header += payload 116 | tcp_header += pad_payload 117 | payload = tcp_header 118 | ip_header = ip_masking(pkt) 119 | df.loc[len(df.index)] = [pckt_ip_source, pckt_ip_dest, pckt_src_port, pckt_dest_port, 120 | pckt_protocol, src_mac, dst_mac, pckt_ttl, payload, ip_header, 121 | packet_lenght, payload_lenght, pkt_number, v] 122 | # in this section flush dataframe data to csv file 123 | total_processed_packet += 1 124 | temp_processed_packet += 1 125 | # check if csv file has been created or not 126 | if (total_processed_packet == flush_counter): 127 | processed_file_name = extracted_packet_root_dir + os.path.basename(k) + '.' + 'csv' 128 | df.to_csv(processed_file_name,index = False) 129 | # empty df dataframe 130 | # Delete the first flush_counter rows 131 | temp_processed_packet = 0 132 | has_flushed = True 133 | df = df.drop(df.index[range(flush_counter)]) 134 | # csv file exist and must flush processed packet to it 135 | else: 136 | if (temp_processed_packet == flush_counter): 137 | # Write the new data to the CSV file in append mode 138 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 139 | temp_processed_packet = 0 140 | # empty df dataframe 141 | # Delete the first flush_counter rows 142 | df = df.drop(df.index[range(flush_counter)]) 143 | 144 | # if lenght of packet is greater that MTU w must break it up to multiple packet 145 | else: 146 | number_of_fragmnet_pkt = 0 147 | has_reminder = False 148 | # find number of byte in tcp header and find mtu 149 | # find tcp header 150 | p = binascii.hexlify(bytes(pkt[IP].payload)[12:13]) 151 | # convert hex to binary 152 | binary_length = "{0:08b}".format(int(p, 16)) 153 | # convert binary to decimal and normalize number 154 | decimal_lenght = int(binary_length, 2) 155 | numbe_of_tcp_header_byte = int(decimal_lenght / 4) 156 | mtu = default_mtu - 20 - (numbe_of_tcp_header_byte) 157 | if(int(payload_lenght % mtu) == 0): 158 | number_of_fragmnet_pkt = int(payload_lenght / mtu) 159 | else: 160 | number_of_fragmnet_pkt = int(payload_lenght/mtu) + 1 161 | has_reminder = True 162 | 163 | offset = 0 164 | payload = b'' 165 | for index in range(number_of_fragmnet_pkt): 166 | if(has_reminder == False): 167 | payload += binascii.hexlify(bytes(pck_load.msg[0].data[offset: (index + 1) * mtu])) 168 | offset += mtu 169 | else: 170 | if(index == number_of_fragmnet_pkt - 1): 171 | payload += binascii.hexlify(bytes(pck_load.msg[0].data[offset: offset + int(payload_lenght % mtu)])) 172 | else: 173 | payload += binascii.hexlify(bytes(pck_load.msg[0].data[offset: (index + 1) * mtu])) 174 | offset += mtu 175 | 176 | 177 | pad_len = (1500 - ((int(len(payload) / 2)) + 20 + numbe_of_tcp_header_byte)) 178 | pad = Padding() 179 | pad.load = '\x00' * int(pad_len) 180 | pkt = pkt / pad 181 | tcp_header = binascii.hexlify(bytes(pkt[IP].payload)[:numbe_of_tcp_header_byte]) 182 | pad_payload = binascii.hexlify(bytes(pkt[Raw].payload)) 183 | tcp_header += payload 184 | tcp_header += pad_payload 185 | payload = tcp_header 186 | ip_header = ip_masking(pkt) 187 | df.loc[len(df.index)] = [pckt_ip_source, pckt_ip_dest, pckt_src_port, 188 | pckt_dest_port, pckt_protocol, src_mac, dst_mac, pckt_ttl, payload, 189 | ip_header, packet_lenght, payload_lenght, pkt_number, v] 190 | 
payload = b'' 191 | 192 | # in this section flush dataframe data to csv file 193 | total_processed_packet += 1 194 | temp_processed_packet += 1 195 | # check if csv file has been created or not 196 | if (total_processed_packet == flush_counter): 197 | processed_file_name = extracted_packet_root_dir + os.path.basename( 198 | k) + '.' + 'csv' 199 | df.to_csv(processed_file_name,index = False) 200 | # empty df dataframe 201 | # Delete the first flush_counter rows 202 | temp_processed_packet = 0 203 | has_flushed = True 204 | df = df.drop(df.index[range(flush_counter)]) 205 | # csv file exist and must flush processed packet to it 206 | else: 207 | if (temp_processed_packet == flush_counter): 208 | # Write the new data to the CSV file in append mode 209 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 210 | temp_processed_packet = 0 211 | # empty df dataframe 212 | # Delete the first flush_counter rows 213 | df = df.drop(df.index[range(flush_counter)]) 214 | 215 | 216 | 217 | 218 | 219 | else: 220 | SSL2v_flag = False 221 | #checke if packet is server hello message 222 | if (pkt.haslayer(TLS) == False and has_payload == True and pck_fields.count('type') == 0 and pck_fields.count('msg')>0 ): 223 | if (len(pck_load.fields.get("msg")) > 0): 224 | if (pck_load.msg[0].name == 'Raw' and pck_load.name == 'SSLv2'): 225 | SSL2v_flag = True 226 | # find source and destination mac address of packet 227 | if (pkt.haslayer(Ether) and pkt.haslayer(IP) and pkt.haslayer(TCP) and SSL2v_flag == False and len(pkt) >= 60) : 228 | res_list = [i for i, value in enumerate(protocol) if (pkt.haslayer(value) == True and value != 'TCP' and 229 | value != 'IP' and value != 'Ether' and value != 'Raw' and value != 'TLS')] 230 | # check if a tls or ssl packet 231 | cnt += 1 232 | if pkt.haslayer(TLS): 233 | #print('a') 234 | extra_tls_layers = pkt[TLS] 235 | if pkt[TLS].type == 23: 236 | payload = b'' 237 | app_data_layer_count = 0 238 | has_tls_payload = True 239 | pktlst.append(cnt - 1) 240 | src_mac = pkt[Ether].src 241 | dst_mac = pkt[Ether].dst 242 | pckt_ip_dest = pkt[IP].dst 243 | pckt_ip_source = pkt[IP].src 244 | pckt_ttl = pkt[IP].ttl 245 | pckt_protocol = 'TLS' 246 | pckt_dest_port = pkt[TCP].dport 247 | pckt_src_port = pkt[TCP].sport 248 | # fetch all Application Record Layer 249 | while(has_tls_payload == True): 250 | payload += binascii.hexlify(bytes(extra_tls_layers.msg[0].data)) 251 | app_data_layer_count += 1 252 | if(len(extra_tls_layers.payload) > 0): 253 | extra_tls_layers = extra_tls_layers.payload 254 | else: 255 | has_tls_payload = False 256 | 257 | #payload = binascii.hexlify(bytes(pkt[TLS])[5:int(pkt[TLS].len) + 5]) 258 | payload_lenght = int(len(payload) / 2) 259 | packet_lenght = len(pkt) 260 | # zero padding do here 261 | #if (pkt[IP].len) < 1500: 262 | if (len(pkt[IP])) < 1500: 263 | # find tcp header 264 | p = binascii.hexlify(bytes(pkt[IP].payload)[12:13]) 265 | # convert hex to binary 266 | binary_length = "{0:08b}".format(int(p, 16)) 267 | # convert binary to decimal and normalize number 268 | decimal_lenght = int(binary_length, 2) 269 | numbe_of_tcp_header_byte = int(decimal_lenght / 4) 270 | pad_len = (1500 - len(pkt[IP])) + app_data_layer_count * 5 271 | pad = Padding() 272 | pad.load = '\x00' * int(pad_len) 273 | pkt = pkt / pad 274 | tcp_header = binascii.hexlify(bytes(pkt[IP].payload)[:numbe_of_tcp_header_byte]) 275 | #pad_payload = binascii.hexlify(bytes(pkt[TLS].payload)) 276 | pad_payload = binascii.hexlify(bytes(pad.load)) 277 | tcp_header += payload 278 | 
tcp_header += pad_payload 279 | payload = tcp_header 280 | ip_header = ip_masking(pkt) 281 | df.loc[len(df.index)] = [pckt_ip_dest, pckt_ip_source, pckt_src_port, pckt_dest_port, 282 | pckt_protocol, src_mac, dst_mac, pckt_ttl, payload, ip_header,packet_lenght,payload_lenght,pkt_number, v] 283 | # in this section flush dataframe data to csv file 284 | total_processed_packet += 1 285 | temp_processed_packet += 1 286 | # check if csv file has been created or not 287 | if(total_processed_packet == flush_counter): 288 | processed_file_name = extracted_packet_root_dir + os.path.basename(k) + '.' + 'csv' 289 | df.to_csv(processed_file_name,index = False) 290 | # empty df dataframe 291 | # Delete the first flush_counter rows 292 | temp_processed_packet = 0 293 | has_flushed = True 294 | df = df.drop(df.index[range(flush_counter)]) 295 | # csv file exist and must flush processed packet to it 296 | else: 297 | if(temp_processed_packet == flush_counter): 298 | # Write the new data to the CSV file in append mode 299 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 300 | temp_processed_packet = 0 301 | # empty df dataframe 302 | # Delete the first flush_counter rows 303 | df = df.drop(df.index[range(flush_counter)]) 304 | 305 | 306 | 307 | # if lenght of packet is greater that MTU w must break it up to multiple packet 308 | else: 309 | number_of_fragmnet_pkt = 0 310 | has_reminder = False 311 | # find number of byte in tcp header and find mtu 312 | # find tcp header 313 | p = binascii.hexlify(bytes(pkt[IP].payload)[12:13]) 314 | # convert hex to binary 315 | binary_length = "{0:08b}".format(int(p, 16)) 316 | # convert binary to decimal and normalize number 317 | decimal_lenght = int(binary_length, 2) 318 | numbe_of_tcp_header_byte = int(decimal_lenght / 4) 319 | mtu = default_mtu - 20 - (numbe_of_tcp_header_byte) 320 | if (int(payload_lenght % mtu) == 0): 321 | number_of_fragmnet_pkt = int(payload_lenght / mtu) 322 | else: 323 | number_of_fragmnet_pkt = int(payload_lenght / mtu) + 1 324 | has_reminder = True 325 | 326 | offset = 0 327 | new_payload = b'' 328 | for index in range(number_of_fragmnet_pkt): 329 | if (has_reminder == False): 330 | new_payload += payload[offset: (index + 1) * mtu] 331 | offset += mtu 332 | else: 333 | if (index == number_of_fragmnet_pkt - 1): 334 | new_payload += payload[offset: offset + int(payload_lenght % mtu)] 335 | else: 336 | new_payload += payload[offset: (index + 1) * mtu] 337 | offset += mtu 338 | 339 | pad_len = (1500 - (int(len(new_payload) / 2) + 20 + numbe_of_tcp_header_byte)) 340 | pad = Padding() 341 | pad.load = '\x00' * int(pad_len) 342 | pkt = pkt / pad 343 | tcp_header = binascii.hexlify(bytes(pkt[IP].payload)[:numbe_of_tcp_header_byte]) 344 | #pad_payload = binascii.hexlify(bytes(pkt[Raw].payload)) 345 | pad_payload = binascii.hexlify(bytes(pad.load)) 346 | tcp_header += new_payload 347 | tcp_header += pad_payload 348 | new_payload = tcp_header 349 | ip_header = ip_masking(pkt) 350 | df.loc[len(df.index)] = [pckt_ip_source, pckt_ip_dest, pckt_src_port, 351 | pckt_dest_port, pckt_protocol, src_mac, dst_mac, pckt_ttl, 352 | new_payload,ip_header, packet_lenght, payload_lenght, pkt_number, v] 353 | new_payload = b'' 354 | 355 | # in this section flush dataframe data to csv file 356 | total_processed_packet += 1 357 | temp_processed_packet += 1 358 | # check if csv file has been created or not 359 | if (total_processed_packet == flush_counter): 360 | processed_file_name = extracted_packet_root_dir + os.path.basename(k) + '.' 
+ 'csv' 361 | df.to_csv(processed_file_name,index = False) 362 | # empty df dataframe 363 | # Delete the first flush_counter rows 364 | temp_processed_packet = 0 365 | has_flushed = True 366 | df = df.drop(df.index[range(flush_counter)]) 367 | # csv file exist and must flush processed packet to it 368 | else: 369 | if (temp_processed_packet == flush_counter): 370 | # Write the new data to the CSV file in append mode 371 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 372 | temp_processed_packet = 0 373 | # empty df dataframe 374 | # Delete the first flush_counter rows 375 | df = df.drop(df.index[range(flush_counter)]) 376 | 377 | else: 378 | # check if packet has pkt.load and TLS layer. some of packet has Raw data and load data 379 | # and load data contain TLS data. so we must check these packet 380 | if (has_payload == True ): 381 | tls_payload = TLS(pkt.load) 382 | if(tls_payload.name == 'TLS'): 383 | if pkt[TLS].type == 23: 384 | payload = b'' 385 | app_data_layer_count = 0 386 | has_tls_payload = True 387 | pktlst.append(cnt - 1) 388 | src_mac = pkt[Ether].src 389 | dst_mac = pkt[Ether].dst 390 | pckt_ip_dest = pkt[IP].dst 391 | pckt_ip_source = pkt[IP].src 392 | pckt_ttl = pkt[IP].ttl 393 | pckt_protocol = 'TLS' 394 | pckt_dest_port = pkt[TCP].dport 395 | pckt_src_port = pkt[TCP].sport 396 | # fetch all Application Record Layer 397 | while (has_tls_payload == True): 398 | payload += binascii.hexlify(bytes(tls_payload.msg[0].data)) 399 | app_data_layer_count += 1 400 | if (len(tls_payload.payload) > 0): 401 | tls_payload = tls_payload.payload 402 | else: 403 | has_tls_payload = False 404 | 405 | # payload = binascii.hexlify(bytes(pkt[TLS])[5:int(pkt[TLS].len) + 5]) 406 | payload_lenght = int(len(payload) / 2) 407 | packet_lenght = len(pkt) 408 | # zero padding do here 409 | # if (pkt[IP].len) < 1500: 410 | if (len(pkt[IP])) < 1500: 411 | # find tcp header 412 | p = binascii.hexlify(bytes(pkt[IP].payload)[12:13]) 413 | # convert hex to binary 414 | binary_length = "{0:08b}".format(int(p, 16)) 415 | # convert binary to decimal and normalize number 416 | decimal_lenght = int(binary_length, 2) 417 | numbe_of_tcp_header_byte = int(decimal_lenght / 4) 418 | pad_len = (1500 - len(pkt[IP])) + app_data_layer_count * 5 419 | pad = Padding() 420 | pad.load = '\x00' * int(pad_len) 421 | pkt = pkt / pad 422 | tcp_header = binascii.hexlify(bytes(pkt[IP].payload)[:numbe_of_tcp_header_byte]) 423 | # pad_payload = binascii.hexlify(bytes(pkt[TLS].payload)) 424 | pad_payload = binascii.hexlify(bytes(pad.load)) 425 | tcp_header += payload 426 | tcp_header += pad_payload 427 | payload = tcp_header 428 | ip_header = ip_masking(pkt) 429 | df.loc[len(df.index)] = [pckt_ip_dest, pckt_ip_source, pckt_src_port, 430 | pckt_dest_port, 431 | pckt_protocol, src_mac, dst_mac, pckt_ttl, payload, 432 | ip_header, packet_lenght, payload_lenght, pkt_number, v] 433 | # in this section flush dataframe data to csv file 434 | total_processed_packet += 1 435 | temp_processed_packet += 1 436 | # check if csv file has been created or not 437 | if (total_processed_packet == flush_counter): 438 | processed_file_name = extracted_packet_root_dir + os.path.basename( 439 | k) + '.' 
+ 'csv' 440 | df.to_csv(processed_file_name,index = False) 441 | # empty df dataframe 442 | # Delete the first flush_counter rows 443 | temp_processed_packet = 0 444 | has_flushed = True 445 | df = df.drop(df.index[range(flush_counter)]) 446 | # csv file exist and must flush processed packet to it 447 | else: 448 | if (temp_processed_packet == flush_counter): 449 | # Write the new data to the CSV file in append mode 450 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 451 | temp_processed_packet = 0 452 | # empty df dataframe 453 | # Delete the first flush_counter rows 454 | df = df.drop(df.index[range(flush_counter)]) 455 | 456 | # if lenght of packet is greater that MTU w must break it up to multiple packet 457 | else: 458 | number_of_fragmnet_pkt = 0 459 | has_reminder = False 460 | # find number of byte in tcp header and find mtu 461 | # find tcp header 462 | p = binascii.hexlify(bytes(pkt[IP].payload)[12:13]) 463 | # convert hex to binary 464 | binary_length = "{0:08b}".format(int(p, 16)) 465 | # convert binary to decimal and normalize number 466 | decimal_lenght = int(binary_length, 2) 467 | numbe_of_tcp_header_byte = int(decimal_lenght / 4) 468 | mtu = default_mtu - 20 - (numbe_of_tcp_header_byte) 469 | if (int(payload_lenght % mtu) == 0): 470 | number_of_fragmnet_pkt = int(payload_lenght / mtu) 471 | else: 472 | number_of_fragmnet_pkt = int(payload_lenght / mtu) + 1 473 | has_reminder = True 474 | 475 | offset = 0 476 | new_payload = b'' 477 | for index in range(number_of_fragmnet_pkt): 478 | if (has_reminder == False): 479 | new_payload += payload[offset: (index + 1) * mtu] 480 | offset += mtu 481 | else: 482 | if (index == number_of_fragmnet_pkt - 1): 483 | new_payload += payload[offset: offset + int(payload_lenght % mtu)] 484 | else: 485 | new_payload += payload[offset: (index + 1) * mtu] 486 | offset += mtu 487 | 488 | 489 | pad_len = (1500 - (int(len(new_payload) / 2) + 20 + numbe_of_tcp_header_byte)) 490 | # pad_len = (1500 - int(len(new_payload)/2)) 491 | pad = Padding() 492 | pad.load = '\x00' * int(pad_len) 493 | pkt = pkt / pad 494 | tcp_header = binascii.hexlify(bytes(pkt[IP].payload)[:numbe_of_tcp_header_byte]) 495 | # pad_payload = binascii.hexlify(bytes(pkt[Raw].payload)) 496 | pad_payload = binascii.hexlify(bytes(pad.payload)) 497 | 498 | tcp_header += new_payload 499 | tcp_header += pad_payload 500 | new_payload = tcp_header 501 | ip_header = ip_masking(pkt) 502 | df.loc[len(df.index)] = [pckt_ip_source, pckt_ip_dest, pckt_src_port, 503 | pckt_dest_port, pckt_protocol, src_mac, dst_mac, 504 | pckt_ttl, 505 | new_payload, ip_header, packet_lenght, payload_lenght, 506 | pkt_number, v] 507 | new_payload = b'' 508 | 509 | # in this section flush dataframe data to csv file 510 | total_processed_packet += 1 511 | temp_processed_packet += 1 512 | # check if csv file has been created or not 513 | if (total_processed_packet == flush_counter): 514 | processed_file_name = extracted_packet_root_dir + os.path.basename( 515 | k) + '.' 
+ 'csv' 516 | df.to_csv(processed_file_name,index = False) 517 | # empty df dataframe 518 | # Delete the first flush_counter rows 519 | temp_processed_packet = 0 520 | has_flushed = True 521 | df = df.drop(df.index[range(flush_counter)]) 522 | # csv file exist and must flush processed packet to it 523 | else: 524 | if (temp_processed_packet == flush_counter): 525 | # Write the new data to the CSV file in append mode 526 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 527 | temp_processed_packet = 0 528 | # empty df dataframe 529 | # Delete the first flush_counter rows 530 | df = df.drop(df.index[range(flush_counter)]) 531 | 532 | else: 533 | # if packet is a tcp packet or not 534 | if pkt[IP].proto == 6 and len(res_list) == 0: 535 | if pkt.haslayer(Raw): 536 | src_mac = pkt[Ether].src 537 | dst_mac = pkt[Ether].dst 538 | pckt_ip_dest = pkt[IP].dst 539 | pckt_ip_source = pkt[IP].src 540 | pckt_ttl = pkt[IP].ttl 541 | pckt_protocol = 'TCP' 542 | pckt_dest_port = pkt[TCP].dport 543 | pckt_src_port = pkt[TCP].sport 544 | payload = binascii.hexlify(bytes((pkt[Raw]))) 545 | payload_lenght = int(len(payload) / 2) 546 | packet_lenght = len(pkt) 547 | # zero padding do here 548 | # if (pkt[IP].len) < 1500: 549 | if (len(pkt[IP])) < 1500: 550 | # find tcp header 551 | p = binascii.hexlify(bytes(pkt[IP].payload)[12:13]) 552 | # convert hex to binary 553 | binary_length = "{0:08b}".format(int(p, 16)) 554 | # convert binary to decimal and normalize number 555 | decimal_lenght = int(binary_length, 2) 556 | numbe_of_tcp_header_byte = int(decimal_lenght / 4) 557 | pad_len = (1500 - len(pkt[IP])) 558 | pad = Padding() 559 | pad.load = '\x00' * int(pad_len) 560 | pkt = pkt / pad 561 | tcp_header = binascii.hexlify(bytes(pkt[IP].payload)[:numbe_of_tcp_header_byte]) 562 | pad_payload = binascii.hexlify(bytes(pkt[Raw])) 563 | tcp_header += pad_payload 564 | payload = tcp_header 565 | ip_header = ip_masking(pkt) 566 | df.loc[len(df.index)] = [pckt_ip_source, pckt_ip_dest, pckt_src_port, 567 | pckt_dest_port, 568 | pckt_protocol, src_mac, dst_mac, pckt_ttl, 569 | payload, ip_header, packet_lenght, payload_lenght, 570 | pkt_number, v] 571 | # in this section flush dataframe data to csv file 572 | total_processed_packet += 1 573 | temp_processed_packet += 1 574 | # check if csv file has been created or not 575 | if (total_processed_packet == flush_counter): 576 | processed_file_name = extracted_packet_root_dir + os.path.basename( 577 | k) + '.' 
+ 'csv' 578 | df.to_csv(processed_file_name,index = False) 579 | # empty df dataframe 580 | # Delete the first flush_counter rows 581 | temp_processed_packet = 0 582 | has_flushed = True 583 | df = df.drop(df.index[range(flush_counter)]) 584 | # csv file exist and must flush processed packet to it 585 | else: 586 | if (temp_processed_packet == flush_counter): 587 | # Write the new data to the CSV file in append mode 588 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 589 | temp_processed_packet = 0 590 | # empty df dataframe 591 | # Delete the first flush_counter rows 592 | df = df.drop(df.index[range(flush_counter)]) 593 | 594 | 595 | # if lenght of packet is greater that MTU w must break it up to multiple packet 596 | else: 597 | number_of_fragmnet_pkt = 0 598 | has_reminder = False 599 | # find number of byte in tcp header and find mtu 600 | # find tcp header 601 | p = binascii.hexlify(bytes(pkt[IP].payload)[12:13]) 602 | # convert hex to binary 603 | binary_length = "{0:08b}".format(int(p, 16)) 604 | # convert binary to decimal and normalize number 605 | decimal_lenght = int(binary_length, 2) 606 | numbe_of_tcp_header_byte = int(decimal_lenght / 4) 607 | mtu = default_mtu - 20 - (numbe_of_tcp_header_byte) 608 | if (int(payload_lenght % mtu) == 0): 609 | number_of_fragmnet_pkt = int(payload_lenght / mtu) 610 | else: 611 | number_of_fragmnet_pkt = int(payload_lenght / mtu) + 1 612 | has_reminder = True 613 | 614 | offset = 0 615 | new_payload = b'' 616 | for index in range(number_of_fragmnet_pkt): 617 | if (has_reminder == False): 618 | new_payload += payload[offset: (index + 1) * mtu] 619 | offset += mtu 620 | else: 621 | if (index == number_of_fragmnet_pkt - 1): 622 | new_payload += payload[offset: offset + int(payload_lenght % mtu)] 623 | else: 624 | new_payload += payload[offset: (index + 1) * mtu] 625 | offset += mtu 626 | 627 | 628 | pad_len = (1500 - (int(len(new_payload) / 2) + 20 + numbe_of_tcp_header_byte)) 629 | # pad_len = (1500 - int(len(new_payload)/2)) 630 | pad = Padding() 631 | pad.load = '\x00' * int(pad_len) 632 | pkt = pkt / pad 633 | tcp_header = binascii.hexlify(bytes(pkt[IP].payload)[:numbe_of_tcp_header_byte]) 634 | # pad_payload = binascii.hexlify(bytes(pkt[Raw].payload)) 635 | pad_payload = binascii.hexlify(bytes(pad.payload)) 636 | 637 | tcp_header += new_payload 638 | tcp_header += pad_payload 639 | new_payload = tcp_header 640 | ip_header = ip_masking(pkt) 641 | df.loc[len(df.index)] = [pckt_ip_source, pckt_ip_dest, pckt_src_port, 642 | pckt_dest_port, pckt_protocol, src_mac, dst_mac, 643 | pckt_ttl, 644 | new_payload, ip_header, packet_lenght, payload_lenght, 645 | pkt_number, v] 646 | new_payload = b'' 647 | 648 | # in this section flush dataframe data to csv file 649 | total_processed_packet += 1 650 | temp_processed_packet += 1 651 | # check if csv file has been created or not 652 | if (total_processed_packet == flush_counter): 653 | processed_file_name = extracted_packet_root_dir + os.path.basename( 654 | k) + '.' 
+ 'csv' 655 | df.to_csv(processed_file_name,index = False) 656 | # empty df dataframe 657 | # Delete the first flush_counter rows 658 | temp_processed_packet = 0 659 | has_flushed = True 660 | df = df.drop(df.index[range(flush_counter)]) 661 | # csv file exist and must flush processed packet to it 662 | else: 663 | if (temp_processed_packet == flush_counter): 664 | # Write the new data to the CSV file in append mode 665 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 666 | temp_processed_packet = 0 667 | # empty df dataframe 668 | # Delete the first flush_counter rows 669 | df = df.drop(df.index[range(flush_counter)]) 670 | 671 | 672 | 673 | else: 674 | if (pkt.haslayer(Ether) and pkt.haslayer(IP) and pkt.haslayer(UDP) and len(pkt) >= 60): 675 | res_list = [i for i, value in enumerate(protocol) if 676 | (pkt.haslayer(value) == True and value != 'UDP' and 677 | value != 'IP' and value != 'Ether' and value != 'Raw')] 678 | if pkt[IP].proto == 17 and len(res_list) == 0: 679 | if pkt.haslayer(Raw): 680 | src_mac = pkt[Ether].src 681 | dst_mac = pkt[Ether].dst 682 | pckt_ip_dest = pkt[IP].dst 683 | pckt_ip_source = pkt[IP].src 684 | pckt_ttl = pkt[IP].ttl 685 | pckt_protocol = 'UDP' 686 | pckt_dest_port = pkt[UDP].dport 687 | pckt_src_port = pkt[UDP].sport 688 | payload = binascii.hexlify(bytes((pkt[Raw]))) 689 | payload_lenght = int(len(payload) / 2) 690 | packet_lenght = len(pkt) 691 | # zero padding do here 692 | #if (pkt[IP].len) < 1500: 693 | if (len(pkt[IP])) < 1500: 694 | pad_len = (1500 - len(pkt[IP])) 695 | pad = Padding() 696 | pad.load = '\x00' * int(pad_len) 697 | pkt = pkt / pad 698 | udp_header = binascii.hexlify(bytes(pkt[IP].payload)[:8]) 699 | pad_payload = binascii.hexlify(bytes(pkt[Raw])) 700 | udp_header += pad_payload 701 | payload = udp_header 702 | ip_header = ip_masking(pkt) 703 | df.loc[len(df.index)] = [pckt_ip_source, pckt_ip_dest, pckt_src_port, pckt_dest_port, 704 | pckt_protocol, src_mac, dst_mac, pckt_ttl, payload, ip_header,packet_lenght,payload_lenght,pkt_number, v] 705 | 706 | # in this section flush dataframe data to csv file 707 | total_processed_packet += 1 708 | temp_processed_packet += 1 709 | # check if csv file has been created or not 710 | if (total_processed_packet == flush_counter): 711 | processed_file_name = extracted_packet_root_dir + os.path.basename(k) + '.' + 'csv' 712 | df.to_csv(processed_file_name,index = False) 713 | # empty df dataframe 714 | # Delete the first flush_counter rows 715 | temp_processed_packet = 0 716 | has_flushed = True 717 | df = df.drop(df.index[range(flush_counter)]) 718 | # csv file exist and must flush processed packet to it 719 | else: 720 | if (temp_processed_packet == flush_counter): 721 | # Write the new data to the CSV file in append mode 722 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 723 | temp_processed_packet = 0 724 | # empty df dataframe 725 | # Delete the first flush_counter rows 726 | df = df.drop(df.index[range(flush_counter)]) 727 | 728 | else: 729 | pass 730 | 731 | print("finished") 732 | if(temp_processed_packet > 0): 733 | if(has_flushed == True): 734 | # Write the new data to the CSV file in append mode 735 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 736 | else: 737 | processed_file_name = extracted_packet_root_dir + os.path.basename(k) + '.' 
+ 'csv' 738 | df.to_csv(processed_file_name) 739 | return extracted_packet_root_dir -------------------------------------------------------------------------------- /fc_build_model.py: -------------------------------------------------------------------------------- 1 | def fc_build_model(params): 2 | 3 | model = Sequential() 4 | 5 | # First layer 6 | model.add(Dense(params['first_layer'], input_shape=(params['input_shape'],))) 7 | model.add(Dropout(params['dropout_rate'])) 8 | model.add(BatchNormalization()) 9 | 10 | # Second layer 11 | model.add(Dense(params['second_layer'], activation='relu')) 12 | model.add(Dropout(params['dropout_rate'])) 13 | model.add(BatchNormalization()) 14 | 15 | # Third layer 16 | model.add(Dense(params['third_layer'], activation='relu')) 17 | model.add(Dropout(params['dropout_rate'])) 18 | model.add(BatchNormalization()) 19 | 20 | # Fourth layer 21 | model.add(Dense(params['fourth_layer'], activation='relu')) 22 | model.add(Dropout(params['dropout_rate'])) 23 | model.add(BatchNormalization()) 24 | 25 | # Last layer 26 | if params['num_classes'] == 12: 27 | output_units = 12 28 | elif params['num_classes'] == 17: 29 | output_units = 17 30 | else: 31 | raise ValueError("Invalid number of classes!") 32 | 33 | model.add(Dense(output_units, activation='softmax')) 34 | # Compile the model 35 | #model.compile(optimizer=params['optimizer'], loss=params['loss_function']) 36 | 37 | return model -------------------------------------------------------------------------------- /gausian-compare-accuracy-code.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from scipy.signal import savgol_filter 5 | from google.colab import files 6 | 7 | # Upload Excel file 8 | uploaded = files.upload() 9 | 10 | # Read Excel file into a DataFrame 11 | df = pd.read_excel(next(iter(uploaded))) 12 | 13 | # Define line styles and symbols for each column 14 | line_styles = ['-', '--', ':'] 15 | #symbols = ['s', 'o', '^'] 16 | columns = ['CBS', 'CSCNN-[79]', 'Datanet-[19]'] 17 | 18 | # Plotting 19 | plt.figure(figsize=(10, 6)) 20 | 21 | for i, column in enumerate(columns): 22 | x = df['Epoch'] 23 | y = df[column] 24 | 25 | # Apply smoothing filter 26 | #y_smooth = savgol_filter(y, window_length=5, polyorder=2) 27 | # Apply Gaussian smoothing filter 28 | y_smooth = gaussian_filter1d(y, sigma=2) 29 | 30 | # Plot line with specific style and symbol 31 | plt.plot(x, y_smooth, linestyle=line_styles[i],linewidth=3,label=column) 32 | 33 | # Add legend and labels 34 | plt.legend() 35 | plt.xlabel('Epoch') 36 | plt.ylabel('Accuracy [%]') 37 | # Set x-axis spacing to 2 38 | #plt.xticks(np.arange(min(x), max(x)+1, 2)) 39 | # Set x-axis spacing to 2 40 | plt.xticks(np.arange(min(x), 21, 1)) 41 | #plt.title('Line Plot') 42 | # Save the plot as SVG 43 | output_file = input("Please provide the output file name (e.g., plot.svg): ") 44 | plt.savefig(output_file, format='svg') 45 | print("Line plot saved as SVG.") 46 | 47 | # Download the saved SVG file 48 | files.download(output_file) 49 | 50 | # Show the plot 51 | plt.show() -------------------------------------------------------------------------------- /gausian-validation-train-acc.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from scipy.signal import savgol_filter 5 | from google.colab import files 6 | 7 | # Upload 
Excel file 8 | uploaded = files.upload() 9 | 10 | # Read Excel file into a DataFrame 11 | df = pd.read_excel(next(iter(uploaded))) 12 | 13 | # Define line styles and symbols for each column 14 | line_styles = ['-', '--'] 15 | #symbols = ['s', 'o', '^'] 16 | columns = ['Val acc', 'Train acc'] 17 | 18 | # Plotting 19 | plt.figure(figsize=(10, 6)) 20 | 21 | for i, column in enumerate(columns): 22 | x = df['Epoch'] 23 | y = df[column] 24 | 25 | # Apply smoothing filter 26 | #y_smooth = savgol_filter(y, window_length=5, polyorder=2) 27 | # Apply Gaussian smoothing filter 28 | y_smooth = gaussian_filter1d(y, sigma=2) 29 | 30 | # Plot line with specific style and symbol 31 | plt.plot(x, y_smooth, linestyle=line_styles[i],linewidth=3,label=column) 32 | 33 | # Add legend and labels 34 | plt.legend() 35 | plt.xlabel('Epoch') 36 | plt.ylabel('Accuracy [%]') 37 | # Set x-axis spacing to 2 38 | #plt.xticks(np.arange(min(x), max(x)+1, 2)) 39 | # Set x-axis spacing to 2 40 | plt.xticks(np.arange(min(x), 21, 1)) 41 | #plt.title('Line Plot') 42 | # Save the plot as SVG 43 | output_file = input("Please provide the output file name (e.g., plot.svg): ") 44 | plt.savefig(output_file, format='svg') 45 | print("Line plot saved as SVG.") 46 | 47 | # Download the saved SVG file 48 | files.download(output_file) 49 | 50 | # Show the plot 51 | plt.show() -------------------------------------------------------------------------------- /gausian-validation-training-loss.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from scipy.signal import savgol_filter 5 | from google.colab import files 6 | 7 | # Upload Excel file 8 | uploaded = files.upload() 9 | 10 | # Read Excel file into a DataFrame 11 | df = pd.read_excel(next(iter(uploaded))) 12 | 13 | # Define line styles and symbols for each column 14 | line_styles = ['-', '--'] 15 | #symbols = ['s', 'o', '^'] 16 | columns = ['Val Loss', 'Train Loss'] 17 | 18 | # Plotting 19 | plt.figure(figsize=(10, 6)) 20 | 21 | for i, column in enumerate(columns): 22 | x = df['Epoch'] 23 | y = df[column] 24 | 25 | # Apply smoothing filter 26 | #y_smooth = savgol_filter(y, window_length=5, polyorder=2) 27 | # Apply Gaussian smoothing filter 28 | y_smooth = gaussian_filter1d(y, sigma=2) 29 | 30 | # Plot line with specific style and symbol 31 | plt.plot(x, y_smooth, linestyle=line_styles[i],linewidth=3,label=column) 32 | 33 | # Add legend and labels 34 | plt.legend() 35 | plt.xlabel('Epoch') 36 | plt.ylabel('Loss [%]') 37 | # Set x-axis spacing to 2 38 | #plt.xticks(np.arange(min(x), max(x)+1, 2)) 39 | # Set x-axis spacing to 2 40 | plt.xticks(np.arange(min(x), 21, 1)) 41 | #plt.title('Line Plot') 42 | # Save the plot as SVG 43 | output_file = input("Please provide the output file name (e.g., plot.svg): ") 44 | plt.savefig(output_file, format='svg') 45 | print("Line plot saved as SVG.") 46 | 47 | # Download the saved SVG file 48 | files.download(output_file) 49 | 50 | # Show the plot 51 | plt.show() -------------------------------------------------------------------------------- /histogram_Dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a function called histogram_Dataset(). This function plots a histogram of the packet lengths in a set of pcap files. 3 | 4 | The function first defines two lists, pkt_numebr_list and pkt_length_list. 
The pkt_numebr_list stores the packet numbers in the pcap files. The pkt_length_list stores the lengths of the packets in the pcap files. 5 | 6 | The function then iterates over the pcap files in the files list. For each pcap file, the function reads the packets in the file and adds the packet numbers and lengths to the pkt_numebr_list and pkt_length_list lists, respectively. 7 | 8 | The function then calculates the probability mass function (PMF) of the packet lengths. The PMF is a function that gives the probability of a packet having a particular length. 9 | 10 | The function then plots a histogram of the PMF. The histogram is a bar chart that shows the number of packets with each length. 11 | 12 | The function finally returns the plot. 13 | 14 | The first few lines define the pkt_numebr_list and pkt_length_list lists. 15 | 16 | The next few lines iterate over the pcap files in the files list. For each pcap file, the function reads the packets in the file and adds the packet numbers and lengths to the pkt_numebr_list and pkt_length_list lists, respectively. 17 | 18 | The next few lines calculate the PMF of the packet lengths. 19 | 20 | The next few lines plot the histogram of the PMF. 21 | 22 | The final line returns the plot. 23 | """ 24 | 25 | 26 | def histogram_Dataset(files,path): 27 | 28 | pkt_numebr_list = [] 29 | pkt_length_list =[] 30 | n_200 = 0 31 | n_400 = 0 32 | n_600 = 0 33 | n_800 = 0 34 | n_1000 = 0 35 | n_1200 = 0 36 | n_1500 = 0 37 | bigger_1500 = 0 38 | for f in files: 39 | print('befor reading time is :{}'.format(datetime.now().time())) 40 | packets = rdpcap(os.path.join(path, f)) 41 | print('after reading time is :{}'.format(datetime.now().time())) 42 | for i in range(len(packets)): 43 | pkt_numebr_list.append(i) 44 | pkt_length_list.append(len(packets[i])) 45 | # in this section we calculate PMF of packet lenght 46 | packet_lenght_data = dict((x, pkt_length_list.count(x)) for x in set(pkt_length_list)) 47 | key_max = max(packet_lenght_data.keys(), key=(lambda k: packet_lenght_data[k])) 48 | key_min = min(packet_lenght_data.keys(), key=(lambda k: packet_lenght_data[k])) 49 | pkt_lenght = list(packet_lenght_data.keys()) 50 | values = list(packet_lenght_data.values()) 51 | total_packets = sum(values) 52 | 53 | # naming the x-axis 54 | plt.xlabel('Packet Lenght') 55 | # naming the y-axis 56 | plt.ylabel('PMF') 57 | # plot title 58 | plt.title('Packet Lenght Distribution Map') 59 | 60 | for i in range(len(pkt_lenght)): 61 | if(pkt_lenght[i]>=0 and pkt_lenght[i]<=200): 62 | n_200 = n_200 + values[i] 63 | elif(pkt_lenght[i]>200 and pkt_lenght[i]<=400): 64 | n_400 = n_400 + values[i] 65 | elif(pkt_lenght[i]>400 and pkt_lenght[i]<=600): 66 | n_600 = n_600 + values[i] 67 | elif(pkt_lenght[i]>600 and pkt_lenght[i]<=800): 68 | n_800 = n_800 + values[i] 69 | elif(pkt_lenght[i]>800 and pkt_lenght[i]<=1000): 70 | n_1000 = n_1000 + values[i] 71 | elif(pkt_lenght[i]>1000 and pkt_lenght[i]<=1200): 72 | n_1200 = n_1200 + values[i] 73 | elif(pkt_lenght[i]>1200 and pkt_lenght[i]<=1500): 74 | n_1500 = n_1500 + values[i] 75 | else: 76 | bigger_1500 = bigger_1500 + values[i] 77 | # setting ticks for x-axis 78 | x_tickes = [n_200,n_400,n_600,n_800,n_1000,n_1200,n_1500,bigger_1500] 79 | x_ticklabels = ['0-200', '200-400', '400-600', '600-800','800-1000' 80 | ,'1000-1200','1200-1500','1500-bigger'] 81 | y_tickes = [round((x/total_packets)) for x in x_tickes ] 82 | y_tickLabels = [0,15,30,45,60,75,90,100] 83 | 84 | plt.bar(x_ticklabels, y_tickes, color ='maroon',width =0.4) 85 | 
plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1)) 86 | 87 | plt.show() 88 | 89 | return -------------------------------------------------------------------------------- /ip_masking.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a function called ip_masking(). This function takes a packet as input and masks the source and destination IP addresses of the packet. 3 | 4 | The function first gets the IP header of the packet. The IP header is a 20-byte header that contains information about the source and destination IP addresses of the packet, as well as other information. 5 | 6 | The function then splits the source and destination IP addresses into their individual octets. An octet is a group of 8 bits, which is equivalent to one byte. 7 | 8 | The function then randomly selects one of the octets from the source and destination IP addresses. It then sets the value of this octet to 0. 9 | 10 | The function then updates the source and destination IP addresses of the packet with the masked octets. 11 | 12 | Finally, the function returns the masked IP header of the packet. 13 | """ 14 | 15 | def ip_masking(packet): 16 | #hex_packet = binascii.hexlify(bytes(packet)) 17 | ip_header_hex_packet = binascii.hexlify(bytes(packet[Ether].payload))[:(packet[IP].ihl*4)*2] 18 | # ip masking 19 | src_addr = str(packet[IP].src) 20 | dst_addr = str(packet[IP].dst) 21 | src = src_addr.split('.') 22 | dst = dst_addr.split('.') 23 | src_index = src.index(random.choice(src)) 24 | dst_index = dst.index(random.choice(dst)) 25 | src[src_index] = '0' 26 | dst[dst_index] = '0' 27 | msk_scr = '.'.join(src) 28 | msk_dst = '.'.join(dst) 29 | packet[IP].src = msk_scr 30 | packet[IP].dst = msk_dst 31 | msk_ip_header_hex_packet = binascii.hexlify(bytes(packet[Ether].payload))[:(packet[IP].ihl*4)*2] 32 | return msk_ip_header_hex_packet -------------------------------------------------------------------------------- /load_pcap_datatype.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a function called load_pcap_datatype(). This function loads the packets from a set of pcap files, extracts the header and payload information, and saves it to a CSV file. 3 | 4 | The function first defines a variable called root_must_normalized_dir that stores the directory where the normalized pcap files will be saved. 5 | 6 | The function then defines a variable called normalized_files_name that stores the names of the normalized pcap files. 7 | 8 | The function then defines a variable called chunk_size that specifies the size of the chunks that the pcap files will be broken into. 9 | 10 | The function then loads the TLS and SSL layers. 11 | 12 | The function then iterates over the files in the file_name_dict dictionary. For each file, the function breaks the file into chunks and saves the chunks to the root_must_normalized_dir directory. 13 | 14 | The function then extracts the header and payload information from the packets in the normalized_files_name list and saves it to a CSV file. 15 | 16 | The function finally returns the normalized packets. 
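
A minimal usage sketch (the path and label are illustrative):

    extracted_dir = load_pcap_datatype({'/media/mehdi/linux/data/vpn_skype_chat.pcap': 3})

Each key is a pcap file path and each value is the numeric class label for that capture. Captures
larger than about 20 MB are first split into roughly 10 MB chunks, each chunk is read with
rdpcap(), and the packets are handed to extract_header_payload_packets(), which writes the
extracted CSV files into the directory returned here.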
17 | 18 | """ 19 | from Break_Data_File import Break_Data_File 20 | 21 | def load_pcap_datatype(file_name_dict): 22 | 23 | 24 | normalized_files_name = [] 25 | # must be deleted 26 | test_n_filename = [] 27 | chunk_size = 40000 28 | load_layer("tls") 29 | load_layer("ssl") 30 | 31 | 32 | 33 | for k,v in file_name_dict.items(): 34 | chunk_size_file = 10 35 | # get file size in MB 36 | size = get_file_size(k, SIZE_UNIT.BYTES) 37 | print('Size of file is : ', size, 'Byte') 38 | # breake file based on pcap size file 39 | if(size > 2*chunk_size_file*1024*1024 ): 40 | Break_Data_File(k,chunk_size_file) 41 | 42 | working_directory = os.path.splitext(os.path.basename(k))[0] 43 | directory = os.path.splitext(os.path.basename(k))[0] 44 | # Parent Directory path 45 | parent_dir = os.path.dirname(k) 46 | # Path 47 | path = os.path.join(parent_dir, directory) 48 | files = os.listdir(path) 49 | # in this section we show histogram of packet length 50 | histogram_Dataset(files,path) 51 | for f in files: 52 | print('befor reading time is :{}'.format(datetime.now().time())) 53 | packets = rdpcap(os.path.join(path,f)) 54 | print('after reading time is :{}'.format(datetime.now().time())) 55 | print("file:{} has been read".format((os.path.join(path,f)))) 56 | extracted_packet_root_dir = extract_header_payload_packets(packets,os.path.join(path,f) , v) 57 | return extracted_packet_root_dir 58 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # This is a sample Python script. 2 | 3 | # Press Shift+F10 to execute it or replace it with your code. 4 | # Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings. 5 | from __future__ import absolute_import, division, print_function, unicode_literals 6 | 7 | import base64 8 | import binascii 9 | import enum 10 | from typing import List, Any, Union 11 | 12 | import numpy as np 13 | import pandas as pd 14 | import tensorflow as tf 15 | import tensorflow_datasets as tfds 16 | import matplotlib.pyplot as plt 17 | import seaborn as sns 18 | import matplotlib.ticker as mtick 19 | from matplotlib.ticker import AutoMinorLocator 20 | import plotly 21 | from docutils.nodes import date 22 | from pycodestyle import BaseReport 23 | from pytest import collect 24 | import os 25 | import csv 26 | import glob 27 | import time 28 | import datetime 29 | 30 | from scapy.layers.dns import DNS 31 | from scapy.layers.inet import IP, TCP, UDP 32 | from scapy.layers.l2 import Ether 33 | from scapy.layers.tls.handshake import TLSClientHello, TLSServerHello, TLSCertificateVerify 34 | from scapy.layers.tls.handshake_sslv2 import SSLv2ClientHello, SSLv2ServerHello 35 | from scapy.layers.tls.record import TLS 36 | from scapy.layers.tls.record import * 37 | from scapy.layers.tls.record_sslv2 import SSLv2 38 | from scapy.layers.tls.record_sslv2 import * 39 | 40 | 41 | #from scapy.layers.tls.record_sslv2 import SSLv3 42 | from sphinx.testing.path import path 43 | from tensorflow import keras as ks 44 | #from tensorflow.keras.layers import la 45 | #from keras import layers 46 | #from keras import models 47 | 48 | #new import for test 49 | from tensorflow.keras import layers 50 | from tensorflow.keras import models 51 | #new import for test 52 | 53 | from scipy.io import arff 54 | from scapy.all import * 55 | from scapy.utils import RawPcapReader 56 | from sklearn.model_selection import train_test_split 57 | from collections import Counter 
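# NOTE (illustrative sketch): several project modules have '-' or spaces in their file names
# (e.g. "define_Bi-LSTM_model_params.py", "memory usage-execution time.py"), so the plain
# "from ... import ..." statements a few lines below are not valid import syntax. One way to
# load such a file from its path is importlib; the helper name here is an assumption.
import importlib.util

def load_module_from_file(py_file):
    # build a module object directly from the .py file path instead of an import statement
    module_name = os.path.splitext(os.path.basename(py_file))[0].replace('-', '_').replace(' ', '_')
    spec = importlib.util.spec_from_file_location(module_name, py_file)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module

# e.g. measure_execution_memory = load_module_from_file("memory usage-execution time.py").measure_execution_memory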
58 | from contextlib import redirect_stdout 59 | import b_colors 60 | 61 | import pktDirection 62 | 63 | from define_FC_model_params import define_FC_model_params 64 | from define_SAE_model_params import define_SAE_model_params 65 | from define_BiLSTM_model_params import define_BiLSTM_model_params # module assumed renamed from define_Bi-LSTM_model_params.py (hyphens are not valid in Python module or function names) 66 | from network_parameters_initializer import network_parameters_initializer 67 | from load_pcap_datatype import load_pcap_datatype 68 | from read_pcap_files import read_pcap_files 69 | from packet_normalization import packet_normalization 70 | from define_GAN_model import process_csv_files 71 | from preprocessing_traffic_label import read_app_pcap_files # module assumed renamed from preprocessing-traffic-label.py; an import takes no call parentheses 72 | # the remaining entry points used below are imported here; their source files are likewise assumed renamed to valid module names 73 | from defin_1D_CNN_model_params import defin_1D_CNN_model_params; from cnn_Traffic_classification import cnn_Traffic_classification 74 | from BiLSTM_Traffic_classification import BiLSTM_Traffic_classification; from SAE_Traffic_classification import SAE_Traffic_classification; from FC_traffic_classification import FC_traffic_classification 75 | # Enum for size units 76 | class SIZE_UNIT(enum.Enum): 77 | BYTES = 1 78 | KB = 2 79 | MB = 3 80 | GB = 4 81 | def convert_unit(size_in_bytes, unit): 82 | """ Convert the size from bytes to other units like KB, MB or GB""" 83 | if unit == SIZE_UNIT.KB: 84 | return round(size_in_bytes/1024,3) 85 | elif unit == SIZE_UNIT.MB: 86 | return size_in_bytes/(1024*1024) 87 | elif unit == SIZE_UNIT.GB: 88 | return round(size_in_bytes/(1024*1024*1024),3) 89 | else: 90 | return size_in_bytes 91 | 92 | def get_file_size(file_name, size_type = SIZE_UNIT.BYTES ): 93 | """ Get the file size in the given unit (BYTES, KB, MB or GB)""" 94 | size = os.path.getsize(file_name) 95 | return convert_unit(size, size_type) 96 | 97 | def print_hi(name): 98 | # Use a breakpoint in the code line below to debug your script. 99 | print(f'Hi, {name}') # Press Ctrl+F8 to toggle the breakpoint. 100 | 101 | 102 | # Press the green button in the gutter to run the script. 103 | if __name__ == '__main__': 104 | print_hi('PyCharm') 105 | 106 | # See PyCharm help at https://www.jetbrains.com/help/pycharm/ 107 | 108 | 109 | "===============================version of tensorflow and keras=======================" 110 | 111 | print("TensorFlow version: {}".format(tf.__version__)) 112 | print("Eager execution is: {}".format(tf.executing_eagerly())) 113 | print("Keras version: {}".format(tf.keras.__version__)) 114 | 115 | "====================test on cpu or gpu =======================" 116 | if tf.test.is_gpu_available(): 117 | print('Running on GPU') 118 | print('GPU #0?') 119 | 120 | else: 121 | print('Running on CPU') 122 | print(b_colors.bcolors.warning("This is dangerous")) 123 | my_macs = [get_if_hwaddr(i) for i in get_if_list()] 124 | print('mac address:{}'.format(str(my_macs))) 125 | print(Ether().src) 126 | 127 | 128 | 129 | root_normalized_dir = 'media/mehdi/linux/normalized_data/' 130 | root_app_normalized_dir = 'media/mehdi/linux/normalized_app_data/' 131 | sae_extracted_feature_file = 'mehdi/linux' # renamed: hyphens are not valid in a Python variable name 132 | # read the PCAP files; the class label is extracted from each file name
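# read_pcap_files() assigns traffic-type labels (chat, email, audio/video streaming, FTP, P2P, Tor) from the file names; read_app_pcap_files() assigns per-application labels (Skype, Netflix, Gmail, ...).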
133 | extracted_packet_root_dir = read_pcap_files() 134 | extracted_app_root_dir = read_app_pcap_files() 135 | files = [] 136 | 137 | for file in os.listdir(extracted_packet_root_dir): 138 | if os.path.isfile(os.path.join(extracted_packet_root_dir, file)): 139 | files.append(file) 140 | app_files = [] 141 | for file in os.listdir(extracted_app_root_dir): 142 | if os.path.isfile(os.path.join(extracted_app_root_dir, file)): 143 | app_files.append(file) # append to app_files, not files 144 | # Do packet normalization 145 | packet_normalization(files,1) 146 | packet_normalization(app_files,0) 147 | # define network model parameters (hyperparameters) 148 | net_params = network_parameters_initializer() 149 | # GAN model for producing synthesized data 150 | process_csv_files() 151 | # Define the 1D-CNN model parameters 152 | cnn_model_params = defin_1D_CNN_model_params() 153 | #====================== for traffic type classification ==================== 154 | cnn_output,cnn_saved_model,cnn_saved_weights,cnn_path = cnn_Traffic_classification(root_normalized_dir,net_params,cnn_model_params) 155 | # ============for application classification ============================== 156 | cnn1_output,cnn1_saved_model,cnn1_saved_weights,app_cnn_path = cnn_Traffic_classification(root_app_normalized_dir,net_params,cnn_model_params) 157 | # Define the Bi-LSTM model parameters 158 | #======================= for traffic type classification ==================== 159 | bilstm_model_params = define_BiLSTM_model_params() 160 | bilstm_output,bilstm_saved_model,bilstm_saved_weights,bilstm_path = BiLSTM_Traffic_classification(root_normalized_dir,net_params,bilstm_model_params) 161 | #======================= for application classification ==================== 162 | bilstm1_output,bilstm1_saved_model,bilstm1_saved_weights,bilstm1_path = BiLSTM_Traffic_classification(root_app_normalized_dir,net_params,bilstm_model_params) 163 | # Define the SAE model parameters 164 | SAE_model_params = define_SAE_model_params() 165 | sae_output,sae_saved_models,sae_saved_weights,sae_path = SAE_Traffic_classification(sae_extracted_feature_file,net_params,SAE_model_params) 166 | # Combine the outputs into a single input array 167 | fc_model_params = define_FC_model_params() 168 | #====================== for traffic type classification ====================================== 169 | mlp_model = FC_traffic_classification(root_normalized_dir,net_params,fc_model_params, cnn_path,cnn_saved_model,bilstm_path,bilstm_saved_model,sae_path,sae_saved_models,sae_extracted_feature_file) 170 | # =============================== for application classification ================================= 171 | mlp_model2 = FC_traffic_classification(root_app_normalized_dir,net_params,fc_model_params, app_cnn_path,cnn1_saved_model,bilstm1_path,bilstm1_saved_model,sae_path,sae_saved_models,sae_extracted_feature_file) 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /memory usage-execution time.py: -------------------------------------------------------------------------------- 1 | import time 2 | import psutil 3 | 4 | # Function to get current memory usage 5 | def get_memory_usage(): 6 | process = psutil.Process() 7 | mem_info = process.memory_info() 8 | return mem_info.rss # Resident Set Size (memory usage) 9 | 10 | # Function to measure execution time and memory consumption of another function 11 | def measure_execution_memory(func): 12 | def wrapper(*args, **kwargs): 13 | # Record
start time 14 | start_time = time.time() 15 | 16 | # Measure memory usage before execution 17 | start_memory = get_memory_usage() 18 | 19 | # Execute the wrapped function 20 | result = func(*args, **kwargs) 21 | 22 | # Measure memory usage after execution 23 | end_memory = get_memory_usage() 24 | 25 | # Calculate execution time 26 | execution_time = time.time() - start_time 27 | 28 | # Calculate memory consumption 29 | memory_consumption_bytes = end_memory - start_memory 30 | memory_consumption_megabytes = memory_consumption_bytes / (1024 * 1024) # Convert to megabytes 31 | 32 | print(f"Function '{func.__name__}' executed in {execution_time:.2f} seconds.") 33 | print(f"Memory consumption during execution: {memory_consumption_megabytes:.2f} MB") 34 | 35 | return result # Return the result of the wrapped function 36 | 37 | return wrapper 38 | -------------------------------------------------------------------------------- /metric-evaluation.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | 3 | def recall_m(y_true, y_pred): 4 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 5 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 6 | recall = true_positives / (possible_positives + K.epsilon()) 7 | return recall 8 | 9 | def precision_m(y_true, y_pred): 10 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 11 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 12 | precision = true_positives / (predicted_positives + K.epsilon()) 13 | return precision 14 | 15 | def f1_m(y_true, y_pred): 16 | precision = precision_m(y_true, y_pred) 17 | recall = recall_m(y_true, y_pred) 18 | return 2*((precision*recall)/(precision+recall+K.epsilon())) 19 | 20 | -------------------------------------------------------------------------------- /network_parameters_initializer.py: -------------------------------------------------------------------------------- 1 | def network_parameters_initializer(): 2 | 3 | network_parameters_dict = {} 4 | network_parameters_dict['BATCH_SIZE'] = 64 5 | network_parameters_dict['LEARNING_RATE'] = 0.0001 6 | network_parameters_dict['EPOCH'] = 50 7 | network_parameters_dict['VERBOSE'] = 1 8 | network_parameters_dict['VALIDATION_SPLIT'] = 0.1 9 | network_parameters_dict['NUM_CLASSES'] = 3 10 | network_parameters_dict['OPTIMIZER'] = 'Adam' 11 | network_parameters_dict['LOSS_FUNCTION'] = 'categorical_crossentropy' 12 | network_parameters_dict['METRICS'] = ['accuracy',recall_m,precision_m,f1_m] 13 | network_parameters_dict['DROPOUT'] = 0.30 14 | network_parameters_dict['HIDEN_ACTIVATION_FUNCTION'] = 'relu' 15 | network_parameters_dict['OUTPUT_ACTIVATION_FUNCTION'] = 'relu' 16 | network_parameters_dict['DENSE_LAYER_ACTIVATION_FUNCTION'] = ('relu','relu') 17 | network_parameters_dict['SOFTMAX_LAYER_ACTIVATION_FUNCTION'] = 'softmax' 18 | 19 | return network_parameters_dict 20 | -------------------------------------------------------------------------------- /packet_normalization.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a function called packet_normalization(). This function takes a list of normalized packet files as input and normalizes the packets in each file. 3 | 4 | The function first declares a chunk size of 50000 bytes. This is the size of each chunk that the function will break the normalized packet files into. 5 | 6 | The function then reads the normalized packet files one by one. 
For each file, it reads the payload, IP header and class label columns. 7 | 8 | The payload and IP header hex strings are split into bytes; each byte is converted to an integer, divided by 255 and truncated to decPlace (4) decimal places, so every value falls in [0, 1]. 9 | 10 | The function then merges the header and payload values into a single vector, converts the vector to a comma-separated string and stores it in the packet_normalized_data column of a Pandas DataFrame. 11 | 12 | The function then saves the DataFrame to a new CSV file and breaks that file into smaller chunks of chunk_size_file (50000). 13 | 14 | Finally, the function returns the directory where the normalized and split files are saved. 15 | 16 | 17 | 18 | """ 19 | from Break_CSV_File import Break_CSV_File 20 | def packet_normalization(normalized_files_name, type): 21 | # chunk size used when splitting the output CSV 22 | chunk_size_file = 50000 23 | 24 | # choose the output directory based on the label type (1 = traffic type, 0 = application) 25 | 26 | if type== 1: 27 | normalized_dir = 'media/mehdi/linux/normalized_data/' 28 | else: 29 | normalized_dir = 'media/mehdi/linux/normalized_app_data/' 30 | df_normalized = pd.DataFrame(columns=['packet_normalized_data', 'class_label']) 31 | binary = "{0:08b}".format(int("1a", 16)) # example hex-to-binary conversion, not used below 32 | col_list = ["payload","ip_header", "class_label"] 33 | n = 2 # two hex characters per byte 34 | decPlace = 4 35 | payload_list = [] 36 | header_list = [] 37 | final_packet_vector = [] 38 | for index1 in range(len(normalized_files_name)): 39 | 40 | # df = pd.read_csv("packet.csv", usecols=col_list) 41 | df = pd.read_csv(normalized_files_name[index1], usecols=col_list) 42 | print("file:{} has been read".format(normalized_files_name[index1])) 43 | for index, row in df.iterrows(): 44 | print("index row is :{}".format(index)) 45 | if index == 65533: # debug checkpoint near the CSV chunk boundary 46 | print("reached row 65533") 47 | payload = row["payload"].replace("'", "")[1:] 48 | header = row["ip_header"].replace("'", "")[1:] 49 | # convert hex format to binary format and binary format to decimal 50 | 51 | for i in range(0, len(payload), n): 52 | payload_list.append(payload[i:i + n]) 53 | for j in range(0, len(header), n): 54 | header_list.append(header[j:j + n]) 55 | for i in range(len(payload_list)): 56 | # convert hex to binary 57 | payload_list[i] = "{0:08b}".format(int(payload_list[i], 16)) 58 | # convert binary to decimal and normalize the value into [0, 1] 59 | payload_list[i] = int(int(payload_list[i], 2) / 255.0 * 10 ** decPlace) / 10 ** decPlace 60 | for j in range(len(header_list)): 61 | # convert hex to binary 62 | header_list[j] = "{0:08b}".format(int(header_list[j], 16)) 63 | # convert binary to decimal and normalize the value into [0, 1] 64 | header_list[j] = int(int(header_list[j], 2) / 255.0 * 10 ** decPlace) / 10 ** decPlace 65 | 66 | # append the payload values after the header values 67 | final_packet_vector = header_list 68 | for data in payload_list: 69 | final_packet_vector.append(data) 70 | 71 | # convert list to string 72 | normalized_packet = ','.join([str(elem) for elem in final_packet_vector]) 73 | df_normalized.loc[len(df_normalized.index)] = [normalized_packet,row["class_label"]] 74 | payload_list = [] 75 | header_list = [] 76 | final_packet_vector = [] 77 | normalized_packet = '' 78 | base_filename = os.path.basename(normalized_files_name[index1]) 79 | new_filename = normalized_dir +'normalized_'+ base_filename 80 | df_normalized.to_csv(new_filename) 81 | # in this section we break the large csv file into smaller ones 82 | 
Break_CSV_File(new_filename,chunk_size_file,normalized_dir) 83 | df_normalized = df_normalized[0:0] 84 | return normalized_dir 85 | -------------------------------------------------------------------------------- /packet_zero_pading.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | The code you have provided is a function called packet_zero_padding(). This function takes a list of packets as input and pads the packets with zeros if their length is less than 1500 bytes. 4 | 5 | The function first iterates over the list of packets. For each packet, the function gets the IP header and the payload length. 6 | 7 | If the packet length is less than 1500 bytes, the function creates a padding object with a length of 1500 - packet length. The padding object is then added to the packet. 8 | 9 | The function then prints the length of the padding object. 10 | """ 11 | 12 | def packet_zero_pading (packets): 13 | namey = 'mehdi' 14 | print(namey.ljust(8,'0')) 15 | for pkt in packets: 16 | print(len(pkt)) 17 | header_length = pkt[IP].ihl 18 | payload_length = pkt[IP].len - (header_length * 32)/8 19 | print(binascii.hexlify(struct.pack('i', 00))) 20 | print('{:x}'.format(123)) 21 | if (pkt[IP].len) < 1500: 22 | pad_len = 1500 - len(pkt[IP]) 23 | pad_str_len = int(pad_len)*2 24 | pad = Padding() 25 | pad.load = '\x00' * int(pad_len) 26 | firstdata = binascii.hexlify(bytes(pkt[Raw])) 27 | pkt = pkt / pad 28 | layer = pkt.getlayer(1) 29 | if layer.haslayer(Raw) and layer.haslayer(IP): 30 | print(b_colors.bcolors.OKBLUE + '\n[Info] Found the following (' + layer.name + ' layer): ' + layer.src + " -> " + layer.dst + b_colors.bcolors.ENDC) 31 | tcpdata = layer.getlayer(Raw).load 32 | padding = binascii.hexlify(bytes(layer.getlayer(Padding).load)) 33 | #padding2 = binascii.hexlify(bytes(b'\x00\x00\x00\00')) 34 | print(hexdump(pkt[Raw].load)) 35 | lastdata = binascii.hexlify(bytes(pkt[Raw])) 36 | mydata = lastdata.decode() 37 | print('before padding len is:{}'.format(len(mydata))) 38 | print(mydata) 39 | mydata = mydata.ljust(300, '0') 40 | print('after padding len is:{}'.format(len(mydata))) 41 | print(mydata) 42 | #print(hexdump(pkt[Padding].load)) 43 | 44 | 45 | print(len(pad)) 46 | if len(pkt[IP]) == 1500: 47 | print(pkt.show()) 48 | """ 49 | if not isinstance(packet[TCP].payload, scapy.packet.NoPayload): 50 | payload = json.loads(bytes(packet[TCP].payload).decode('utf-8')) 51 | p.update(payload) 52 | p['_data'] = base64.b64decode(payload['data']).decode('utf-8') 53 | p.__delitem__('data') 54 | arr.append(p) 55 | """ -------------------------------------------------------------------------------- /plot_heatmap_result.py: -------------------------------------------------------------------------------- 1 | def plot_heatmap(): 2 | uniform_data = np.random.rand(10, 12) 3 | ax = sns.heatmap(uniform_data, linewidth=0.5,cmap='winter',annot=True) 4 | plt.show() 5 | return 6 | 7 | def plot_heatmap_result(): 8 | rootDir = '/home/mehdi/PycharmProjects/pythonProject/result-heatmap/' 9 | csv_files = os.listdir(rootDir) 10 | for f in csv_files: 11 | # read the csv file 12 | path = os.path.join(rootDir, f) 13 | data = pd.read_csv(path) 14 | fig, ax = plt.subplots(figsize=(11,9)) 15 | sns.heatmap(data.corr(), cmap='winter') 16 | plt.show() 17 | return -------------------------------------------------------------------------------- /pmf_Dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a 
function called pmf_Dataset(). This function calculates the probability mass function (PMF) of the packet lengths in a set of CSV files. 3 | 4 | The function first defines a list called csv_files that stores the names of the CSV files in the specified directory. 5 | 6 | The function then defines a list called my_temp_list that stores the number of packets in each length range. The my_packet_length_list list is initialized with 0s. 7 | 8 | The function then iterates over the csv_files list. For each CSV file, the function reads the file and stores the number of packets in each length range in the my_temp_list list. 9 | 10 | The function then calculates the PMF of the packet lengths by dividing the number of packets in each length range by the total number of packets. 11 | 12 | The function finally returns the PMF of the packet lengths. 13 | 14 | The first few lines define the csv_files list and the my_temp_list list. 15 | 16 | The next few lines iterate over the csv_files list. For each CSV file, the function reads the file and stores the number of packets in each length range in the my_temp_list list. 17 | 18 | The next few lines calculate the PMF of the packet lengths by dividing the number of packets in each length range by the total number of packets. 19 | 20 | The final few lines format the PMF as strings and return it. 21 | 22 | """ 23 | 24 | def pmf_Dataset(rootDir): 25 | csv_files = os.listdir(rootDir) 26 | my_temp_list = [] 27 | my_packet_length_list = [0]*9 28 | my_packet_length_list = [float(x) for x in my_packet_length_list] 29 | # loop over the list of csv files 30 | for f in csv_files: 31 | # read the csv file 32 | path = os.path.join(rootDir, f) 33 | df = pd.read_csv(path,usecols=['Topic / Item','Count']) 34 | 35 | # print the location and filename 36 | print('Location:', path) 37 | df_percent= df.iloc[1:10,1:2] 38 | my_temp_list = df_percent['Count'].values.tolist() 39 | for item in range(len(my_temp_list)): 40 | #my_temp_list[item] = str(my_temp_list[item]).replace("%","") 41 | my_packet_length_list[item] = my_packet_length_list[item] + my_temp_list[item] 42 | print(my_temp_list) 43 | # print the content 44 | print('Content:') 45 | print(df) 46 | print() 47 | total_packet = sum(my_packet_length_list) 48 | packet_lenght_percent = [(float(x/total_packet)*100) for x in my_packet_length_list] 49 | print(packet_lenght_percent) 50 | final_packet_lenght = ['{:.2f}'.format(x) for x in packet_lenght_percent] 51 | 52 | return -------------------------------------------------------------------------------- /preprocessing-traffic-label.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from scapy.all import * 4 | def get_application_from_filename(filename): 5 | """Gets the application of a packet from the filename. 6 | 7 | Args: 8 | filename: The filename of the packet. 9 | 10 | Returns: 11 | The application of the packet, or None if the application cannot be determined. 
12 | """ 13 | # Create a dictionary to map filename prefixes to application names 14 | prefix_to_application = { 15 | 'icq': 'icq', 16 | 'aim-chat': 'aim-chat', 17 | 'chat_facebook': 'chat_facebook', 18 | 'chat_hangout': 'chat_hangout', 19 | 'chat_gmail': 'chat_gmail', 20 | 'chat_skype': 'chat_skype', 21 | 'email': 'email', 22 | 'gmail': 'gmail', 23 | 'ftps': 'ftps', 24 | 'sftp': 'sftp', 25 | 'scp': 'scp', 26 | 'ftp_skype': 'ftp_skype', 27 | 'torrent': 'torrent', 28 | 'tor': 'tor', 29 | 'youtube': 'youtube', 30 | 'netflix': 'netflix', 31 | 'spotify': 'spotify', 32 | 'vimeo': 'vimeo', 33 | 'streaming_skype': 'streaming_skype', 34 | 'voip_skype': 'voip_skype', 35 | 'voipbuster': 'voipbuster', 36 | 'voip_hangout': 'voip_hangout', 37 | 'voip_facebook': 'voip_facebook' 38 | } 39 | 40 | # Get the prefix of the filename and convert it to lowercase 41 | prefix = filename.split(".")[0].lower() 42 | 43 | # Use the dictionary to determine the application 44 | application = prefix_to_application.get(prefix, None) 45 | 46 | return application 47 | 48 | def get_label(filename): 49 | """Gets the label of a packet from the filename. 50 | 51 | Args: 52 | filename: The filename of the packet. 53 | 54 | Returns: 55 | The label of the packet, or None if the label cannot be determined. 56 | """ 57 | 58 | # Get the prefix of the filename. 59 | prefix = filename.split(".")[0].lower() 60 | 61 | # Determine the label based on the prefix. 62 | if prefix.startswith("vpn_"): 63 | label = "VPN" 64 | else: 65 | label = "Non-VPN" 66 | 67 | return label 68 | # Function to mask the IP layer header 69 | def ip_mask(packet): 70 | if IP in packet: 71 | packet[IP].src = '0.0.0.0' 72 | packet[IP].dst = '0.0.0.0' 73 | 74 | # Function to scale a packet chunk's byte values to [0, 1] 75 | def normalize_packet(chunk): 76 | raw_data = bytes(chunk) 77 | # a bytes object cannot hold floats, so the scaled values are returned as a list 78 | normalized_data = [byte / 255.0 for byte in raw_data] 79 | return normalized_data 80 | 81 | # Function to split a packet into 1500-byte chunks, zero-padding the last chunk 82 | def split_and_pad(packet): 83 | data = bytes(packet) 84 | if len(data) > 1500: 85 | split_packets = [data[i:i + 1500] for i in range(0, len(data), 1500)] 86 | if len(split_packets[-1]) < 1500: 87 | split_packets[-1] += b'\x00' * (1500 - len(split_packets[-1])) 88 | return split_packets 89 | else: 90 | return [data] 91 | 92 | 93 | # Function to categorize file types based on the filename 94 | def categorize_file_type(filename): 95 | if "vpn" in filename: 96 | if "chat" in filename: 97 | return 1 98 | elif "email" in filename: 99 | return 3 100 | elif any(word in filename for word in ["facebook_audio", "hangouts_audio", "skype_audio", "voip"]): 101 | return 5 102 | elif any(word in filename for word in ["ftp", "file", "scp", "sftp"]): 103 | return 7 104 | elif any(word in filename for word in ["vimeo", "youtube", "netflix", "hangouts_video", "facebook_video", "skype_video"]): 105 | return 9 106 | elif "Torrent01" in filename: 107 | return 11 108 | elif "tor" in filename: 109 | return 13 110 | else: # Non-VPN traffic 111 | if "chat" in filename: 112 | return 0 113 | elif "email" in filename: 114 | return 2 115 | elif any(word in filename for word in ["facebook_audio", "hangouts_audio", "skype_audio", "voip"]): 116 | return 4 117 | elif any(word in filename for word in ["ftp", "file", "scp", "sftp"]): 118 | return 6 119 | elif any(word in filename for word in ["vimeo", "youtube", "netflix", "hangouts_video", "facebook_video", "skype_video"]): 120 | return 8
121 | elif "Torrent01" in filename: 122 | return 10 123 | elif "tor" in filename: 124 | return 14 125 | return None 126 | def read_app_pcap_files(): 127 | #file_list = [x for x in os.listdir('/home/mehdi') if x.endswith(".pcap")] 128 | #print(file_list) 129 | root_dir = '/media/mehdi/linux/data/CompletePCAPs' 130 | file_name_list_full_path = [] 131 | file_name_list = [] 132 | file_name_dict = {} 133 | for path in os.listdir(root_dir): 134 | full_path = os.path.join(root_dir, path) 135 | if os.path.isfile(full_path) and (path.endswith(".pcap") or path.endswith("pcapng")): 136 | print(full_path) 137 | file_name_list_full_path.append(full_path) 138 | file_name_list.append(path) 139 | 140 | # find category of ISCX VPN-NONVPN DATASET 141 | for i in range(len(file_name_list_full_path)): 142 | app_name = get_application_from_filename(file_name_list[i]) or '' # use the base name and guard against unknown prefixes (None) 143 | if "icq" in app_name: 144 | file_name_dict[file_name_list_full_path[i]] = 1 #"icq" 145 | elif "chat_facebook" in app_name: 146 | file_name_dict[file_name_list_full_path[i]] = 2 #"chat_facebook" 147 | elif "chat_hangout" in app_name: 148 | file_name_dict[file_name_list_full_path[i]] = 3 #"chat_hangout" 149 | elif "chat_gmail" in app_name: 150 | file_name_dict[file_name_list_full_path[i]] = 4 #"chat_gmail" 151 | elif "chat_skype" in app_name: 152 | file_name_dict[file_name_list_full_path[i]] = 5 #"chat_skype" 153 | elif "email" in app_name: 154 | file_name_dict[file_name_list_full_path[i]] = 6 #"email" 155 | elif "gmail" in app_name: 156 | file_name_dict[file_name_list_full_path[i]] = 4 #"gmail" 157 | elif "ftps" in app_name: 158 | file_name_dict[file_name_list_full_path[i]] = 7 #"ftps" 159 | elif "sftp" in app_name: 160 | file_name_dict[file_name_list_full_path[i]] = 8 #"sftp" 161 | elif "scp" in app_name: 162 | file_name_dict[file_name_list_full_path[i]] = 9 #"scp" 163 | elif "ftp_skype" in app_name: 164 | file_name_dict[file_name_list_full_path[i]] = 5 #"ftp_skype" 165 | elif "torrent" in app_name: 166 | file_name_dict[file_name_list_full_path[i]] = 10 #"torrent" 167 | elif "youtube" in app_name: 168 | file_name_dict[file_name_list_full_path[i]] = 11 #"youtube" 169 | elif "netflix" in app_name: 170 | file_name_dict[file_name_list_full_path[i]] = 12 #"netflix" 171 | elif "spotify" in app_name: 172 | file_name_dict[file_name_list_full_path[i]] = 13 #"spotify" 173 | elif "vimeo" in app_name: 174 | file_name_dict[file_name_list_full_path[i]] = 14 #"vimeo" 175 | elif "streaming_skype" in app_name: 176 | file_name_dict[file_name_list_full_path[i]] = 5 #"streaming_skype" 177 | elif "voip_skype" in app_name: 178 | file_name_dict[file_name_list_full_path[i]] = 5 #"voip_skype" 179 | elif "voipbuster" in app_name: 180 | file_name_dict[file_name_list_full_path[i]] = 15 #"voipbuster" 181 | elif "voip_hangout" in app_name: 182 | file_name_dict[file_name_list_full_path[i]] = 3 #"voip_hangout" 183 | elif "voip_facebook" in app_name: 184 | file_name_dict[file_name_list_full_path[i]] = 2 #"voip_facebook" 185 | elif "aim-chat" in app_name: 186 | file_name_dict[file_name_list_full_path[i]] = 16 #"AIM-Chat" 187 | elif "tor" in app_name: 188 | file_name_dict[file_name_list_full_path[i]] = 17 #"tor" 189 | else: 190 | pass 191 | 192 | 193 | print(file_name_dict) 194 | extracted_root_dir = load_pcap_datatype(file_name_dict) 195 | 196 | return extracted_root_dir 197 | def read_app_pcap_load(): 198 | # Path to the directory containing PCAP files 199 | pcap_directory = '/path/to/pcap/files' 200 | 201 | # Create a CSV file for saving normalized packets
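# For each packet kept by the filter below, the IP addresses are masked, the packet is split into zero-padded 1500-byte chunks, every byte is scaled into [0, 1], and one CSV row is written per chunk together with the traffic-type label and the application label.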
202 | csv_file = open('normalized_packets.csv', 'w', newline='') 203 | csv_writer = csv.writer(csv_file) 204 | 205 | # Iterate through each PCAP file in the directory 206 | for pcap_file in os.listdir(pcap_directory): 207 | if pcap_file.endswith('.pcap'): 208 | label = categorize_file_type(pcap_file) 209 | application = get_application_from_filename(pcap_file) or '' # guard against unknown prefixes 210 | app_label = get_label(pcap_file) 211 | if label is not None: 212 | packets = rdpcap(os.path.join(pcap_directory, pcap_file)) 213 | 214 | for packet in packets: 215 | if Raw in packet and packet.haslayer(Ether) and packet[Ether].type == 0x800: 216 | ip_mask(packet) 217 | normalized_packets = split_and_pad(packet) 218 | for normalized_packet in normalized_packets: 219 | normalized_values = normalize_packet(normalized_packet) # byte values scaled to [0, 1] 220 | csv_writer.writerow([normalized_values, label, application + app_label]) 221 | 222 | # Close the CSV file 223 | csv_file.close() -------------------------------------------------------------------------------- /print_summary.py: -------------------------------------------------------------------------------- 1 | """ 2 | print_summary() takes a packet as input and prints the source and destination IP addresses and the source and destination TCP ports of the packet. 3 | 4 | The function first checks whether the packet contains an IP layer. If it does, it reads the source and destination IP addresses. 5 | 6 | The function then checks whether the packet contains a TCP layer. If it does, it reads the source and destination TCP ports. 7 | 8 | Finally, the function prints the addresses and ports it has found. 9 | 10 | The first check tests for the IP layer; if the packet has no IP layer, nothing is printed. 11 | 12 | The src attribute of the IP layer gives the source IP address of the packet. 13 | 14 | The dst attribute of the IP layer gives the destination IP address of the packet. 15 | 16 | The second check tests for the TCP layer, which is not present in every IP packet; if the packet has no TCP layer, the ports are not printed. 17 | 18 | The sport attribute of the TCP layer gives the source TCP port of the packet. 19 | 20 | The dport attribute of the TCP layer gives the destination TCP port of the packet. 21 | 22 | Finally, the print statements output the source and destination IP addresses and the source and destination TCP ports of the packet.
23 | 24 | """ 25 | def print_summary(pkt): 26 | if IP in pkt: 27 | ip_src=pkt[IP].src 28 | ip_dst=pkt[IP].dst 29 | print(' IP src is: {}'.format(str(ip_src))) 30 | print(' IP dst is: {}'.format(str(ip_dst))) 31 | if TCP in pkt: 32 | tcp_sport=pkt[TCP].sport 33 | tcp_dport=pkt[TCP].dport 34 | print(' TCP sport is: {}'.format(str(tcp_sport))) 35 | print(' TCP dport is: {}'.format(str(tcp_dport))) -------------------------------------------------------------------------------- /process_metadata_pcap.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a function called process_metadata_pcap(). This function reads a pcap file and prints the metadata of the first and the last packets in the connection between the two hosts specified by the client and server parameters. 3 | 4 | The function first opens the pcap file and gets the number of packets in the file. It then iterates over the packets in the file. For each packet, the function first creates an Ether object from the packet data. The Ether class in the packet library represents an Ethernet packet. The function then checks if the type field of the Ether object is equal to 0x0800. If it is, then the packet is an IPv4 packet. 5 | 6 | The function then checks if the proto field of the IP object is equal to 6. If it is, then the packet is a TCP packet. 7 | 8 | The function then checks if the source or destination IP address of the packet matches the client or server parameter. If it does, then the function increments the interesting_packet_count variable. 9 | 10 | If the packet is an interesting packet, then the function checks if it is the first or the last packet in the connection. If it is, then the function stores the timestamp and ordinal number of the packet. 11 | 12 | Finally, the function prints the total number of packets in the file, the number of interesting packets, and the timestamps and ordinal numbers of the first and the last packets in the connection. 13 | 14 | 15 | """ 16 | 17 | # In this code iteration, we’ll access the packet’s metadata; 18 | # in particular the timestamps and ordinal numbers (i.e. packet number within the packet capture) of the first and the last packets of the connection that we’re interested in. 19 | def process_metadata_pcap(file_name): 20 | print('Opening {}...'.format(file_name)) 21 | 22 | client = '192.168.43.75:54732' 23 | server = '172.217.22.78:443' 24 | 25 | (client_ip, client_port) = client.split(':') 26 | (server_ip, server_port) = server.split(':') 27 | 28 | count = 0 29 | interesting_packet_count = 0 30 | first_pkt_timestamp = 0 31 | first_pkt_ordinal = 0 32 | first_pkt_timestamp_resolution= 0 33 | last_pkt_ordinal = 0 34 | last_pkt_timestamp_resolution = 0 35 | 36 | for (pkt_data, pkt_metadata,) in RawPcapReader(file_name): 37 | count += 1 38 | ether_pkt = Ether(pkt_data) 39 | if 'type' not in ether_pkt.fields: 40 | # LLC frames will have 'len' instead of 'type'. 
41 | # We disregard those 42 | continue 43 | 44 | if ether_pkt.type != 0x0800: 45 | # disregard non-IPv4 packets 46 | continue 47 | 48 | ip_pkt = ether_pkt[IP] 49 | 50 | if ip_pkt.proto != 6: 51 | # Ignore non-TCP packet 52 | continue 53 | 54 | if (ip_pkt.src != server_ip) and (ip_pkt.src != client_ip): 55 | # Uninteresting source IP address 56 | continue 57 | 58 | if (ip_pkt.dst != server_ip) and (ip_pkt.dst != client_ip): 59 | # Uninteresting destination IP address 60 | continue 61 | 62 | tcp_pkt = ip_pkt[TCP] 63 | 64 | if (tcp_pkt.sport != int(server_port)) and \ 65 | (tcp_pkt.sport != int(client_port)): 66 | # Uninteresting source TCP port 67 | continue 68 | 69 | if (tcp_pkt.dport != int(server_port)) and \ 70 | (tcp_pkt.dport != int(client_port)): 71 | # Uninteresting destination TCP port 72 | continue 73 | 74 | interesting_packet_count += 1 75 | if interesting_packet_count == 1: 76 | first_pkt_timestamp = (pkt_metadata.tshigh << 32) | pkt_metadata.tslow 77 | first_pkt_timestamp_resolution = pkt_metadata.tsresol 78 | first_pkt_ordinal = count 79 | 80 | last_pkt_timestamp = (pkt_metadata.tshigh << 32) | pkt_metadata.tslow 81 | last_pkt_timestamp_resolution = pkt_metadata.tsresol 82 | last_pkt_ordinal = count 83 | # --- 84 | 85 | print('{} contains {} packets ({} interesting)'. 86 | format(file_name, count, interesting_packet_count)) 87 | 88 | print('First packet in connection: Packet #{} {}'. 89 | format(first_pkt_ordinal, 90 | printable_timestamp(first_pkt_timestamp, 91 | first_pkt_timestamp_resolution))) 92 | print(' Last packet in connection: Packet #{} {}'. 93 | format(last_pkt_ordinal, 94 | printable_timestamp(last_pkt_timestamp, 95 | last_pkt_timestamp_resolution))) 96 | def printable_timestamp(ts, resol): 97 | ts_sec = ts // resol 98 | ts_subsec = ts % resol 99 | ts_sec_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts_sec)) 100 | return '{}.{}'.format(ts_sec_str, ts_subsec) -------------------------------------------------------------------------------- /process_pcap.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a function called process_pcap(). 3 | This function reads a pcap file and prints some information about each packet in the file. 4 | 5 | The function first opens the pcap file and gets the number of packets in the file. 6 | It then iterates over the packets and prints the following information for each packet: 7 | 8 | The packet number 9 | The source IP address 10 | The destination IP address 11 | The source TCP port 12 | The destination TCP port 13 | Finally, the function prints the total number of packets in the file. 14 | 15 | 16 | The first line opens the pcap file and gets the number of packets in the file. 17 | The RawPcapReader() function takes the name of the pcap file as input and returns an iterator that yields tuples of (packet data, packet metadata). 18 | The count variable keeps track of the number of packets processed. 19 | 20 | The next line starts an iteration over the packets in the file. For each packet, the function first creates an Ether object from the packet data. 21 | The Ether class in the packet library represents an Ethernet packet. The function then creates an IP object and a TCP object from the Ether object. 22 | The IP class represents an IP packet and the TCP class represents a TCP packet. 
23 | 24 | The function then prints the following information for the packet: 25 | 26 | The packet number 27 | The source IP address 28 | The destination IP address 29 | The source TCP port 30 | The destination TCP port 31 | Finally, the function prints the total number of packets in the file. 32 | 33 | Here is a breakdown of the code: 34 | 35 | """ 36 | def process_pcap(file_name): 37 | print('Opening {}...'.format(file_name)) 38 | count = 0 39 | for (pkt_data, pkt_metadata,) in RawPcapReader(file_name): 40 | count += 1 41 | ether_pkt = Ether(pkt_data) 42 | ip_pkt = ether_pkt[IP] 43 | tcp_pkt = ip_pkt[TCP] 44 | print('packet number is: {}'.format(count)) 45 | print(' IP src is: {}'.format(ip_pkt.src)) 46 | print(' IP dst is: {}'.format(str(ip_pkt.dst))) 47 | print(' TCP sport is: {}'.format(str(tcp_pkt.sport))) 48 | print(' TCP dport is: {}'.format(str(tcp_pkt.dport))) 49 | 50 | 51 | 52 | print('{} contains {} packets'.format(file_name, count)) 53 | -------------------------------------------------------------------------------- /read_pcap_files.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a function called read_pcap_files(). This function reads all the pcap files in a directory and creates a dictionary that maps the file name to the corresponding category. 3 | 4 | The function first gets the list of all the files in the directory. It then iterates over the files and checks if the file is a pcap file. If it is, the function adds the file name and its category to the dictionary. 5 | 6 | The function then calls the load_pcap_datatype() function to load the data from the pcap files. 7 | 8 | Finally, the function prints the dictionary and returns it. 9 | 10 | Here is a breakdown of the code: 11 | """ 12 | def read_pcap_files(): 13 | #file_list = [x for x in os.listdir('/home/mehdi') if x.endswith(".pcap")] 14 | #print(file_list) 15 | root_dir = '/media/mehdi/linux/data/CompletePCAPs' 16 | file_name_list_full_path = [] 17 | file_name_list = [] 18 | file_name_dict = {} 19 | for path in os.listdir(root_dir): 20 | full_path = os.path.join(root_dir, path) 21 | if os.path.isfile(full_path) and (path.endswith(".pcap") or path.endswith("pcapng")): 22 | print(full_path) 23 | file_name_list_full_path.append(full_path) 24 | file_name_list.append(path) 25 | 26 | # find category of ISCX VPN-NONVPN DATASET 27 | for i in range(len(file_name_list_full_path)): 28 | if "vpn" in file_name_list[i]: 29 | if "chat" in file_name_list[i]: 30 | file_name_dict[file_name_list_full_path[i]] = 1 #"vpn chat" 31 | elif "email" in file_name_list[i]: 32 | file_name_dict[file_name_list_full_path[i]] = 3 #"vpn email" 33 | elif ("facebook_audio" in file_name_list[i]) or ("hangouts_audio" in file_name_list[i])\ 34 | or ("skype_audio" in file_name_list[i]) or ("voip" in file_name_list[i]): 35 | file_name_dict[file_name_list_full_path[i]] = 5 #"vpn audio streaming" 36 | elif ("ftp" in file_name_list[i]) or ("file" in file_name_list[i]) \ 37 | or ("scp" in file_name_list[i]) or ("sftp" in file_name_list[i]): 38 | file_name_dict[file_name_list_full_path[i]] = 7 #"vpn ftp" 39 | elif ("vimeo" in file_name_list[i]) or ("youtube" in file_name_list[i]) or ("netflix" in file_name_list[i]) \ 40 | or ("hangouts_video" in file_name_list[i]) or ("facebook_video" in file_name_list[i]) or ("skype_video" in file_name_list[i]): 41 | file_name_dict[file_name_list_full_path[i]] = 9 #"vpn video streaming" 42 | elif "Torrent01" in file_name_list[i]: 43 | 
file_name_dict[file_name_list_full_path[i]] = 11 #"vpn p2p" 44 | elif "tor" in file_name_list[i]: 45 | file_name_dict[file_name_list_full_path[i]] = 13 # "vpn tor" 46 | else: 47 | pass 48 | else: 49 | if "chat" in file_name_list[i]: 50 | file_name_dict[file_name_list_full_path[i]] = 0 #"chat" 51 | elif "email" in file_name_list[i]: 52 | file_name_dict[file_name_list_full_path[i]] = 2 #"email" 53 | elif ("facebook_audio" in file_name_list[i]) or ("hangouts_audio" in file_name_list[i]) \ 54 | or ("skype_audio" in file_name_list[i]) or ("voip" in file_name_list[i]): 55 | file_name_dict[file_name_list_full_path[i]] = 4 #"audio streaming" 56 | elif ("ftp" in file_name_list[i]) or ("file" in file_name_list[i]) \ 57 | or ("scp" in file_name_list[i]) or ("sftp" in file_name_list[i]): 58 | file_name_dict[file_name_list_full_path[i]] = 6 #"ftp" 59 | elif ("vimeo" in file_name_list[i]) or ("youtube" in file_name_list[i]) or ("netflix" in file_name_list[i]) \ 60 | or ("hangouts_video" in file_name_list[i]) or ("facebook_video" in file_name_list[i]) or ("skype_video" in file_name_list[i]): 61 | file_name_dict[file_name_list_full_path[i]] = 8 #"video streaming" 62 | elif "Torrent01" in file_name_list[i]: 63 | file_name_dict[file_name_list_full_path[i]] = 10 #"p2p" 64 | elif "tor" in file_name_list[i]: 65 | file_name_dict[file_name_list_full_path[i]] = 14 # "tor" 66 | else: 67 | pass 68 | print(file_name_dict) 69 | exctracted_root_dir = load_pcap_datatype(file_name_dict) 70 | 71 | return exctracted_root_dir 72 | -------------------------------------------------------------------------------- /transform_pcap_to_dataframe.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a function called transform_pcap_to_dataframe(). This function transforms a set of packets into a Pandas DataFrame. 3 | 4 | The function first defines a list of fields for the IP and TCP layers. 5 | 6 | The function then creates a blank DataFrame with the specified fields. 7 | 8 | The function then iterates over the packets. For each packet, the function reads the values of the IP and TCP fields and appends them to a row in the DataFrame. 9 | 10 | The function finally saves the DataFrame to a CSV file. 11 | 12 | The function first defines the list of fields for the IP and TCP layers. This list is used to create the DataFrame. 13 | 14 | The function then creates a blank DataFrame with the specified fields. 15 | 16 | The function then iterates over the packets. For each packet, the function reads the values of the IP and TCP fields and appends them to a row in the DataFrame. 17 | 18 | The function finally saves the DataFrame to a CSV file. 
19 | """ 20 | 21 | 22 | def transform_pcap_to_dataframe(packets): 23 | 24 | # Store the pre-defined field names of the IP, TCP and UDP layers 25 | f_ip = [field.name for field in IP().fields_desc] 26 | f_tcp = [field.name for field in TCP().fields_desc] 27 | f_udp = [field.name for field in UDP().fields_desc] 28 | print(f_ip) # field names of the IP layer 29 | print(f_tcp) # field names of the TCP layer 30 | print(f_udp) # field names of the UDP layer 31 | f_all = f_ip + ['time'] + f_tcp + ['payload'] 32 | # Blank DataFrame 33 | df_field = pd.DataFrame(columns=f_all) 34 | # store data for each row of the DataFrame 35 | for pkt in packets: 36 | field_values = [] 37 | # Read values of IP fields 38 | if pkt.haslayer(TCP) and pkt.haslayer(IP): 39 | for field in f_ip: 40 | try: 41 | if field == 'options': 42 | # we only store the number of options defined in the IP header 43 | field_values.append(len(pkt[IP].fields[field])) 44 | else: 45 | field_values.append(pkt[IP].fields[field]) 46 | except Exception: 47 | # the field value may not exist 48 | field_values.append(None) 49 | 50 | # Read the packet timestamp 51 | field_values.append(pkt.time) # was packet.time, which is undefined in this scope 52 | # Read values of TCP fields 53 | layer_type = type(pkt[IP].payload) 54 | for field in f_tcp: 55 | try: 56 | 57 | if field == 'options': 58 | field_values.append(len(pkt[layer_type].fields[field])) 59 | else: 60 | field_values.append(pkt[layer_type].fields[field]) 61 | except Exception: 62 | # the field value may not exist 63 | field_values.append(None) 64 | # Read the payload length 65 | field_values.append(len(pkt[layer_type].payload)) 66 | # Fill the data of one row 67 | df_append = pd.DataFrame([field_values], columns=f_all) 68 | # Append the row to the DataFrame 69 | df_field = pd.concat([df_field, df_append], axis=0) 70 | df_field.to_csv('packet1.csv') 71 | """ 72 | src_addr = df_field.groupby("src")['payload'].sum() # show the sum of payload for each src ip 73 | src_addr.plot(kind='barh', figsize=(8, 2)) # plot figure 74 | plt.show() 75 | """ 76 | plt.hist(df_field['payload'],bins = 20) 77 | plt.show() 78 | return 79 | -------------------------------------------------------------------------------- /var_function_initializer.py: -------------------------------------------------------------------------------- 1 | # Parameter Initialization 2 | def var_function_initializer(): 3 | size_list = [4,1,5,1] 4 | stride_list = [3,1,1,1] 5 | parameters_dict = {} 6 | parameters_dict['BATCH_SIZE'] = 64 7 | parameters_dict['EPOCH'] = 10 8 | parameters_dict['VERBOSE'] = 1 9 | parameters_dict['VALIDATION_SPLIT'] = 0.16 10 | parameters_dict['NUM_CLASSES'] = 3 11 | parameters_dict['OPTIMIZER'] = 'Adam' 12 | parameters_dict['LOSS_FUNCTION'] = 'categorical_crossentropy' 13 | parameters_dict['METRICS'] = ['accuracy',recall_m,precision_m,f1_m] # recall_m, precision_m and f1_m come from metric-evaluation.py and must be in scope 14 | parameters_dict['DROPOUT'] = 0.12 15 | parameters_dict['KERNEL_SIZE'] = [] 16 | parameters_dict['FILTERS'] = 2 17 | parameters_dict['STRIDES'] = [] 18 | parameters_dict['PADDING'] = 'same' 19 | parameters_dict['POOL_SIZE'] = (2,1) 20 | parameters_dict['POOL_STRIDE'] = (2,1) 21 | parameters_dict['HIDEN_ACTIVATION_FUNCTION'] = 'relu' 22 | parameters_dict['OUTPUT_ACTIVATION_FUNCTION'] = 'relu' 23 | parameters_dict['INPUT_SHAPE'] = (1500,1) 24 | parameters_dict['CNN_LAYER_SPEC'] = (2,200,200) 25 | parameters_dict['DENSE_LAYER'] = (2,300,200) 26 | parameters_dict['DENSE_LAYER_ACTIVATION_FUNCTION'] = ('relu','relu') 27 | parameters_dict['SOFTMAX_LAYER'] = 3 28 | parameters_dict['SOFTMAX_LAYER_ACTIVATION_FUNCTION'] = 'softmax' 29 | for i in range(parameters_dict['FILTERS']): 30 | 
parameters_dict['KERNEL_SIZE'].append(size_list[2*i]) 31 | parameters_dict['KERNEL_SIZE'].append(size_list[2*i+1]) 32 | parameters_dict['STRIDES'].append(stride_list[2*i]) 33 | parameters_dict['STRIDES'].append(stride_list[2*i+1]) 34 | 35 | 36 | 37 | 38 | return parameters_dict --------------------------------------------------------------------------------
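As a closing usage illustration (not part of the repository), the sketch below shows how a parameter dictionary such as the one returned by var_function_initializer() could be consumed when building and compiling a small Keras model. The filter count and pooling size are placeholder values, and it assumes recall_m, precision_m and f1_m from metric-evaluation.py are in scope where the dictionary is built; this is a minimal sketch, not the repository's actual model builder.

import tensorflow as tf

def build_demo_cnn(params):
    # Hypothetical demo model: values not read from params are arbitrary placeholders.
    model = tf.keras.Sequential([
        tf.keras.layers.Conv1D(filters=32,                              # placeholder filter count
                               kernel_size=params['KERNEL_SIZE'][0],    # 4
                               strides=params['STRIDES'][0],            # 3
                               padding=params['PADDING'],               # 'same'
                               activation=params['HIDEN_ACTIVATION_FUNCTION'],
                               input_shape=params['INPUT_SHAPE']),      # (1500, 1) packet vector
        tf.keras.layers.MaxPooling1D(pool_size=2),                      # placeholder pool size
        tf.keras.layers.Dropout(params['DROPOUT']),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(params['DENSE_LAYER'][1],
                              activation=params['DENSE_LAYER_ACTIVATION_FUNCTION'][0]),
        tf.keras.layers.Dense(params['SOFTMAX_LAYER'],
                              activation=params['SOFTMAX_LAYER_ACTIVATION_FUNCTION']),
    ])
    model.compile(optimizer=params['OPTIMIZER'],
                  loss=params['LOSS_FUNCTION'],
                  metrics=params['METRICS'])  # ['accuracy', recall_m, precision_m, f1_m]
    return model

# Example: model = build_demo_cnn(var_function_initializer()); model.summary()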