├── Architecture and Preprocessing ├── CBS-Architecture.svg └── CBS-Preprocessing.svg ├── Bi-LSTM_Traffic_classification .py ├── Bi-LSTM_build_model.py ├── Break_CSV_File.py ├── Break_Data_File.py ├── Dataset ├── ISCX VPN-NonVPN 2016.md └── dataset-more description1.md ├── FC-traffic-classification.py ├── ISCX-Analysis ├── README.md └── session features.py ├── README.md ├── SAE_Traffic_classification.py ├── SAE_build_model.py ├── cnn_Traffic_classification.py ├── cnn_build_model.py ├── compare-accuracy-code.py ├── defin_1D-CNN_model_params.py ├── define_Bi-LSTM_model_params.py ├── define_FC_model_params.py ├── define_GAN_model.py ├── define_SAE_model_params.py ├── define_autoencoder.py ├── extract_header_payload_packets.py ├── fc_build_model.py ├── gausian-compare-accuracy-code.py ├── gausian-validation-train-acc.py ├── gausian-validation-training-loss.py ├── histogram_Dataset.py ├── ip_masking.py ├── load_pcap_datatype.py ├── main.py ├── memory usage-execution time.py ├── metric-evaluation.py ├── network_parameters_initializer.py ├── packet_normalization.py ├── packet_zero_pading.py ├── plot_heatmap_result.py ├── pmf_Dataset.py ├── preprocessing-traffic-label.py ├── print_summary.py ├── process_metadata_pcap.py ├── process_pcap.py ├── read_pcap_files.py ├── transform_pcap_to_dataframe.py └── var_function_initializer.py /Bi-LSTM_Traffic_classification .py: -------------------------------------------------------------------------------- 1 | # in this function we build CNN for Traffic Classification 2 | from memory usage-execution time import measure_execution_memory 3 | from BiLSTM_build_model import BiLSTM_build_model 4 | @measure_execution_memory 5 | def Bi-LSTM_Traffic_classification(root_normalized_dir,net_parameters,model_params): 6 | df_normalized = pd.DataFrame(columns=['packet_normalized_data', 'class_label']) 7 | df_train = pd.DataFrame(columns=['packet_normalized_data']) 8 | binary = "{0:08b}".format(int("1a", 16)) 9 | col_list = ['packet_normalized_data', 'class_label'] 10 | 11 | # list out keys and values separately 12 | key_list = list(net_parameters.keys()) 13 | val_list = list(net_parameters.values()) 14 | 15 | # list out keys and values separately 16 | key_list1 = list(model_params.keys()) 17 | val_list1 = list(model_params.values()) 18 | DENSE_LAYER = val_list1[key_list1.index("dense_neurons")] 19 | # network parameters 20 | BATCH_SIZE = val_list[key_list.index("BATCH_SIZE")] 21 | EPOCH = val_list[key_list.index("EPOCH")] 22 | VERBOSE = val_list[key_list.index("VERBOSE")] 23 | #OPTIMIZER = tf.keras.optimizers.Adam() 24 | VALIDATION_SPLIT = val_list[key_list.index("VALIDATION_SPLIT")] 25 | #NUM_CLASSES = val_list[key_list.index("NUM_CLASSES")] 26 | OPTIMIZER = val_list[key_list.index("OPTIMIZER")] 27 | LOSS_FUNCTION = val_list[key_list.index("LOSS_FUNCTION")] 28 | METRICS = val_list[key_list.index("METRICS")[0], key_list.index("METRICS")[1](),key_list.index("METRICS")[2](),key_list.index("METRICS")[3]()] 29 | DROPOUT = val_list[key_list.index("DROPOUT")] 30 | 31 | for path in os.listdir(root_normalized_dir): 32 | full_path = os.path.join(root_normalized_dir, path) 33 | df = pd.read_csv(full_path, usecols=col_list) 34 | model = Bi-LSTM_build_model(model_params) 35 | model.compile(loss=LOSS_FUNCTION, 36 | optimizer=OPTIMIZER, 37 | metrics=METRICS) 38 | model.summary() 39 | print("this is running Bi-LSTM model: ") 40 | 41 | # train on model 42 | X = df.iloc[:,0:1] # Data 43 | Y = df.iloc[:,1:2] # Label 44 | X_train, X_test, y_train, y_test = train_test_split(X, Y, 
test_size=0.2, random_state=42) 45 | # prepare label of packet for deep NN 46 | train_label_data_list = [] 47 | test_label_data_list = [] 48 | pkt_train_label_data = np.zeros([len(y_train), 1]) 49 | pkt_test_label_data = np.zeros([len(y_test), 1]) 50 | for i in range(len(y_train)): 51 | pkt_train_label_data[i,0] = y_train.iloc[i,0] 52 | #train_label_data_list.append(pkt_train_label_data) 53 | train_label = np.array(pkt_train_label_data) 54 | train_label = train_label[:,0] 55 | train_label = train_label.astype(np.int) 56 | for i in range(len(y_test)): 57 | pkt_test_label_data[i,0] = y_test.iloc[i,0] 58 | #test_label_data_list.append(pkt_test_label_data) 59 | test_label = np.array(pkt_test_label_data) 60 | test_label = test_label[:, 0] 61 | test_label = test_label.astype(np.int) 62 | 63 | #y_train = y_train.to_numpy() 64 | #y_train = y_train.T 65 | # To create a x-by-y-by-z 3D list with initial values: 66 | 67 | data_list = [] 68 | test_list = [] 69 | 70 | pkt_data = np.zeros([len(X_train.iloc[0, 0].split(',')), 1]) 71 | for i in range(len(X_train)): 72 | print("trian preparing data i {}".format(i)) 73 | pkt_train_data = np.zeros([len(X_train.iloc[0, 0].split(',')), 1]) 74 | temp_train_list = X_train.iloc[i, 0].split(',')[:] 75 | for j in range(len(temp_train_list)): 76 | #print("test preparing data j {}".format(j)) 77 | pkt_train_data[j,0] = float(temp_train_list[j]) 78 | data_list.append(pkt_train_data) 79 | 80 | train_data = np.array(data_list) 81 | for i in range(len(X_test)): 82 | print("test preparing data i {}".format(i)) 83 | pkt_test_data = np.zeros([len(X_test.iloc[0, 0].split(',')), 1]) 84 | temp_test_list = X_test.iloc[i, 0].split(',')[:] 85 | for j in range(len(temp_test_list)): 86 | #print("test preparing data j {}".format(j)) 87 | pkt_test_data[j,0] = float(temp_test_list[j]) 88 | test_list.append(pkt_test_data) 89 | test_data = np.array(test_list) 90 | # convert class vectors to binary class matrices 91 | train_label = tf.keras.utils.to_categorical(train_label, NUM_CLASSES) 92 | test_label = tf.keras.utils.to_categorical(test_label, NUM_CLASSES) 93 | 94 | train_data = train_data.reshape((len(train_data), len(X_train.iloc[0,0].split(',')), 1, 1)) 95 | test_data = test_data.reshape((len(test_data), len(X_test.iloc[0, 0].split(',')), 1, 1)) 96 | #model = create_1dcnn_model() 97 | 98 | 99 | model.fit(train_data, train_label, batch_size=BATCH_SIZE, 100 | epochs=EPOCH,verbose=VERBOSE,validation_split= VALIDATION_SPLIT ) 101 | #score = model.evaluate(test_data, test_label, 102 | # batch_size=BATCH_SIZE) 103 | (loss, accuracy, f1_score, precision, recall) = model.evaluate(test_data, test_label, 104 | batch_size=BATCH_SIZE) 105 | score = [] 106 | score[0] = loss 107 | score[1]= accuracy 108 | score[2] = f1_score 109 | score[3] = precision 110 | score[4] = recall 111 | 112 | print("\nTest loss:", score[0]) 113 | print('Test accuracy:', score[1]) 114 | print('Test f1_score:', score[2]) 115 | print('Test precision:', score[3]) 116 | print('Test recall:', score[4]) 117 | saved_models = [] 118 | saved_weights = [] 119 | save_model_weights_dir = 'media/mehdi/linux/normalized_data/' 120 | # save model architecture 121 | model.save(save_model_weights_dir + 'model_architecture_bilstm.h5') 122 | saved_models.append('model_architecture_bilstm.h5') 123 | # save model weights 124 | model.save_weights(save_model_weights_dir + 'model_weights_bilstm.h5') 125 | saved_weights.append('model_weights_bilstm.h5') 126 | # Get the output of the last connected layer 127 | last_dense_output = 
model.layers[-len(DENSE_LAYER)].output 128 | return last_dense_output,saved_models,saved_weights,save_model_weights_dir 129 | 130 | -------------------------------------------------------------------------------- /Bi-LSTM_build_model.py: -------------------------------------------------------------------------------- 1 | def BiLSTM_build_model(params): 2 | 3 | 4 | # Define the model architecture 5 | model = tf.keras.Sequential() 6 | 7 | # Add the Bi-LSTM layers 8 | model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(params['input_shape'][1], return_sequences=True), input_shape=params['input_shape'])) 9 | model.add(tf.keras.layers.BatchNormalization()) 10 | model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(params['input_shape'][1], return_sequences=True))) 11 | model.add(tf.keras.layers.BatchNormalization()) 12 | model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(params['input_shape'][1], return_sequences=True))) 13 | model.add(tf.keras.layers.BatchNormalization()) 14 | 15 | # Add the attention layer 16 | model.add(tf.keras.layers.Attention()) 17 | 18 | # Add the fully connected layers (MLP) 19 | model.add(tf.keras.layers.Dense(params['dense_neurons'][0], activation=params['activation'])) 20 | model.add(tf.keras.layers.BatchNormalization()) 21 | model.add(tf.keras.layers.Dense(params['dense_neurons'][1], activation=params['activation'])) 22 | model.add(tf.keras.layers.BatchNormalization()) 23 | model.add(tf.keras.layers.Dense(params['dense_neurons'][2], activation=params['activation'])) 24 | model.add(tf.keras.layers.BatchNormalization()) 25 | model.add(tf.keras.layers.Dense(params['dense_neurons'][3], activation=params['activation'])) 26 | model.add(tf.keras.layers.BatchNormalization()) 27 | # Save the model 28 | model.save('bilstm_model.h5') 29 | 30 | return model 31 | -------------------------------------------------------------------------------- /Break_CSV_File.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | The code you have provided is a function called Break_CSV_File(). This function breaks a large CSV file into smaller files of a specified size. 4 | 5 | The function first creates a list to store the number of rows in each chunk. 6 | 7 | The function then iterates over the CSV file in chunks of the specified size. For each chunk, the function adds the number of rows in the chunk to the list. 8 | 9 | If the number of chunks is greater than or equal to 2, then the function breaks the CSV file into smaller files. The function creates a new file for each chunk and writes the chunk to the file. 10 | 11 | The function also removes the original CSV file. 12 | 13 | The first line defines the chunks_list variable to store the number of rows in each chunk. 14 | 15 | The next line defines the normalized_dir variable as the directory where the broken CSV files will be saved. 16 | 17 | The next line iterates over the CSV file in chunks of the specified size. For each chunk, the function adds the number of rows in the chunk to the chunks_list variable. 18 | 19 | The next line checks if the number of chunks is greater than or equal to 2. If it is, then the function breaks the CSV file into smaller files. 20 | 21 | The next few lines create a new file for each chunk and writes the chunk to the file. 22 | 23 | The last line removes the original CSV file. 
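A typical call, with purely illustrative file names (the normalized_dir value mirrors the path used elsewhere in this repository), would be:

    Break_CSV_File('media/mehdi/linux/normalized_data/email1.csv', chunk_size=50000, normalized_dir='media/mehdi/linux/normalized_data/')

If email1.csv contains more than 50,000 rows, this writes email1.csv_chunk0, email1.csv_chunk1, ... into normalized_dir and then deletes the copy of email1.csv found there, if any.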
24 | """ 25 | 26 | def Break_CSV_File(filename,chunk_size,normalized_dir): 27 | chunks_list = [] 28 | #normalized_dir = 'media/mehdi/linux/normalized_data/' 29 | for chunk in pd.read_csv(filename, iterator=True, chunksize=chunk_size): 30 | chunks_list.append(len(chunk)) 31 | if(len(chunks_list) >=2): 32 | base_filename = os.path.basename(filename) 33 | for i, chunk in enumerate(pd.read_csv(filename, chunksize=chunk_size)): 34 | chunk.to_csv(normalized_dir + base_filename +'_chunk' + '{}'.format(i), index=False) 35 | if os.path.exists(normalized_dir+os.path.basename(filename)): 36 | os.remove(normalized_dir+os.path.basename(filename)) 37 | print("file with name {} has been chunked and rermoved ".format(base_filename)) 38 | return -------------------------------------------------------------------------------- /Break_Data_File.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a function called Break_Data_File(). This function breaks a large pcap file into smaller files of a specified size. 3 | 4 | The function first creates a directory with the same name as the pcap file. If the directory already exists, then the function deletes all the files in the directory. 5 | 6 | The function then creates a tcpdump script that breaks the pcap file into smaller files of the specified size. The tcpdump script is executed using the os.system() function. 7 | 8 | The function also creates an editcap script that can be used to further break the pcap files into smaller files. However, the editcap script is not executed by the function. 9 | 10 | The first line defines the directory variable as the name of the directory that will be created to store the broken pcap files. The parent_dir variable is the parent directory of the pcap file. The path variable is the full path to the directory that will be created. 11 | 12 | The next four lines check if the directory directory exists. If it does not exist, then the function creates the directory. If the directory does exist, then the function deletes all the files in the directory. 13 | 14 | The next line defines the tcpdump_script variable as the tcpdump script that will be used to break the pcap file into smaller files. The editcap_script variable is defined as the editcap script that can be used to further break the pcap files into smaller files. 15 | 16 | The next line defines the new_filename variable as the name of the broken pcap file. The saved_break_file variable is the full path to the broken pcap file. 17 | 18 | The next line executes the tcpdump_script using the os.system() function. 19 | 20 | The last line commented out executes the editcap_script using the os.system() function. 
21 | """ 22 | 23 | def Break_Data_File(filename,chunk_size): 24 | # create a floder as name as file for breaked files 25 | # Directory 26 | directory = os.path.splitext(os.path.basename(filename))[0] 27 | # Parent Directory path 28 | parent_dir = os.path.dirname(filename) 29 | # Path 30 | path = os.path.join(parent_dir, directory) 31 | # Create the directory 32 | if(os.path.isdir(path) == False): 33 | os.mkdir(path) 34 | else: 35 | for f in os.listdir(path): 36 | os.remove(os.path.join(path, f)) 37 | print("Directory '% s' created" % directory) 38 | # in this section large file greater than chunk size will be breake 39 | tcpdump_script = "" 40 | editcap_script = "" 41 | #new_dir_filename = os.path.dirname(filename) 42 | new_filename = "breaked_" + os.path.splitext(os.path.basename(filename))[0] 43 | #saved_break_file = new_dir_filename + "/" + new_filename 44 | saved_break_file = path + "/" + new_filename 45 | tcpdump_script = "tcpdump -r " + filename + " -w " + saved_break_file + " -C " + str(chunk_size) 46 | editcap_script = "editcap -c 100000 " + filename + " " + saved_break_file 47 | os.system(tcpdump_script) 48 | # os.system(editcap_script) 49 | 50 | -------------------------------------------------------------------------------- /Dataset/ISCX VPN-NonVPN 2016.md: -------------------------------------------------------------------------------- 1 | # ISCX VPN-nonVPN Dataset (ISCXVPN2016) 2 | 3 | This dataset is sourced from the [UNB Cybersecurity Research Group](https://www.unb.ca/cic/) and contains network traffic data for research purposes. The information below is based on the dataset's official website: [ISCX VPN-nonVPN dataset](http://205.174.165.80/CICDataset/ISCX-VPN-NonVPN-2016/). 4 | 5 | ## Dataset Overview 6 | 7 | The ISCX VPN-nonVPN dataset was created to represent real-world network traffic diversity and quantity. The dataset includes accounts for users named Alice and Bob, allowing for the use of various services like Skype, Facebook, and more. It captures both regular and VPN sessions, resulting in a total of 14 traffic categories, including VOIP, P2P, and more. 8 | 9 | ## Traffic Types and Applications 10 | 11 | Here is a list of different types of traffic and applications considered in the dataset: 12 | 13 | 1. **Browsing**: HTTPS traffic generated while users browse the web or perform tasks involving a browser. 14 | 15 | 2. **Email**: Traffic samples generated using a Thunderbird client, configured for mail delivery through SMTP/S and receipt through POP3/SSL or IMAP/SSL. 16 | 17 | 3. **Chat**: Instant messaging applications, including Facebook, Hangouts (via web browsers), Skype, IAM, and ICQ (using the Pidgin application). 18 | 19 | 4. **Streaming**: Multimedia applications requiring a continuous stream of data, including YouTube and Vimeo services using Chrome and Firefox. 20 | 21 | 5. **File Transfer**: Traffic applications designed for sending or receiving files and documents. This includes Skype file transfers, FTP over SSH (SFTP), and FTP over SSL (FTPS) traffic sessions. 22 | 23 | 6. **VoIP**: Voice over IP traffic, encompassing voice calls using Facebook, Hangouts, and Skype. 24 | 25 | 7. **TraP2P**: Identification of file-sharing protocols like BitTorrent, captured by downloading .torrent files from a public repository and using uTorrent and Transmission applications. 26 | 27 | ## Dataset Details 28 | 29 | - **Data Size**: The dataset includes 28GB of captured traffic data. 
30 | 31 | - **VPN Usage**: An external VPN service provider was used for VPN sessions, connected via OpenVPN (UDP mode). 32 | 33 | - **SFTP and FTPS**: For SFTP and FTPS traffic, external service providers and Filezilla as a client were used. 34 | 35 | - **Filtering**: To simplify the labeling process, unnecessary services and applications were closed during traffic capture. Only the objective application (e.g., Skype voice call, SFTP file transfer) was active. 36 | 37 | - **Filtering by IP**: A filter was applied to capture only packets with source or destination IP addresses matching the local client's address (Alice or Bob). 38 | 39 | ## Data Processing 40 | 41 | Scapy is used to read the pcap files and create CSV files based on selected features. 42 | 43 | ## Dataset Availability 44 | 45 | The UNB ISCX Network Traffic (VPN-nonVPN) dataset is available for research purposes. It includes labeled network traffic, full packet data in pcap format, and CSV files (flows generated by ISCXFlowMeter). Researchers can access the dataset through the following link: 46 | 47 | [Dataset Download Link](http://205.174.165.80/CICDataset/ISCX-VPN-NonVPN-2016/) 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /Dataset/dataset-more description1.md: -------------------------------------------------------------------------------- 1 | # ISCX VPN Non-VPN 2016 Dataset 2 | 3 | The "ISCX VPN Non-VPN 2016 dataset" is a network traffic dataset used for research and analysis in the field of network security and intrusion detection. This dataset provides valuable insights into VPN (Virtual Private Network) and non-VPN network traffic. 4 | 5 | ## Origin and Source 6 | 7 | - The dataset was collected as part of the ISCX (Information Security Centre of Excellence) project. 8 | - It contains network traffic data captured in a controlled environment for research purposes. 9 | 10 | ## Purpose 11 | 12 | - The main purpose of this dataset is to facilitate research in the detection of VPN (Virtual Private Network) and non-VPN network traffic. 13 | - Researchers often use it to develop and evaluate intrusion detection systems and network traffic analysis techniques. 14 | 15 | ## Dataset Contents 16 | 17 | - The dataset typically consists of network traffic captures in PCAP (Packet Capture) format. 18 | - It is divided into two main categories: VPN traffic and non-VPN traffic. 19 | - Each category contains network traffic data for various network activities. 20 | 21 | ### VPN Traffic 22 | 23 | - This category includes network traffic generated by VPN connections. VPNs are commonly used for secure and private communication over the internet. 24 | - The dataset may contain VPN traffic for different VPN protocols, such as OpenVPN, PPTP, L2TP, etc. 25 | - VPN traffic often includes encrypted communication, making it challenging to analyze for security purposes. 26 | 27 | ### Non-VPN Traffic 28 | 29 | - This category includes regular network traffic that does not involve VPN connections. 30 | - It can include various types of network activities, such as web browsing, email communication, file transfers, and more. 31 | - Non-VPN traffic is often used as a baseline for comparison when detecting unusual or potentially malicious network behavior. 32 | 33 | ## Use Cases 34 | 35 | Researchers and cybersecurity professionals can use this dataset to: 36 | 37 | - Develop and evaluate intrusion detection systems (IDS) to identify VPN-based attacks or anomalies. 
38 | - Study and analyze network traffic patterns for both VPN and non-VPN scenarios. 39 | - Improve network security by identifying potentially malicious VPN traffic. 40 | 41 | ## Challenges 42 | 43 | - Analyzing VPN traffic can be challenging due to encryption, making it difficult to inspect the content of packets. 44 | - Distinguishing between legitimate VPN usage and malicious activities is a common challenge in network security research. 45 | 46 | ## Privacy and Ethics 47 | 48 | - When using network traffic datasets, it's important to consider privacy and ethical concerns. Care should be taken to ensure that personally identifiable information (PII) or sensitive data is not exposed. 49 | 50 | ## Availability 51 | 52 | - The dataset is publicly available for research purposes, often through academic or research institutions or cybersecurity organizations. 53 | 54 | ## PCAP Files 55 | 56 | The PCAP files in the ISCX VPN-NonVPN 2016 dataset contain the following information for each packet: 57 | 58 | - Timestamp 59 | - Source IP address 60 | - Destination IP address 61 | - Source port 62 | - Destination port 63 | - Protocol 64 | - Packet length 65 | - Packet payload 66 | -------------------------------------------------------------------------------- /FC-traffic-classification.py: -------------------------------------------------------------------------------- 1 | from memory usage-execution time import measure_execution_memory 2 | from fc_build_model import fc_build_model 3 | @measure_execution_memory 4 | def FC-traffic-classification(root_normalized_dir,net_params,model_params, 1d-cnn_path,1d-cnn_model,bi-lstm_path,bi-lstm_model,sae_path,sae_model,sae_directory_features): 5 | 6 | df_normalized = pd.DataFrame(columns=['packet_normalized_data', 'class_label']) 7 | df_train = pd.DataFrame(columns=['packet_normalized_data']) 8 | binary = "{0:08b}".format(int("1a", 16)) 9 | col_list = ['packet_normalized_data', 'class_label'] 10 | 11 | # list out keys and values separately 12 | key_list = list(net_parameters.keys()) 13 | val_list = list(net_parameters.values()) 14 | 15 | # list out keys and values separately 16 | key_list1 = list(model_params.keys()) 17 | val_list1 = list(model_params.values()) 18 | DENSE_LAYER = val_list1[key_list1.index("dense_neurons")] 19 | # network parameters 20 | BATCH_SIZE = val_list[key_list.index("BATCH_SIZE")] 21 | EPOCH = val_list[key_list.index("EPOCH")] 22 | VERBOSE = val_list[key_list.index("VERBOSE")] 23 | #OPTIMIZER = tf.keras.optimizers.Adam() 24 | VALIDATION_SPLIT = val_list[key_list.index("VALIDATION_SPLIT")] 25 | NUM_CLASSES = val_list[key_list.index("NUM_CLASSES")] 26 | OPTIMIZER = val_list[key_list.index("OPTIMIZER")] 27 | LOSS_FUNCTION = val_list[key_list.index("LOSS_FUNCTION")] 28 | METRICS = val_list[key_list.index("METRICS")[0], key_list.index("METRICS")[1](),key_list.index("METRICS")[2](),key_list.index("METRICS")[3]()] 29 | DROPOUT = val_list[key_list.index("DROPOUT")] 30 | #load models weights 31 | 1d-cnn_model.load_weights(1d-cnn_path) 32 | bi-lstm_model.load_weights(bi-lstm_path) 33 | sae_model.load_weights(sae_path) 34 | train_data_list = [] 35 | train_label_list = [] 36 | test_data_list = [] 37 | test_lable_list = [] 38 | sae_train_data_list[] 39 | sae_train_label_list =[] 40 | sae_test_data_list =[] 41 | sae_test_label_list = [] 42 | for path in os.listdir(root_normalized_dir): 43 | full_path = os.path.join(root_normalized_dir, path) 44 | df = pd.read_csv(full_path, usecols=col_list) 45 | 46 | 47 | # train on model 48 | X = df.iloc[:,0:1] 
# Data 49 | Y = df.iloc[:,1:2] # Label 50 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42) 51 | # prepare label of packet for deep NN 52 | train_label_data_list = [] 53 | test_label_data_list = [] 54 | pkt_train_label_data = np.zeros([len(y_train), 1]) 55 | pkt_test_label_data = np.zeros([len(y_test), 1]) 56 | for i in range(len(y_train)): 57 | pkt_train_label_data[i,0] = y_train.iloc[i,0] 58 | #train_label_data_list.append(pkt_train_label_data) 59 | train_label = np.array(pkt_train_label_data) 60 | train_label = train_label[:,0] 61 | train_label = train_label.astype(np.int) 62 | for i in range(len(y_test)): 63 | pkt_test_label_data[i,0] = y_test.iloc[i,0] 64 | #test_label_data_list.append(pkt_test_label_data) 65 | test_label = np.array(pkt_test_label_data) 66 | test_label = test_label[:, 0] 67 | test_label = test_label.astype(np.int) 68 | 69 | #y_train = y_train.to_numpy() 70 | #y_train = y_train.T 71 | # To create a x-by-y-by-z 3D list with initial values: 72 | 73 | data_list = [] 74 | test_list = [] 75 | 76 | pkt_data = np.zeros([len(X_train.iloc[0, 0].split(',')), 1]) 77 | for i in range(len(X_train)): 78 | print("trian preparing data i {}".format(i)) 79 | pkt_train_data = np.zeros([len(X_train.iloc[0, 0].split(',')), 1]) 80 | temp_train_list = X_train.iloc[i, 0].split(',')[:] 81 | for j in range(len(temp_train_list)): 82 | #print("test preparing data j {}".format(j)) 83 | pkt_train_data[j,0] = float(temp_train_list[j]) 84 | data_list.append(pkt_train_data) 85 | 86 | train_data = np.array(data_list) 87 | for i in range(len(X_test)): 88 | print("test preparing data i {}".format(i)) 89 | pkt_test_data = np.zeros([len(X_test.iloc[0, 0].split(',')), 1]) 90 | temp_test_list = X_test.iloc[i, 0].split(',')[:] 91 | for j in range(len(temp_test_list)): 92 | #print("test preparing data j {}".format(j)) 93 | pkt_test_data[j,0] = float(temp_test_list[j]) 94 | test_list.append(pkt_test_data) 95 | test_data = np.array(test_list) 96 | # convert class vectors to binary class matrices 97 | train_label = tf.keras.utils.to_categorical(train_label, NUM_CLASSES) 98 | train_label_list.append(train_label) 99 | test_label = tf.keras.utils.to_categorical(test_label, NUM_CLASSES) 100 | test_label_list.append(test_label) 101 | train_data = train_data.reshape((len(train_data), len(X_train.iloc[0,0].split(',')), 1, 1)) 102 | train_data_list.append(train_data) 103 | test_data = test_data.reshape((len(test_data), len(X_test.iloc[0, 0].split(',')), 1, 1)) 104 | test_data_list.append(test_data) 105 | 106 | # read sae features from specified directory 107 | for path in os.listdir(sae_directory_features): 108 | 109 | full_path = os.path.join(sae_feature_dir, path) 110 | df = pd.read_csv(full_path, usecols=col_list) 111 | 112 | # train on model 113 | X = df.iloc[:,0:1] # Data 114 | Y = df.iloc[:,1:2] # Label 115 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42) 116 | # prepare label of packet for deep NN 117 | train_label_data_list = [] 118 | test_label_data_list = [] 119 | train_label_data = np.zeros([len(y_train), 1]) 120 | test_label_data = np.zeros([len(y_test), 1]) 121 | for i in range(len(y_train)): 122 | train_label_data[i,0] = y_train.iloc[i,0] 123 | #train_label_data_list.append(pkt_train_label_data) 124 | train_label = np.array(train_label_data) 125 | train_label = train_label[:,0] 126 | train_label = train_label.astype(np.int) 127 | for i in range(len(y_test)): 128 | test_label_data[i,0] = y_test.iloc[i,0] 129 | 
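# y_train/y_test hold one integer class id per row; these loops copy the ids into plain
# NumPy arrays so they can be one-hot encoded with to_categorical below (for example, with
# NUM_CLASSES = 12, label 3 becomes [0, 0, 0, 1, 0, ..., 0]).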
#test_label_data_list.append(pkt_test_label_data) 130 | test_label = np.array(test_label_data) 131 | test_label = test_label[:, 0] 132 | test_label = test_label.astype(np.int) 133 | # convert class vectors to binary class matrices 134 | train_label = tf.keras.utils.to_categorical(train_label, NUM_CLASSES) 135 | sae_train_label_list.append(train_label) 136 | test_label = tf.keras.utils.to_categorical(test_label, NUM_CLASSES) 137 | sae_test_label_list.append(test_label) 138 | train_data = train_data.reshape((len(train_data), len(X_train.iloc[0,0].split(',')), 1, 1)) 139 | sae_train_data_list.append(train_data) 140 | test_data = test_data.reshape((len(test_data), len(X_test.iloc[0, 0].split(',')), 1, 1)) 141 | sae_test_data_list.append(test_data) 142 | 143 | 144 | #compile and fit final combined model 145 | model = fc_build_model(model_params) 146 | model.compile(loss=LOSS_FUNCTION, 147 | optimizer=OPTIMIZER, 148 | metrics=METRICS) 149 | model.summary() 150 | print("this is running Bi-LSTM model: ") 151 | # create output of 1D-CNN, Bi-LSTM ans SAE 152 | x_combined = np.concatenate((1d-cnn_model.predict(train_data_list), bi-lstm_model.predict(train_data_list),sae_model(sae_train_data_list), axis=1) 153 | 154 | model.fit(x_combined, train_label, batch_size=BATCH_SIZE, 155 | epochs=EPOCH,verbose=VERBOSE,validation_split= VALIDATION_SPLIT ) 156 | x_combined_test = np.concatenate((1d-cnn_model.predict(test_data_list), bi-lstm_model.predict(test_data_list),sae_model(sae_test_data_list), axis=1) 157 | (loss, accuracy, f1_score, precision, recall) = model.evaluate(x_combined_test, test_label, 158 | batch_size=BATCH_SIZE) 159 | score = [] 160 | score[0] = loss 161 | score[1]= accuracy 162 | score[2] = f1_score 163 | score[3] = precision 164 | score[4] = recall 165 | 166 | print("\nTest loss:", score[0]) 167 | print('Test accuracy:', score[1]) 168 | print('Test f1_score:', score[2]) 169 | print('Test precision:', score[3]) 170 | print('Test recall:', score[4]) 171 | -------------------------------------------------------------------------------- /ISCX-Analysis/README.md: -------------------------------------------------------------------------------- 1 | # Network Traffic Session Features Extractor 2 | 3 | The code provided in `session_features.py` extracts statistical features from network traffic sessions in PCAP files and saves them to a CSV file. These features provide valuable insights into network traffic characteristics. The extracted statistical features include: 4 | 5 | - **Interarrival Times**: Minimum, maximum, median, and standard deviation of the time between packets in a session. 6 | - **Packet Lengths**: Minimum, maximum, median, and standard deviation of the packet lengths in a session. 7 | - **Payload Sizes**: Minimum, maximum, median, and standard deviation of the payload sizes in a session. 8 | - **Session Duration**: Total time of the session. 9 | - **Active Time**: Total time that packets were being sent or received in the session. 10 | - **Idle Time**: Total time that there was no packet activity in the session. 11 | - **Packet Truncation**: Number of packets that were truncated in the session. 12 | - **Total Packets**: Total number of packets in the session. 13 | - **Bytes per Second**: Average number of bytes transmitted per second in the session. 14 | - **Packets per Second**: Average number of packets transmitted per second in the session. 15 | 16 | ## How It Works 17 | 18 | 1. **Data Preparation**: The code begins by reading PCAP files and extracting individual packets. 
19 | 20 | 2. **Session Identification**: It groups the packets into sessions based on specific session attributes such as source IP address, destination IP address, source port, and destination port. Each session represents a distinct flow of network traffic. 21 | 22 | 3. **Feature Extraction**: For each session, the code calculates the statistical features listed above. 23 | 24 | 4. **CSV File Output**: The extracted statistical features are saved to a CSV file for further analysis and utilization. 25 | 26 | ## Use Cases 27 | 28 | This code can be used for various network-related purposes, including: 29 | 30 | - **Network Traffic Analysis**: The statistical features enable the identification of patterns and trends in network traffic. This information can be used to improve network performance and security. 31 | 32 | - **Network Intrusion Detection**: The extracted features can serve as inputs to machine learning models for detecting malicious network traffic or anomalies. 33 | 34 | - **Network Performance Monitoring**: By analyzing the statistics, you can monitor the performance of network applications and services, helping with troubleshooting and optimization. 35 | 36 | ## Conclusion 37 | 38 | The `session_features.py` code provides a valuable tool for extracting statistical features from network traffic sessions. These features can be utilized for a wide range of network analysis and monitoring tasks, ultimately contributing to better network performance and security. 39 | 40 | -------------------------------------------------------------------------------- /ISCX-Analysis/session features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from scapy.all import rdpcap 4 | from collections import defaultdict 5 | import statistics 6 | import numpy as np 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.neural_network import MLPClassifier 10 | from sklearn.metrics import accuracy_score 11 | 12 | # Function to calculate statistical features 13 | def calculate_statistics(values): 14 | if not values: 15 | return None, None, None, None # Return None for min, max, median, and standard deviation if the list is empty 16 | return min(values), max(values), statistics.median(values), statistics.stdev(values) 17 | 18 | # Initialize data structures 19 | session_data = defaultdict(list) 20 | 21 | # Replace 'path/to/your/dataset/folder' with the actual path to your dataset folder 22 | dataset_folder = 'path/to/your/dataset/folder' 23 | output_csv = 'session_stats.csv' 24 | 25 | # List of PCAP files in the dataset folder 26 | pcap_files = [file for file in os.listdir(dataset_folder) if file.endswith('.pcap')] 27 | 28 | # Loop through the PCAP files 29 | for pcap_file in pcap_files: 30 | pcap_path = os.path.join(dataset_folder, pcap_file) 31 | packets = rdpcap(pcap_path) 32 | current_session = [] 33 | last_time = None 34 | 35 | # Extract label from file name (assuming filenames are formatted as "label_filename.pcap") 36 | label = int(pcap_file.split('_')[0]) 37 | 38 | # Loop through the packets in the PCAP file 39 | for packet in packets: 40 | if 'IP' in packet and 'TCP' in packet: 41 | session_key = (packet['IP'].src, packet['IP'].dst, packet['TCP'].sport, packet['TCP'].dport) 42 | if not current_session: 43 | current_session.append(packet) 44 | else: 45 | inter_arrival_time = packet.time - last_time 46 | current_session.append(packet) 47 | if 
packet['TCP'].flags & 0x02: # Check if it's a SYN packet (start of a new session) 48 | if session_key not in session_data: 49 | session_data[session_key] = { 50 | 'InterarrivalTimes': [], 51 | 'PacketLengths': [], 52 | 'PayloadSizes': [], 53 | 'SessionDuration': 0, 54 | 'Label': label 55 | } 56 | session_data[session_key]['InterarrivalTimes'].append(inter_arrival_time) 57 | session_data[session_key]['PacketLengths'].append(len(packet)) 58 | if 'Raw' in packet: 59 | session_data[session_key]['PayloadSizes'].append(len(packet['Raw'])) 60 | current_session = [] 61 | last_time = packet.time 62 | 63 | # Initialize lists to store the calculated features 64 | session_features = [] 65 | 66 | # Loop through the sessions and calculate the features 67 | for session_key, session_info in session_data.items(): 68 | interarrival_min, interarrival_max, interarrival_median, interarrival_std = calculate_statistics(session_info['InterarrivalTimes']) 69 | packet_length_min, packet_length_max, packet_length_median, packet_length_std = calculate_statistics(session_info['PacketLengths']) 70 | payload_size_min, payload_size_max, payload_size_median, payload_size_std = calculate_statistics(session_info['PayloadSizes']) 71 | session_duration = sum(session_info['InterarrivalTimes']) 72 | active_time_min, active_time_max, active_time_median, active_time_std = calculate_statistics([session_duration]) 73 | idle_time_min, idle_time_max, idle_time_median, idle_time_std = calculate_statistics([session_duration - sum(session_info['InterarrivalTimes'])]) 74 | total_packets = len(session_info['InterarrivalTimes']) + 1 # Adding 1 to account for the first packet 75 | packet_truncation = total_packets - len(session_info['InterarrivalTimes']) - 1 # Subtracting 1 to account for the last packet 76 | 77 | bytes_per_second = sum(session_info['PacketLengths']) / session_duration 78 | packets_per_second = total_packets / session_duration 79 | 80 | session_features.append({ 81 | 'Min_Interarrival': interarrival_min, 82 | 'Max_Interarrival': interarrival_max, 83 | 'Median_Interarrival': interarrival_median, 84 | 'Std_Interarrival': interarrival_std, 85 | 'Min_Packet_Length': packet_length_min, 86 | 'Max_Packet_Length': packet_length_max, 87 | 'Median_Packet_Length': packet_length_median, 88 | 'Std_Packet_Length': packet_length_std, 89 | 'Min_Payload_Size': payload_size_min, 90 | 'Max_Payload_Size': payload_size_max, 91 | 'Median_Payload_Size': payload_size_median, 92 | 'Std_Payload_Size': payload_size_std, 93 | 'Min_Active_Time': active_time_min, 94 | 'Max_Active_Time': active_time_max, 95 | 'Median_Active_Time': active_time_median, 96 | 'Std_Active_Time': active_time_std, 97 | 'Min_Idle_Time': idle_time_min, 98 | 'Max_Idle_Time': idle_time_max, 99 | 'Median_Idle_Time': idle_time_median, 100 | 'Std_Idle_Time': idle_time_std, 101 | 'Packet_Truncation': packet_truncation, 102 | 'Total_Packets': total_packets, 103 | 'Session_Duration': session_duration, 104 | 'Bytes_Per_Second': bytes_per_second, 105 | 'Packets_Per_Second': packets_per_second, 106 | 'Label': session_info['Label'] 107 | }) 108 | 109 | # Create a DataFrame from the session features 110 | df = pd.DataFrame(session_features) 111 | 112 | # Save the statistical features to a CSV file 113 | df.to_csv(output_csv, index=False) 114 | 115 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CBS - Encrypted Traffic Classification using Deep Learning 2 | 3 | 
## Overview 4 | 5 | CBS is a platform designed and implemented for encrypted traffic classification using deep learning. The goal of this project is to accurately classify network traffic into different traffic types and applications, such as email, P2P, Skype, and more. 6 | 7 | ## General Algorithm 8 | 9 | The general algorithm for performing classification is as follows: 10 | 11 | 1. **Data Extraction:** PCAP files containing packets for the relevant traffic types and applications are extracted from the dataset. 12 | 13 | 2. **Preprocessing:** The extracted PCAP files undergo preprocessing, which includes removing unused packets such as DNS and DHCP traffic and other non-essential data. 14 | 15 | 3. **Feature Extraction:** Important parts of the payload and header of each packet are extracted, and a fixed-length record (e.g., 1,500 bytes) is created. These records form the new dataset used for learning. 16 | 17 | 4. **Data Augmentation:** To address imbalances in the dataset, especially when some traffic types have fewer samples than others, Generative Adversarial Networks (GANs) are used to synthesize new samples. 18 | 19 | 5. **Spatial Feature Extraction:** Spatial features are extracted from the data using a 1D Convolutional Neural Network (1D-CNN). 20 | 21 | 6. **Temporal Feature Extraction:** Temporal features are extracted using an attention-based Bidirectional Long Short-Term Memory (Bi-LSTM) network. 22 | 23 | 7. **Statistical Feature Extraction:** Statistical features are extracted through a Stacked Autoencoder (SAE). 24 | 25 | 8. **Feature Aggregation:** The outputs of the 1D-CNN, attention Bi-LSTM, and SAE are aggregated and fed into a fully connected neural network (a minimal sketch of this fusion step follows below). 26 | 27 | 9. **Classification:** The fully connected network learns from the aggregated features and performs the final classification into 12 traffic types (e.g., email, P2P) and 17 applications (e.g., Skype, Vimeo).
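To make steps 8 and 9 concrete, here is a minimal, self-contained Keras sketch of the fusion classifier. The feature dimensions and layer widths below are illustrative assumptions loosely inspired by the parameter files in this repository (e.g., `define_FC_model_params.py`); they are not the exact configuration used by `FC-traffic-classification.py`.

```python
import numpy as np
import tensorflow as tf

# Illustrative sizes only; the real dimensions come from the trained
# 1D-CNN, attention Bi-LSTM, and SAE models in this repository.
CNN_DIM, LSTM_DIM, SAE_DIM, NUM_CLASSES = 200, 74, 64, 12

def build_fusion_classifier():
    cnn_in = tf.keras.Input(shape=(CNN_DIM,), name="cnn_features")
    lstm_in = tf.keras.Input(shape=(LSTM_DIM,), name="bilstm_features")
    sae_in = tf.keras.Input(shape=(SAE_DIM,), name="sae_features")

    # Step 8: aggregate the three feature vectors into one representation.
    x = tf.keras.layers.Concatenate()([cnn_in, lstm_in, sae_in])

    # Step 9: fully connected layers followed by a softmax classifier.
    x = tf.keras.layers.Dense(512, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.4)(x)
    x = tf.keras.layers.Dense(256, activation="relu")(x)
    out = tf.keras.layers.Dense(NUM_CLASSES, activation="softmax")(x)

    model = tf.keras.Model([cnn_in, lstm_in, sae_in], out)
    model.compile(optimizer="adam", loss="categorical_crossentropy",
                  metrics=["accuracy"])
    return model

if __name__ == "__main__":
    model = build_fusion_classifier()
    # Random stand-in features, just to check the shapes end to end.
    feats = [np.random.rand(8, d).astype("float32")
             for d in (CNN_DIM, LSTM_DIM, SAE_DIM)]
    labels = tf.keras.utils.to_categorical(
        np.random.randint(0, NUM_CLASSES, 8), NUM_CLASSES)
    model.fit(feats, labels, epochs=1, verbose=0)
```

In the actual pipeline, the three input vectors would come from the saved 1D-CNN, attention Bi-LSTM, and SAE models rather than from random data.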
28 | -------------------------------------------------------------------------------- /SAE_Traffic_classification.py: -------------------------------------------------------------------------------- 1 | # in this function we build CNN for Traffic Classification 2 | from memory usage-execution time import measure_execution_memory 3 | from SAE_build_model import SAE_build_model 4 | @measure_execution_memory 5 | def SAE_Traffic_classification(sae_feature_dir,net_parameters,model_params): 6 | 7 | df_normalized = pd.DataFrame(columns=['sae_normalized_features', 'class_label']) 8 | df_train = pd.DataFrame(columns=['sae_normalized_features']) 9 | binary = "{0:08b}".format(int("1a", 16)) 10 | col_list = ['sae_normalized_features', 'class_label'] 11 | 12 | 13 | 14 | 15 | for path in os.listdir(sae_feature_dir): 16 | 17 | full_path = os.path.join(sae_feature_dir, path) 18 | df = pd.read_csv(full_path, usecols=col_list) 19 | 20 | # train on model 21 | X = df.iloc[:,0:1] # Data 22 | Y = df.iloc[:,1:2] # Label 23 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42) 24 | # prepare label of packet for deep NN 25 | train_label_data_list = [] 26 | test_label_data_list = [] 27 | train_label_data = np.zeros([len(y_train), 1]) 28 | test_label_data = np.zeros([len(y_test), 1]) 29 | for i in range(len(y_train)): 30 | train_label_data[i,0] = y_train.iloc[i,0] 31 | #train_label_data_list.append(pkt_train_label_data) 32 | train_label = np.array(train_label_data) 33 | train_label = train_label[:,0] 34 | train_label = train_label.astype(np.int) 35 | for i in range(len(y_test)): 36 | test_label_data[i,0] = y_test.iloc[i,0] 37 | #test_label_data_list.append(pkt_test_label_data) 38 | test_label = np.array(test_label_data) 39 | test_label = test_label[:, 0] 40 | test_label = test_label.astype(np.int) 41 | # convert class vectors to binary class matrices 42 | train_label = tf.keras.utils.to_categorical(train_label, NUM_CLASSES) 43 | test_label = tf.keras.utils.to_categorical(test_label, NUM_CLASSES) 44 | 45 | train_data = train_data.reshape((len(train_data), len(X_train.iloc[0,0].split(',')), 1, 1)) 46 | test_data = test_data.reshape((len(test_data), len(X_test.iloc[0, 0].split(',')), 1, 1)) 47 | stack_autoencoder_list,code_layer_output = SAE_build_model(model_params,net_parameters,train_data,test_data,train_label,test_label,LOSS_FUNCTION,OPTIMIZER,METRICS) 48 | 49 | 50 | 51 | 52 | 53 | saved_models = [] 54 | saved_weights = [] 55 | save_model_weights_dir = 'media/mehdi/linux/normalized_data/' 56 | for i, model in enumerate(stack_autoencoder_list): 57 | # save model architecture 58 | tf.keras.models.save_model(model, save_model_weights_dir + f'model_architecture_sae{i}.h5') 59 | saved_models.append(f'model_architecture_sae{i}.h5') 60 | tf.keras.models.save_weights(model, save_model_weights_dir + f'model_weights_sae{i}.h5') 61 | saved_weights.append(f'model_weights_sae{i}.h5') 62 | # save model weights 63 | return code_layer_output,save_models,saved_weights, save_model_weights_dir 64 | -------------------------------------------------------------------------------- /SAE_build_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | import pandas as pd 4 | 5 | def SAE_build_moel(params,net_parameters,train_data,test_data,train_label,test_label,LOSS_FUNCTION,OPTIMIZER,METRIC): 6 | 7 | encoder_neurons_sae = params['encoder_neurons_sae'] 8 | code_neurons_sae = 
params['code_neurons_sae'] 9 | decoder_neurons_sae = params['decoder_neurons_sae'] 10 | output_neurons_sae = params['output_neurons_sae'] 11 | autoencoders = [] # Create an empty list to store the output of define_autoencoder 12 | # list out keys and values separately 13 | key_list = list(net_parameters.keys()) 14 | val_list = list(net_parameters.values()) 15 | # network parameters 16 | BATCH_SIZE = val_list[key_list.index("BATCH_SIZE")] 17 | EPOCH = val_list[key_list.index("EPOCH")] 18 | VERBOSE = val_list[key_list.index("VERBOSE")] 19 | #OPTIMIZER = tf.keras.optimizers.Adam() 20 | VALIDATION_SPLIT = val_list[key_list.index("VALIDATION_SPLIT")] 21 | #NUM_CLASSES = val_list[key_list.index("NUM_CLASSES")] 22 | OPTIMIZER = val_list[key_list.index("OPTIMIZER")] 23 | LOSS_FUNCTION = val_list[key_list.index("LOSS_FUNCTION")] 24 | METRICS = val_list[key_list.index("METRICS")[0], key_list.index("METRICS")[1](),key_list.index("METRICS")[2](),key_list.index("METRICS")[3]()] 25 | DROPOUT = val_list[key_list.index("DROPOUT")] 26 | stack = [] 27 | 28 | # Process each parameter individually 29 | for i in range(6): 30 | encoder_neurons = encoder_neurons_sae[i] 31 | code_neurons = code_neurons_sae[i] 32 | decoder_neurons = decoder_neurons_sae[i] 33 | output_neurons = output_neurons_sae[i] 34 | # Call the autoencoder function with each set of parameters 35 | autoencoder = define_autoencoder(encoder_neurons, code_neurons, decoder_neurons, output_neurons) 36 | autoencoders.append(autoencoder) # Append the output to the list 37 | for model in autoencoders: 38 | model.compile(loss=LOSS_FUNCTION, 39 | optimizer=OPTIMIZER, 40 | metrics=METRICS) 41 | model.summary() 42 | if(autoencoders.index(model) == 1): 43 | stack.append([autoencoders.index(model)]) = model.fit(train_data, train_data, batch_size=BATCH_SIZE, 44 | epochs=EPOCH,verbose=VERBOSE, validation_data=(test_data, test_data) ) 45 | else: 46 | temp_input = model.predict(train_data) 47 | temp_input = np.concatenate((temp_input , train_data)) 48 | stack.append([autoencoders.index(model)]) = model.fit(temp_input, temp_input, batch_size=BATCH_SIZE, 49 | epochs=EPOCH,verbose=VERBOSE, validation_data=(test_data, test_data) ) 50 | train_data = temp_input 51 | return stack, stack[6].get_layer('code').output) -------------------------------------------------------------------------------- /cnn_Traffic_classification.py: -------------------------------------------------------------------------------- 1 | # in this function we build CNN for Traffic Classification 2 | from memory usage-execution time import measure_execution_memory 3 | from cnn_build_model import cnn_build_model 4 | @measure_execution_memory 5 | def cnn_Traffic_classification(root_normalized_dir,net_parameters,model_parameters): 6 | df_normalized = pd.DataFrame(columns=['packet_normalized_data', 'class_label']) 7 | df_train = pd.DataFrame(columns=['packet_normalized_data']) 8 | binary = "{0:08b}".format(int("1a", 16)) 9 | col_list = ['packet_normalized_data', 'class_label'] 10 | 11 | # list out keys and values separately 12 | key_list = list(net_parameters.keys()) 13 | val_list = list(net_parameters.values()) 14 | 15 | # list out keys and values separately 16 | key_list1 = list(model_parameters.keys()) 17 | val_list1 = list(model_parameters.values()) 18 | DENSE_LAYER = val_list1[key_list1.index("DENSE_LAYER")] 19 | 20 | # network parameters 21 | BATCH_SIZE = val_list[key_list.index("BATCH_SIZE")] 22 | EPOCH = val_list[key_list.index("EPOCH")] 23 | VERBOSE = val_list[key_list.index("VERBOSE")] 24 | 
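# NOTE: the METRICS entry looked up from net_parameters a few lines below is expected to
# resolve to one metric per value unpacked from model.evaluate() later in this function,
# e.g. something like ['accuracy', f1_metric, tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
# where f1_metric stands for whatever custom F1 implementation network_parameters_initializer.py supplies.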
#OPTIMIZER = tf.keras.optimizers.Adam() 25 | VALIDATION_SPLIT = val_list[key_list.index("VALIDATION_SPLIT")] 26 | NUM_CLASSES = val_list[key_list.index("NUM_CLASSES")] 27 | OPTIMIZER = val_list[key_list.index("OPTIMIZER")] 28 | LOSS_FUNCTION = val_list[key_list.index("LOSS_FUNCTION")] 29 | METRICS = val_list[key_list.index("METRICS")[0], key_list.index("METRICS")[1](),key_list.index("METRICS")[2](),key_list.index("METRICS")[3]()] 30 | DROPOUT = val_list[key_list.index("DROPOUT")] 31 | 32 | for path in os.listdir(root_normalized_dir): 33 | full_path = os.path.join(root_normalized_dir, path) 34 | df = pd.read_csv(full_path, usecols=col_list) 35 | model = cnn_build_model(model_parameters) 36 | model.compile(loss=LOSS_FUNCTION, 37 | optimizer=OPTIMIZER, 38 | metrics=METRICS) 39 | model.summary() 40 | print("this is running 1D-CNN model: ") 41 | 42 | # train on model 43 | X = df.iloc[:,0:1] # Data 44 | Y = df.iloc[:,1:2] # Label 45 | X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42) 46 | # prepare label of packet for deep NN 47 | train_label_data_list = [] 48 | test_label_data_list = [] 49 | pkt_train_label_data = np.zeros([len(y_train), 1]) 50 | pkt_test_label_data = np.zeros([len(y_test), 1]) 51 | for i in range(len(y_train)): 52 | pkt_train_label_data[i,0] = y_train.iloc[i,0] 53 | #train_label_data_list.append(pkt_train_label_data) 54 | train_label = np.array(pkt_train_label_data) 55 | train_label = train_label[:,0] 56 | train_label = train_label.astype(np.int) 57 | for i in range(len(y_test)): 58 | pkt_test_label_data[i,0] = y_test.iloc[i,0] 59 | #test_label_data_list.append(pkt_test_label_data) 60 | test_label = np.array(pkt_test_label_data) 61 | test_label = test_label[:, 0] 62 | test_label = test_label.astype(np.int) 63 | 64 | #y_train = y_train.to_numpy() 65 | #y_train = y_train.T 66 | # To create a x-by-y-by-z 3D list with initial values: 67 | 68 | data_list = [] 69 | test_list = [] 70 | 71 | pkt_data = np.zeros([len(X_train.iloc[0, 0].split(',')), 1]) 72 | for i in range(len(X_train)): 73 | print("trian preparing data i {}".format(i)) 74 | pkt_train_data = np.zeros([len(X_train.iloc[0, 0].split(',')), 1]) 75 | temp_train_list = X_train.iloc[i, 0].split(',')[:] 76 | for j in range(len(temp_train_list)): 77 | #print("test preparing data j {}".format(j)) 78 | pkt_train_data[j,0] = float(temp_train_list[j]) 79 | data_list.append(pkt_train_data) 80 | 81 | train_data = np.array(data_list) 82 | for i in range(len(X_test)): 83 | print("test preparing data i {}".format(i)) 84 | pkt_test_data = np.zeros([len(X_test.iloc[0, 0].split(',')), 1]) 85 | temp_test_list = X_test.iloc[i, 0].split(',')[:] 86 | for j in range(len(temp_test_list)): 87 | #print("test preparing data j {}".format(j)) 88 | pkt_test_data[j,0] = float(temp_test_list[j]) 89 | test_list.append(pkt_test_data) 90 | test_data = np.array(test_list) 91 | # convert class vectors to binary class matrices 92 | train_label = tf.keras.utils.to_categorical(train_label, NUM_CLASSES) 93 | test_label = tf.keras.utils.to_categorical(test_label, NUM_CLASSES) 94 | 95 | train_data = train_data.reshape((len(train_data), len(X_train.iloc[0,0].split(',')), 1, 1)) 96 | test_data = test_data.reshape((len(test_data), len(X_test.iloc[0, 0].split(',')), 1, 1)) 97 | #model = create_1dcnn_model() 98 | 99 | 100 | model.fit(train_data, train_label, batch_size=BATCH_SIZE, 101 | epochs=EPOCH,verbose=VERBOSE,validation_split= VALIDATION_SPLIT ) 102 | #score = model.evaluate(test_data, test_label, 103 | # 
batch_size=BATCH_SIZE) 104 | (loss, accuracy, f1_score, precision, recall) = model.evaluate(test_data, test_label, 105 | batch_size=BATCH_SIZE) 106 | score = [] 107 | score[0] = loss 108 | score[1]= accuracy 109 | score[2] = f1_score 110 | score[3] = precision 111 | score[4] = recall 112 | 113 | print("\nTest loss:", score[0]) 114 | print('Test accuracy:', score[1]) 115 | print('Test f1_score:', score[2]) 116 | print('Test precision:', score[3]) 117 | print('Test recall:', score[4]) 118 | 119 | saved_models = [] 120 | saved_weights = [] 121 | save_model_weights_dir = 'media/mehdi/linux/normalized_data/' 122 | # save model architecture 123 | model.save(save_model_weights_dir + 'model_architecture_cnn.h5') 124 | saved_models.append(save_model_weights_dir +'model_architecture_cnn.h5') 125 | # save model weights 126 | model.save_weights('model_weights_cnn.h5') 127 | saved_weights.append('model_weights_cnn.h5') 128 | # Get the output of the last connected layer 129 | last_dense_output = model.layers[-DENSE_LAYER[0]].output 130 | 131 | print(f"GAN training completed in {execution_time:.2f} seconds ({execution_time_minutes:.2f} minutes).") 132 | return last_dense_output,saved_models,saved_weights,save_model_weights_dir 133 | -------------------------------------------------------------------------------- /cnn_build_model.py: -------------------------------------------------------------------------------- 1 | def cnn_build_model(parameters): 2 | # list out keys and values separately 3 | key_list = list(parameters.keys()) 4 | val_list = list(parameters.values()) 5 | #FILTERS = val_list[key_list.index("FILTERS")] 6 | KERNEL_SIZE = val_list[key_list.index("KERNEL_SIZE")] 7 | STRIDES = val_list[key_list.index("STRIDES")] 8 | PADDING = val_list[key_list.index("PADDING")] 9 | POOL-TYPE = val_list[key_list.index("POOL-TYPE")] 10 | POOL_SIZE = val_list[key_list.index("POOL_SIZE")] 11 | POOL_STRIDE = val_list[key_list.index("POOL_STRIDE")] 12 | HIDEN_ACTIVATION_FUNCTION = val_list[key_list.index("HIDEN_ACTIVATION_FUNCTION")] 13 | OUTPUT_ACTIVATION_FUNCTION = val_list[key_list.index("OUTPUT_ACTIVATION_FUNCTION")] 14 | INPUT_DATA_SHAPE = val_list[key_list.index("INPUT_SHAPE")] 15 | INPUT_SHAPE = (val_list[key_list.index("INPUT_SHAPE")][0],val_list[key_list.index("INPUT_SHAPE")][1],1) 16 | CNN_LAYER_SPEC = val_list[key_list.index("CNN_LAYER_SPEC")] 17 | DENSE_LAYER = val_list[key_list.index("DENSE_LAYER")] 18 | DENSE_LAYER_ACTIVATION_FUNCTION = val_list[key_list.index("DENSE_LAYER_ACTIVATION_FUNCTION")] 19 | SOFTMAX_LAYER = val_list[key_list.index("SOFTMAX_LAYER")] 20 | SOFTMAX_LAYER_ACTIVATION_FUNCTION = val_list[key_list.index("SOFTMAX_LAYER_ACTIVATION_FUNCTION")] 21 | 22 | 23 | model = ks.models.Sequential() 24 | # this CNN has been implemented based on DEEP PACKET Paper 25 | for i in range(CNN_LAYER_SPEC[0]): 26 | if i == 0 : 27 | model.add(ks.layers.Convolution2D(CNN_LAYER_SPEC[i+1], (KERNEL_SIZE[i],KERNEL_SIZE[i+1] ),padding=PADDING, 28 | strides=(STRIDES[i],STRIDES[i+1]),activation = HIDEN_ACTIVATION_FUNCTION, input_shape=INPUT_SHAPE)) 29 | model.add(tf.keras.layers.BatchNormalization()) 30 | else: 31 | 32 | model.add(ks.layers.Convolution2D(CNN_LAYER_SPEC[i+1],(KERNEL_SIZE[2*i],KERNEL_SIZE[2*i+1] ) ,padding = PADDING, 33 | strides=(STRIDES[2*i],STRIDES[2*i+1]),activation=HIDEN_ACTIVATION_FUNCTION)) 34 | model.add(tf.keras.layers.BatchNormalization()) 35 | # Add AveragePooling1D layer based on POOL-TYPE if 'POOL-TYPE' is defined 36 | if 'POOL-TYPE' in parameters and parameters['POOL-TYPE'] == 
AveragePooling1D: 37 | model.add(AveragePooling1D(pool_size= (POOL_SIZE[0],POOL_SIZE[1]), strides= (POOL_STRIDE[0],POOL_STRIDE[1]))) 38 | 39 | 40 | # Flatten => RELU layers 41 | model.add(ks.layers.Flatten()) 42 | # Dense Connected Layer 43 | for i in range(DENSE_LAYER[0]): 44 | model.add(ks.layers.Dense(DENSE_LAYER[i+1], activation=DENSE_LAYER_ACTIVATION_FUNCTION[i])) 45 | model.add(tf.keras.layers.BatchNormalization()) 46 | 47 | 48 | return model 49 | -------------------------------------------------------------------------------- /compare-accuracy-code.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from scipy.signal import savgol_filter 5 | from google.colab import files 6 | 7 | # Upload Excel file 8 | uploaded = files.upload() 9 | 10 | # Read Excel file into a DataFrame 11 | df = pd.read_excel(next(iter(uploaded))) 12 | 13 | # Define line styles and symbols for each column 14 | line_styles = ['-', '--', ':'] 15 | #symbols = ['s', 'o', '^'] 16 | columns = ['CBS', 'CSCNN-[79]', 'Datanet-[19]'] 17 | 18 | # Plotting 19 | plt.figure(figsize=(10, 6)) 20 | 21 | for i, column in enumerate(columns): 22 | x = df['Epoch'] 23 | y = df[column] 24 | 25 | # Apply smoothing filter 26 | #y_smooth = savgol_filter(y, window_length=5, polyorder=2) 27 | # Apply Gaussian smoothing filter 28 | y_smooth = gaussian_filter1d(y, sigma=1) 29 | 30 | # Plot line with specific style and symbol 31 | plt.plot(x, y_smooth, linestyle=line_styles[i],linewidth=3,label=column) 32 | 33 | # Add legend and labels 34 | plt.legend() 35 | plt.xlabel('Epoch') 36 | plt.ylabel('Accuracy [%]') 37 | # Set x-axis spacing to 2 38 | #plt.xticks(np.arange(min(x), max(x)+1, 2)) 39 | # Set x-axis spacing to 2 40 | plt.xticks(np.arange(min(x), 21, 1)) 41 | #plt.title('Line Plot') 42 | # Save the plot as SVG 43 | output_file = input("Please provide the output file name (e.g., plot.svg): ") 44 | plt.savefig(output_file, format='svg') 45 | print("Line plot saved as SVG.") 46 | 47 | # Download the saved SVG file 48 | files.download(output_file) 49 | 50 | # Show the plot 51 | plt.show() -------------------------------------------------------------------------------- /defin_1D-CNN_model_params.py: -------------------------------------------------------------------------------- 1 | 2 | from tensorflow.keras.layers import AveragePooling1D 3 | def defin_1D-CNN_model_params(): 4 | size_list = [4,1,5,1] 5 | stride_list = [3,1,1,1] 6 | parameters_dict = {} 7 | parameters_dict['DROPOUT'] = 0.12 8 | parameters_dict['KERNEL_SIZE'] = [] 9 | parameters_dict['FILTERS'] = 2 10 | parameters_dict['STRIDES'] = [] 11 | parameters_dict['PADDING'] = 'same' 12 | # Add a key name 'POOL-TYPE' to the parameters_dict dictionary and put its value as AveragePooling1D 13 | parameters_dict['POOL-TYPE'] = AveragePooling1D 14 | parameters_dict['POOL_SIZE'] = (2,1) 15 | parameters_dict['POOL_STRIDE'] = (2,1) 16 | parameters_dict['INPUT_SHAPE'] = (1500,1) 17 | parameters_dict['CNN_LAYER_SPEC'] = (2,200,200) 18 | parameters_dict['DENSE_LAYER'] = (2,300,200) 19 | for i in range(parameters_dict['FILTERS']): 20 | parameters_dict['KERNEL_SIZE'].append(size_list[2*i]) 21 | parameters_dict['KERNEL_SIZE'].append(size_list[2*i+1]) 22 | parameters_dict['STRIDES'].append(stride_list[2*i]) 23 | parameters_dict['STRIDES'].append(stride_list[2*i+1]) 24 | return parameters_dict 25 | -------------------------------------------------------------------------------- 
/define_Bi-LSTM_model_params.py:
--------------------------------------------------------------------------------
1 | def define_Bi-LSTM_model_params():
2 |     # Define the model parameters as a dictionary
3 |     params = {
4 |         'input_shape': (1, 1500),  # Shape of input data [1*1500]
5 |         'activation': 'relu',  # Activation function for the fully connected layers
6 |         'dense_neurons': [1024, 512, 256, 74]  # Number of neurons in each dense layer
7 |     }
8 |     return params
--------------------------------------------------------------------------------
/define_FC_model_params.py:
--------------------------------------------------------------------------------
1 | def define_FC_model_params():
2 |     parameters = {
3 |         "input_shape": 1500,
4 |         "first_layer": 1300,
5 |         "second_layer": 1000,
6 |         "third_layer": 512,
7 |         "fourth_layer": 256,
8 |         "num_classes": 12,
9 |         "dropout_rate": 0.4,
10 |     }
11 |     return parameters
--------------------------------------------------------------------------------
/define_GAN_model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import pandas as pd
4 | import tensorflow as tf
5 | from tensorflow.keras import layers
6 | from tensorflow.keras.models import Sequential
7 | 
8 | from memory usage-execution time import measure_execution_memory
9 | 
10 | # Define the Generator model
11 | def build_generator(latent_dim):
12 | 
13 |     generator = tf.keras.Sequential()
14 |     generator.add(layers.Conv1D(256, kernel_size=3, activation=tf.keras.layers.LeakyReLU(), input_shape=(1, 1500)))
15 |     generator.add(layers.BatchNormalization())
16 |     generator.add(layers.AveragePooling1D(pool_size=2, strides=2))
17 |     generator.add(layers.Conv1D(128, kernel_size=3, activation=tf.keras.layers.LeakyReLU()))
18 |     generator.add(layers.BatchNormalization())
19 |     generator.add(layers.AveragePooling1D(pool_size=2, strides=2))
20 |     generator.add(layers.Conv1D(64, kernel_size=3, activation=tf.keras.layers.LeakyReLU()))
21 |     generator.add(layers.BatchNormalization())
22 |     generator.add(layers.AveragePooling1D(pool_size=2, strides=2))
23 |     generator.add(layers.Flatten())
24 |     generator.add(layers.Dense(1500, activation='tanh'))
25 |     generator.add(layers.Reshape((1, 1500)))
26 |     return generator
27 | 
28 | 
29 | # Define the Discriminator model
30 | def build_discriminator():
31 |     discriminator = tf.keras.Sequential()
32 |     discriminator.add(layers.Conv1D(256, kernel_size=3, activation=tf.keras.layers.LeakyReLU(), input_shape=(1, 1500)))
33 |     discriminator.add(layers.BatchNormalization())
34 |     discriminator.add(layers.AveragePooling1D(pool_size=2, strides=2))
35 |     discriminator.add(layers.Conv1D(128, kernel_size=3, activation=tf.keras.layers.LeakyReLU()))
36 |     discriminator.add(layers.BatchNormalization())
37 |     discriminator.add(layers.AveragePooling1D(pool_size=2, strides=2))
38 |     discriminator.add(layers.Conv1D(64, kernel_size=3, activation=tf.keras.layers.LeakyReLU()))
39 |     discriminator.add(layers.BatchNormalization())
40 |     discriminator.add(layers.AveragePooling1D(pool_size=2, strides=2))
41 |     discriminator.add(layers.Flatten())
42 |     discriminator.add(layers.Dense(1, activation='sigmoid'))
43 |     return discriminator
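# --- Illustrative sketch, not part of the original repository -----------------
# build_generator() above expects inputs of shape (1, 1500), while generate_artificial_data()
# further down feeds it noise of shape (num_samples, latent_dim), so the two pieces do not fit
# together as written. A minimal generator that maps a latent vector to one (1, 1500) sample
# could look like this; the layer sizes are assumptions, not values from the original code.
# It relies only on the tensorflow import at the top of this file.
def build_dense_generator(latent_dim):
    generator = tf.keras.Sequential([
        tf.keras.layers.Dense(256, input_shape=(latent_dim,)),
        tf.keras.layers.LeakyReLU(),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(512),
        tf.keras.layers.LeakyReLU(),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(1500, activation='tanh'),   # matches the [-1, 1] scaling of the training data
        tf.keras.layers.Reshape((1, 1500)),
    ])
    return generator
# -------------------------------------------------------------------------------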
44 | 
45 | 
46 | # Combine the Generator and Discriminator into a GAN model
47 | def build_gan(generator, discriminator):
48 |     discriminator.trainable = False
49 |     model = Sequential()
50 |     model.add(generator)
51 |     model.add(discriminator)
52 |     return model
53 | 
54 | 
55 | # Load the CSV files from the directory
56 | def load_csv_files(directory):
57 |     csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]
58 |     dataframes = []
59 |     for file in csv_files:
60 |         dataframe = pd.read_csv(os.path.join(directory, file))
61 |         dataframes.append((file, dataframe))  # keep the file name with its dataframe
62 |     return dataframes
63 | 
64 | 
65 | # Generate artificial data using GAN for specific labels
66 | def generate_artificial_data(generator, num_samples):
67 |     latent_dim = 100
68 |     noise = np.random.normal(0, 1, (num_samples, latent_dim))
69 |     artificial_data = generator.predict(noise)
70 |     return artificial_data
71 | 
72 | 
73 | # GAN model network program
74 | @measure_execution_memory
75 | def process_csv_files():
76 |     directory = './data'  # Directory containing the CSV files
77 |     num_samples = int(input("Enter the number of samples to generate via GAN: "))
78 | 
79 |     # Load CSV files
80 |     dataframes = load_csv_files(directory)
81 | 
82 |     # Process each CSV file
83 |     for file, dataframe in dataframes:
84 |         packet_data = dataframe['packet_normalized_data'].values
85 |         class_labels = dataframe['class_label'].values
86 | 
87 |         # Filter class labels requiring artificial data
88 |         labels_to_generate = [0, 2, 3, 11]
89 |         filtered_indices = [i for i, label in enumerate(class_labels) if label in labels_to_generate]
90 |         filtered_packet_data = packet_data[filtered_indices]
91 |         filtered_class_labels = class_labels[filtered_indices]
92 | 
93 |         # Prepare and train GAN only if there are filtered records
94 |         if len(filtered_indices) > 0:
95 |             # Prepare data for GAN training (Normalize between -1 and 1)
96 |             filtered_packet_data = (filtered_packet_data - np.min(filtered_packet_data)) / (
97 |                 np.max(filtered_packet_data) - np.min(filtered_packet_data))
98 |             filtered_packet_data = 2 * filtered_packet_data - 1
99 |             filtered_packet_data = np.expand_dims(filtered_packet_data, axis=-1)
100 | 
101 |             latent_dim = 100  # Build and compile the models
102 |             generator = build_generator(latent_dim=100)
103 |             discriminator = build_discriminator()
104 |             gan = build_gan(generator, discriminator)
105 | 
106 |             discriminator.compile(loss='binary_crossentropy', optimizer='adam')
107 |             gan.compile(loss='binary_crossentropy', optimizer='adam')
108 | 
109 |             # Train the GAN
110 |             batch_size = 32
111 |             epochs = 100
112 |             num_batches = len(filtered_packet_data) // batch_size
113 | 
114 |             for epoch in range(epochs):
115 |                 for batch in range(num_batches):
116 |                     # Select a random batch of real samples
117 |                     real_samples = filtered_packet_data[batch * batch_size:(batch + 1) * batch_size]
118 | 
119 |                     # Generate a batch of fake samples
120 |                     noise = np.random.normal(0, 1, (batch_size, latent_dim))
121 |                     fake_samples = generator.predict(noise)
122 | 
123 |                     # Train the discriminator
124 |                     discriminator.trainable = True
125 |                     d_loss_real = discriminator.train_on_batch(real_samples, np.ones((batch_size, 1)))
126 |                     d_loss_fake = discriminator.train_on_batch(fake_samples, np.zeros((batch_size, 1)))
127 |                     d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
128 | 
129 |                     # Train the generator
130 |                     discriminator.trainable = False
131 |                     g_loss = gan.train_on_batch(noise, np.ones((batch_size, 1)))
132 | 
133 |                 # Print the progress
134 |                 print(f"Epoch {epoch + 1}/{epochs} - D loss: {d_loss} - G loss: {g_loss}")
135 | 
136 |             # Generate artificial data
137 |             artificial_data = generate_artificial_data(generator, num_samples)
138 | 
139 |             # Add the artificial data to the original dataframe
140 |             dataframe = pd.concat([dataframe, pd.DataFrame({
141 |                 'packet_normalized_data': artificial_data.squeeze(),
142 |                 'class_label': np.random.choice(labels_to_generate, num_samples)
143 |             })], ignore_index=True)
144 | 
145 |             # Save the updated dataframe back to the original CSV file
146 |             dataframe.to_csv(os.path.join(directory, file), index=False)
147 | 
--------------------------------------------------------------------------------
/define_SAE_model_params.py:
--------------------------------------------------------------------------------
1 | def define_SAE_model_params():
2 |     encoder_neurons_sae1 = []
3 |     code_neurons_sae1 = []
4 |     decoder_neurons_sae1 = []
5 |     output_neurons_sae1 = [20, 15, 10, 15, 10, 20]
6 | 
7 |     for _ in range(6):
8 |         encoder_neurons_sae1.append([1024, 512, 256, 128])
9 |         code_neurons_sae1.append(10)
10 |         decoder_neurons_sae1.append([128, 256, 512, 1024])
11 | 
12 |     dictionary = {
13 |         'encoder_neurons_sae': encoder_neurons_sae1,
14 |         'code_neurons_sae': code_neurons_sae1,
15 |         'decoder_neurons_sae': decoder_neurons_sae1,
16 |         'output_neurons_sae': output_neurons_sae1
17 |     }
18 | 
19 |     return dictionary
--------------------------------------------------------------------------------
/define_autoencoder.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.keras import layers
3 | import pandas as pd
4 | 
5 | def define_autoencoder(input_shape, encoder_neurons, code_neurons, decoder_neurons, output_neurons):
6 |     # Encoder
7 |     input_data = tf.keras.Input(shape=input_shape, name='input')
8 |     encoder = input_data
9 |     for neurons in encoder_neurons:
10 |         encoder = layers.Dense(neurons, activation='relu')(encoder)
11 | 
12 |     # Code layer
13 |     code = layers.Dense(code_neurons, activation='relu', name='code')(encoder)
14 | 
15 |     # Decoder
16 |     decoder = code
17 |     for neurons in decoder_neurons:
18 |         decoder = layers.Dense(neurons, activation='relu')(decoder)
19 | 
20 |     # Output layer
21 |     output = layers.Dense(output_neurons, name='output')(decoder)
22 | 
23 |     # Define the model
24 |     model = tf.keras.Model(inputs=input_data, outputs=output)
25 |     return model
--------------------------------------------------------------------------------
/extract_header_payload_packets.py:
--------------------------------------------------------------------------------
1 | """
2 | extract_header_payload_packets() extracts the header and payload bytes of every usable packet in a pcap file and writes them, together with packet metadata and the class label, to a CSV file.
3 | 
4 | The function first defines a few working variables: the default MTU, the number of processed packets to buffer before flushing them to the CSV file, and a flag that records whether the CSV file has been created yet.
5 | 
6 | It then iterates over the packets in the pcap file. For each packet that carries a payload (TLS application data, plain TCP, or UDP), the header and payload are extracted and appended as one row of a DataFrame.
7 | 
8 | If a packet is larger than the MTU, it is broken up into MTU-sized fragments and each fragment is appended to the DataFrame.
9 | 
10 | After all packets have been processed, any rows still buffered in the DataFrame are flushed to the CSV file.
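
A minimal usage sketch (the pcap path and the label value 3 are illustrative):

    packets = rdpcap('/path/to/vpn_skype_chat.pcap')
    out_dir = extract_header_payload_packets(packets, '/path/to/vpn_skype_chat.pcap', 3)

The second argument is the pcap file path (its base name is reused to name the CSV file) and the
third is the numeric class label written into the 'class_label' column; the return value is the
directory that holds the extracted CSV files.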
11 | """ 12 | from ip_masking import ip_masking 13 | 14 | def extract_header_payload_packets(packets,k,v): 15 | print('file name processing: {}'.format(k)) 16 | default_mtu = 1500 17 | # flush counter for flusshing packet to csv file 18 | number_of_pck = len(packets) 19 | flush_counter = 10000 20 | if(number_of_pck <= flush_counter): 21 | flush_counter = number_of_pck 22 | has_flushed = False 23 | # counter for checking how many valid packet has been processed 24 | total_processed_packet = 0 25 | temp_processed_packet = 0 26 | # root address for saving extracket packet data is set here 27 | extracted_packet_root_dir = '/media/mehdi/linux/data/must_be_normalized_data/' 28 | processed_file_name =b'' 29 | # create extracted data packet folder as name extracted_packet_root_dir 30 | if not os.path.exists(extracted_packet_root_dir): 31 | os.makedirs(extracted_packet_root_dir) 32 | # Find All Protocol supported by Scapy 33 | f = io.StringIO() 34 | protocol = [] # list of all Protocol 35 | with redirect_stdout(f): 36 | ls() 37 | out = f.getvalue() 38 | #print("Packet Listing:", out, sep="\n\n") 39 | protocol_list = out.split('\n') 40 | for i in range(len(protocol_list)): 41 | protocol.append(protocol_list[i].split(':')[0].replace(" ", "")) 42 | # Create DataFrame for packet 43 | df = pd.DataFrame(columns=['Source_IP', 'Dest_IP', 'Source_Port', 'Destination_Port','pckt_protocol', 44 | 'src_MAC', 'dst_MAC', 'pckt_ttl','payload','ip_header','packet_lenght','packet_data_lenght','packet_number','class_label']) 45 | 46 | pkt_count = 0 47 | # List to holds srpIPs 48 | srpIP = [] 49 | tls_ip = [field.name for field in TLS().fields_desc] 50 | print(tls_ip) 51 | cnt = 0 52 | pkt_number = 0 53 | pktlst = [] 54 | # Read each packet and appent to srpIP list 55 | for pkt in packets: 56 | pkt_number += 1 57 | 58 | if(pkt_number == 40): 59 | print("893") 60 | 61 | print("packet number :{} has been proccessed".format(pkt_number)) 62 | has_payload = False 63 | if(pkt.haslayer(Raw)): 64 | try: 65 | if (pkt.haslayer(SSLv2)): 66 | print("sslv2 layer") 67 | 68 | pck_load = TLS(pkt.load) 69 | if pck_load.haslayer('TLS'): 70 | #records = pkt['TLS'].records 71 | print("tls layer") 72 | #pck_load.show() 73 | pck_fields = [field.name for field in pck_load.fields_desc] 74 | has_payload = True 75 | 76 | except: 77 | print("Oops!", sys.exc_info()[0], "occurred.") 78 | 79 | 80 | 81 | 82 | if (pkt.haslayer(TLS) == False and has_payload == True and pck_fields.count('type') > 0 and len(pkt) >= 60): 83 | if (len(pck_load.fields.get("msg")) > 0 ): 84 | if(pck_load.msg[0].name == 'TLS Application Data'): 85 | if pkt.haslayer(Ether) and pkt.haslayer(IP) and pkt.haslayer(TCP) and pkt.haslayer(Raw): 86 | if pck_load.type == 23: 87 | # pktlst.append(cnt-1) 88 | src_mac = pkt[Ether].src 89 | dst_mac = pkt[Ether].dst 90 | pckt_ip_dest = pkt[IP].dst 91 | pckt_ip_source = pkt[IP].src 92 | pckt_ttl = pkt[IP].ttl 93 | pckt_protocol = 'TLS' 94 | pckt_dest_port = pkt[TCP].dport 95 | pckt_src_port = pkt[TCP].sport 96 | payload = binascii.hexlify(bytes(pck_load.msg[0].data)) 97 | payload_lenght = int(len(payload) / 2) 98 | packet_lenght = len(pkt) 99 | # zero padding do here 100 | #if (pkt[IP].len) < 1500: 101 | if (len(pkt[IP])) < 1500: 102 | # find tcp header 103 | p = binascii.hexlify(bytes(pkt[IP].payload)[12:13]) 104 | # convert hex to binary 105 | binary_length = "{0:08b}".format(int(p, 16)) 106 | # convert binary to decimal and normalize number 107 | decimal_lenght = int(binary_length, 2) 108 | numbe_of_tcp_header_byte = int(decimal_lenght 
/ 4) 109 | pad_len = (1500 - len(pkt[IP])) + 5 110 | pad = Padding() 111 | pad.load = '\x00' * int(pad_len) 112 | pkt = pkt / pad 113 | tcp_header = binascii.hexlify(bytes(pkt[IP].payload)[:numbe_of_tcp_header_byte]) 114 | pad_payload = binascii.hexlify(bytes(pkt[Raw].payload)) 115 | tcp_header += payload 116 | tcp_header += pad_payload 117 | payload = tcp_header 118 | ip_header = ip_masking(pkt) 119 | df.loc[len(df.index)] = [pckt_ip_source, pckt_ip_dest, pckt_src_port, pckt_dest_port, 120 | pckt_protocol, src_mac, dst_mac, pckt_ttl, payload, ip_header, 121 | packet_lenght, payload_lenght, pkt_number, v] 122 | # in this section flush dataframe data to csv file 123 | total_processed_packet += 1 124 | temp_processed_packet += 1 125 | # check if csv file has been created or not 126 | if (total_processed_packet == flush_counter): 127 | processed_file_name = extracted_packet_root_dir + os.path.basename(k) + '.' + 'csv' 128 | df.to_csv(processed_file_name,index = False) 129 | # empty df dataframe 130 | # Delete the first flush_counter rows 131 | temp_processed_packet = 0 132 | has_flushed = True 133 | df = df.drop(df.index[range(flush_counter)]) 134 | # csv file exist and must flush processed packet to it 135 | else: 136 | if (temp_processed_packet == flush_counter): 137 | # Write the new data to the CSV file in append mode 138 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 139 | temp_processed_packet = 0 140 | # empty df dataframe 141 | # Delete the first flush_counter rows 142 | df = df.drop(df.index[range(flush_counter)]) 143 | 144 | # if lenght of packet is greater that MTU w must break it up to multiple packet 145 | else: 146 | number_of_fragmnet_pkt = 0 147 | has_reminder = False 148 | # find number of byte in tcp header and find mtu 149 | # find tcp header 150 | p = binascii.hexlify(bytes(pkt[IP].payload)[12:13]) 151 | # convert hex to binary 152 | binary_length = "{0:08b}".format(int(p, 16)) 153 | # convert binary to decimal and normalize number 154 | decimal_lenght = int(binary_length, 2) 155 | numbe_of_tcp_header_byte = int(decimal_lenght / 4) 156 | mtu = default_mtu - 20 - (numbe_of_tcp_header_byte) 157 | if(int(payload_lenght % mtu) == 0): 158 | number_of_fragmnet_pkt = int(payload_lenght / mtu) 159 | else: 160 | number_of_fragmnet_pkt = int(payload_lenght/mtu) + 1 161 | has_reminder = True 162 | 163 | offset = 0 164 | payload = b'' 165 | for index in range(number_of_fragmnet_pkt): 166 | if(has_reminder == False): 167 | payload += binascii.hexlify(bytes(pck_load.msg[0].data[offset: (index + 1) * mtu])) 168 | offset += mtu 169 | else: 170 | if(index == number_of_fragmnet_pkt - 1): 171 | payload += binascii.hexlify(bytes(pck_load.msg[0].data[offset: offset + int(payload_lenght % mtu)])) 172 | else: 173 | payload += binascii.hexlify(bytes(pck_load.msg[0].data[offset: (index + 1) * mtu])) 174 | offset += mtu 175 | 176 | 177 | pad_len = (1500 - ((int(len(payload) / 2)) + 20 + numbe_of_tcp_header_byte)) 178 | pad = Padding() 179 | pad.load = '\x00' * int(pad_len) 180 | pkt = pkt / pad 181 | tcp_header = binascii.hexlify(bytes(pkt[IP].payload)[:numbe_of_tcp_header_byte]) 182 | pad_payload = binascii.hexlify(bytes(pkt[Raw].payload)) 183 | tcp_header += payload 184 | tcp_header += pad_payload 185 | payload = tcp_header 186 | ip_header = ip_masking(pkt) 187 | df.loc[len(df.index)] = [pckt_ip_source, pckt_ip_dest, pckt_src_port, 188 | pckt_dest_port, pckt_protocol, src_mac, dst_mac, pckt_ttl, payload, 189 | ip_header, packet_lenght, payload_lenght, pkt_number, v] 190 | 
payload = b'' 191 | 192 | # in this section flush dataframe data to csv file 193 | total_processed_packet += 1 194 | temp_processed_packet += 1 195 | # check if csv file has been created or not 196 | if (total_processed_packet == flush_counter): 197 | processed_file_name = extracted_packet_root_dir + os.path.basename( 198 | k) + '.' + 'csv' 199 | df.to_csv(processed_file_name,index = False) 200 | # empty df dataframe 201 | # Delete the first flush_counter rows 202 | temp_processed_packet = 0 203 | has_flushed = True 204 | df = df.drop(df.index[range(flush_counter)]) 205 | # csv file exist and must flush processed packet to it 206 | else: 207 | if (temp_processed_packet == flush_counter): 208 | # Write the new data to the CSV file in append mode 209 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 210 | temp_processed_packet = 0 211 | # empty df dataframe 212 | # Delete the first flush_counter rows 213 | df = df.drop(df.index[range(flush_counter)]) 214 | 215 | 216 | 217 | 218 | 219 | else: 220 | SSL2v_flag = False 221 | #checke if packet is server hello message 222 | if (pkt.haslayer(TLS) == False and has_payload == True and pck_fields.count('type') == 0 and pck_fields.count('msg')>0 ): 223 | if (len(pck_load.fields.get("msg")) > 0): 224 | if (pck_load.msg[0].name == 'Raw' and pck_load.name == 'SSLv2'): 225 | SSL2v_flag = True 226 | # find source and destination mac address of packet 227 | if (pkt.haslayer(Ether) and pkt.haslayer(IP) and pkt.haslayer(TCP) and SSL2v_flag == False and len(pkt) >= 60) : 228 | res_list = [i for i, value in enumerate(protocol) if (pkt.haslayer(value) == True and value != 'TCP' and 229 | value != 'IP' and value != 'Ether' and value != 'Raw' and value != 'TLS')] 230 | # check if a tls or ssl packet 231 | cnt += 1 232 | if pkt.haslayer(TLS): 233 | #print('a') 234 | extra_tls_layers = pkt[TLS] 235 | if pkt[TLS].type == 23: 236 | payload = b'' 237 | app_data_layer_count = 0 238 | has_tls_payload = True 239 | pktlst.append(cnt - 1) 240 | src_mac = pkt[Ether].src 241 | dst_mac = pkt[Ether].dst 242 | pckt_ip_dest = pkt[IP].dst 243 | pckt_ip_source = pkt[IP].src 244 | pckt_ttl = pkt[IP].ttl 245 | pckt_protocol = 'TLS' 246 | pckt_dest_port = pkt[TCP].dport 247 | pckt_src_port = pkt[TCP].sport 248 | # fetch all Application Record Layer 249 | while(has_tls_payload == True): 250 | payload += binascii.hexlify(bytes(extra_tls_layers.msg[0].data)) 251 | app_data_layer_count += 1 252 | if(len(extra_tls_layers.payload) > 0): 253 | extra_tls_layers = extra_tls_layers.payload 254 | else: 255 | has_tls_payload = False 256 | 257 | #payload = binascii.hexlify(bytes(pkt[TLS])[5:int(pkt[TLS].len) + 5]) 258 | payload_lenght = int(len(payload) / 2) 259 | packet_lenght = len(pkt) 260 | # zero padding do here 261 | #if (pkt[IP].len) < 1500: 262 | if (len(pkt[IP])) < 1500: 263 | # find tcp header 264 | p = binascii.hexlify(bytes(pkt[IP].payload)[12:13]) 265 | # convert hex to binary 266 | binary_length = "{0:08b}".format(int(p, 16)) 267 | # convert binary to decimal and normalize number 268 | decimal_lenght = int(binary_length, 2) 269 | numbe_of_tcp_header_byte = int(decimal_lenght / 4) 270 | pad_len = (1500 - len(pkt[IP])) + app_data_layer_count * 5 271 | pad = Padding() 272 | pad.load = '\x00' * int(pad_len) 273 | pkt = pkt / pad 274 | tcp_header = binascii.hexlify(bytes(pkt[IP].payload)[:numbe_of_tcp_header_byte]) 275 | #pad_payload = binascii.hexlify(bytes(pkt[TLS].payload)) 276 | pad_payload = binascii.hexlify(bytes(pad.load)) 277 | tcp_header += payload 278 | 
tcp_header += pad_payload 279 | payload = tcp_header 280 | ip_header = ip_masking(pkt) 281 | df.loc[len(df.index)] = [pckt_ip_dest, pckt_ip_source, pckt_src_port, pckt_dest_port, 282 | pckt_protocol, src_mac, dst_mac, pckt_ttl, payload, ip_header,packet_lenght,payload_lenght,pkt_number, v] 283 | # in this section flush dataframe data to csv file 284 | total_processed_packet += 1 285 | temp_processed_packet += 1 286 | # check if csv file has been created or not 287 | if(total_processed_packet == flush_counter): 288 | processed_file_name = extracted_packet_root_dir + os.path.basename(k) + '.' + 'csv' 289 | df.to_csv(processed_file_name,index = False) 290 | # empty df dataframe 291 | # Delete the first flush_counter rows 292 | temp_processed_packet = 0 293 | has_flushed = True 294 | df = df.drop(df.index[range(flush_counter)]) 295 | # csv file exist and must flush processed packet to it 296 | else: 297 | if(temp_processed_packet == flush_counter): 298 | # Write the new data to the CSV file in append mode 299 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 300 | temp_processed_packet = 0 301 | # empty df dataframe 302 | # Delete the first flush_counter rows 303 | df = df.drop(df.index[range(flush_counter)]) 304 | 305 | 306 | 307 | # if lenght of packet is greater that MTU w must break it up to multiple packet 308 | else: 309 | number_of_fragmnet_pkt = 0 310 | has_reminder = False 311 | # find number of byte in tcp header and find mtu 312 | # find tcp header 313 | p = binascii.hexlify(bytes(pkt[IP].payload)[12:13]) 314 | # convert hex to binary 315 | binary_length = "{0:08b}".format(int(p, 16)) 316 | # convert binary to decimal and normalize number 317 | decimal_lenght = int(binary_length, 2) 318 | numbe_of_tcp_header_byte = int(decimal_lenght / 4) 319 | mtu = default_mtu - 20 - (numbe_of_tcp_header_byte) 320 | if (int(payload_lenght % mtu) == 0): 321 | number_of_fragmnet_pkt = int(payload_lenght / mtu) 322 | else: 323 | number_of_fragmnet_pkt = int(payload_lenght / mtu) + 1 324 | has_reminder = True 325 | 326 | offset = 0 327 | new_payload = b'' 328 | for index in range(number_of_fragmnet_pkt): 329 | if (has_reminder == False): 330 | new_payload += payload[offset: (index + 1) * mtu] 331 | offset += mtu 332 | else: 333 | if (index == number_of_fragmnet_pkt - 1): 334 | new_payload += payload[offset: offset + int(payload_lenght % mtu)] 335 | else: 336 | new_payload += payload[offset: (index + 1) * mtu] 337 | offset += mtu 338 | 339 | pad_len = (1500 - (int(len(new_payload) / 2) + 20 + numbe_of_tcp_header_byte)) 340 | pad = Padding() 341 | pad.load = '\x00' * int(pad_len) 342 | pkt = pkt / pad 343 | tcp_header = binascii.hexlify(bytes(pkt[IP].payload)[:numbe_of_tcp_header_byte]) 344 | #pad_payload = binascii.hexlify(bytes(pkt[Raw].payload)) 345 | pad_payload = binascii.hexlify(bytes(pad.load)) 346 | tcp_header += new_payload 347 | tcp_header += pad_payload 348 | new_payload = tcp_header 349 | ip_header = ip_masking(pkt) 350 | df.loc[len(df.index)] = [pckt_ip_source, pckt_ip_dest, pckt_src_port, 351 | pckt_dest_port, pckt_protocol, src_mac, dst_mac, pckt_ttl, 352 | new_payload,ip_header, packet_lenght, payload_lenght, pkt_number, v] 353 | new_payload = b'' 354 | 355 | # in this section flush dataframe data to csv file 356 | total_processed_packet += 1 357 | temp_processed_packet += 1 358 | # check if csv file has been created or not 359 | if (total_processed_packet == flush_counter): 360 | processed_file_name = extracted_packet_root_dir + os.path.basename(k) + '.' 
+ 'csv' 361 | df.to_csv(processed_file_name,index = False) 362 | # empty df dataframe 363 | # Delete the first flush_counter rows 364 | temp_processed_packet = 0 365 | has_flushed = True 366 | df = df.drop(df.index[range(flush_counter)]) 367 | # csv file exist and must flush processed packet to it 368 | else: 369 | if (temp_processed_packet == flush_counter): 370 | # Write the new data to the CSV file in append mode 371 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 372 | temp_processed_packet = 0 373 | # empty df dataframe 374 | # Delete the first flush_counter rows 375 | df = df.drop(df.index[range(flush_counter)]) 376 | 377 | else: 378 | # check if packet has pkt.load and TLS layer. some of packet has Raw data and load data 379 | # and load data contain TLS data. so we must check these packet 380 | if (has_payload == True ): 381 | tls_payload = TLS(pkt.load) 382 | if(tls_payload.name == 'TLS'): 383 | if pkt[TLS].type == 23: 384 | payload = b'' 385 | app_data_layer_count = 0 386 | has_tls_payload = True 387 | pktlst.append(cnt - 1) 388 | src_mac = pkt[Ether].src 389 | dst_mac = pkt[Ether].dst 390 | pckt_ip_dest = pkt[IP].dst 391 | pckt_ip_source = pkt[IP].src 392 | pckt_ttl = pkt[IP].ttl 393 | pckt_protocol = 'TLS' 394 | pckt_dest_port = pkt[TCP].dport 395 | pckt_src_port = pkt[TCP].sport 396 | # fetch all Application Record Layer 397 | while (has_tls_payload == True): 398 | payload += binascii.hexlify(bytes(tls_payload.msg[0].data)) 399 | app_data_layer_count += 1 400 | if (len(tls_payload.payload) > 0): 401 | tls_payload = tls_payload.payload 402 | else: 403 | has_tls_payload = False 404 | 405 | # payload = binascii.hexlify(bytes(pkt[TLS])[5:int(pkt[TLS].len) + 5]) 406 | payload_lenght = int(len(payload) / 2) 407 | packet_lenght = len(pkt) 408 | # zero padding do here 409 | # if (pkt[IP].len) < 1500: 410 | if (len(pkt[IP])) < 1500: 411 | # find tcp header 412 | p = binascii.hexlify(bytes(pkt[IP].payload)[12:13]) 413 | # convert hex to binary 414 | binary_length = "{0:08b}".format(int(p, 16)) 415 | # convert binary to decimal and normalize number 416 | decimal_lenght = int(binary_length, 2) 417 | numbe_of_tcp_header_byte = int(decimal_lenght / 4) 418 | pad_len = (1500 - len(pkt[IP])) + app_data_layer_count * 5 419 | pad = Padding() 420 | pad.load = '\x00' * int(pad_len) 421 | pkt = pkt / pad 422 | tcp_header = binascii.hexlify(bytes(pkt[IP].payload)[:numbe_of_tcp_header_byte]) 423 | # pad_payload = binascii.hexlify(bytes(pkt[TLS].payload)) 424 | pad_payload = binascii.hexlify(bytes(pad.load)) 425 | tcp_header += payload 426 | tcp_header += pad_payload 427 | payload = tcp_header 428 | ip_header = ip_masking(pkt) 429 | df.loc[len(df.index)] = [pckt_ip_dest, pckt_ip_source, pckt_src_port, 430 | pckt_dest_port, 431 | pckt_protocol, src_mac, dst_mac, pckt_ttl, payload, 432 | ip_header, packet_lenght, payload_lenght, pkt_number, v] 433 | # in this section flush dataframe data to csv file 434 | total_processed_packet += 1 435 | temp_processed_packet += 1 436 | # check if csv file has been created or not 437 | if (total_processed_packet == flush_counter): 438 | processed_file_name = extracted_packet_root_dir + os.path.basename( 439 | k) + '.' 
+ 'csv' 440 | df.to_csv(processed_file_name,index = False) 441 | # empty df dataframe 442 | # Delete the first flush_counter rows 443 | temp_processed_packet = 0 444 | has_flushed = True 445 | df = df.drop(df.index[range(flush_counter)]) 446 | # csv file exist and must flush processed packet to it 447 | else: 448 | if (temp_processed_packet == flush_counter): 449 | # Write the new data to the CSV file in append mode 450 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 451 | temp_processed_packet = 0 452 | # empty df dataframe 453 | # Delete the first flush_counter rows 454 | df = df.drop(df.index[range(flush_counter)]) 455 | 456 | # if lenght of packet is greater that MTU w must break it up to multiple packet 457 | else: 458 | number_of_fragmnet_pkt = 0 459 | has_reminder = False 460 | # find number of byte in tcp header and find mtu 461 | # find tcp header 462 | p = binascii.hexlify(bytes(pkt[IP].payload)[12:13]) 463 | # convert hex to binary 464 | binary_length = "{0:08b}".format(int(p, 16)) 465 | # convert binary to decimal and normalize number 466 | decimal_lenght = int(binary_length, 2) 467 | numbe_of_tcp_header_byte = int(decimal_lenght / 4) 468 | mtu = default_mtu - 20 - (numbe_of_tcp_header_byte) 469 | if (int(payload_lenght % mtu) == 0): 470 | number_of_fragmnet_pkt = int(payload_lenght / mtu) 471 | else: 472 | number_of_fragmnet_pkt = int(payload_lenght / mtu) + 1 473 | has_reminder = True 474 | 475 | offset = 0 476 | new_payload = b'' 477 | for index in range(number_of_fragmnet_pkt): 478 | if (has_reminder == False): 479 | new_payload += payload[offset: (index + 1) * mtu] 480 | offset += mtu 481 | else: 482 | if (index == number_of_fragmnet_pkt - 1): 483 | new_payload += payload[offset: offset + int(payload_lenght % mtu)] 484 | else: 485 | new_payload += payload[offset: (index + 1) * mtu] 486 | offset += mtu 487 | 488 | 489 | pad_len = (1500 - (int(len(new_payload) / 2) + 20 + numbe_of_tcp_header_byte)) 490 | # pad_len = (1500 - int(len(new_payload)/2)) 491 | pad = Padding() 492 | pad.load = '\x00' * int(pad_len) 493 | pkt = pkt / pad 494 | tcp_header = binascii.hexlify(bytes(pkt[IP].payload)[:numbe_of_tcp_header_byte]) 495 | # pad_payload = binascii.hexlify(bytes(pkt[Raw].payload)) 496 | pad_payload = binascii.hexlify(bytes(pad.payload)) 497 | 498 | tcp_header += new_payload 499 | tcp_header += pad_payload 500 | new_payload = tcp_header 501 | ip_header = ip_masking(pkt) 502 | df.loc[len(df.index)] = [pckt_ip_source, pckt_ip_dest, pckt_src_port, 503 | pckt_dest_port, pckt_protocol, src_mac, dst_mac, 504 | pckt_ttl, 505 | new_payload, ip_header, packet_lenght, payload_lenght, 506 | pkt_number, v] 507 | new_payload = b'' 508 | 509 | # in this section flush dataframe data to csv file 510 | total_processed_packet += 1 511 | temp_processed_packet += 1 512 | # check if csv file has been created or not 513 | if (total_processed_packet == flush_counter): 514 | processed_file_name = extracted_packet_root_dir + os.path.basename( 515 | k) + '.' 
+ 'csv' 516 | df.to_csv(processed_file_name,index = False) 517 | # empty df dataframe 518 | # Delete the first flush_counter rows 519 | temp_processed_packet = 0 520 | has_flushed = True 521 | df = df.drop(df.index[range(flush_counter)]) 522 | # csv file exist and must flush processed packet to it 523 | else: 524 | if (temp_processed_packet == flush_counter): 525 | # Write the new data to the CSV file in append mode 526 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 527 | temp_processed_packet = 0 528 | # empty df dataframe 529 | # Delete the first flush_counter rows 530 | df = df.drop(df.index[range(flush_counter)]) 531 | 532 | else: 533 | # if packet is a tcp packet or not 534 | if pkt[IP].proto == 6 and len(res_list) == 0: 535 | if pkt.haslayer(Raw): 536 | src_mac = pkt[Ether].src 537 | dst_mac = pkt[Ether].dst 538 | pckt_ip_dest = pkt[IP].dst 539 | pckt_ip_source = pkt[IP].src 540 | pckt_ttl = pkt[IP].ttl 541 | pckt_protocol = 'TCP' 542 | pckt_dest_port = pkt[TCP].dport 543 | pckt_src_port = pkt[TCP].sport 544 | payload = binascii.hexlify(bytes((pkt[Raw]))) 545 | payload_lenght = int(len(payload) / 2) 546 | packet_lenght = len(pkt) 547 | # zero padding do here 548 | # if (pkt[IP].len) < 1500: 549 | if (len(pkt[IP])) < 1500: 550 | # find tcp header 551 | p = binascii.hexlify(bytes(pkt[IP].payload)[12:13]) 552 | # convert hex to binary 553 | binary_length = "{0:08b}".format(int(p, 16)) 554 | # convert binary to decimal and normalize number 555 | decimal_lenght = int(binary_length, 2) 556 | numbe_of_tcp_header_byte = int(decimal_lenght / 4) 557 | pad_len = (1500 - len(pkt[IP])) 558 | pad = Padding() 559 | pad.load = '\x00' * int(pad_len) 560 | pkt = pkt / pad 561 | tcp_header = binascii.hexlify(bytes(pkt[IP].payload)[:numbe_of_tcp_header_byte]) 562 | pad_payload = binascii.hexlify(bytes(pkt[Raw])) 563 | tcp_header += pad_payload 564 | payload = tcp_header 565 | ip_header = ip_masking(pkt) 566 | df.loc[len(df.index)] = [pckt_ip_source, pckt_ip_dest, pckt_src_port, 567 | pckt_dest_port, 568 | pckt_protocol, src_mac, dst_mac, pckt_ttl, 569 | payload, ip_header, packet_lenght, payload_lenght, 570 | pkt_number, v] 571 | # in this section flush dataframe data to csv file 572 | total_processed_packet += 1 573 | temp_processed_packet += 1 574 | # check if csv file has been created or not 575 | if (total_processed_packet == flush_counter): 576 | processed_file_name = extracted_packet_root_dir + os.path.basename( 577 | k) + '.' 
+ 'csv' 578 | df.to_csv(processed_file_name,index = False) 579 | # empty df dataframe 580 | # Delete the first flush_counter rows 581 | temp_processed_packet = 0 582 | has_flushed = True 583 | df = df.drop(df.index[range(flush_counter)]) 584 | # csv file exist and must flush processed packet to it 585 | else: 586 | if (temp_processed_packet == flush_counter): 587 | # Write the new data to the CSV file in append mode 588 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 589 | temp_processed_packet = 0 590 | # empty df dataframe 591 | # Delete the first flush_counter rows 592 | df = df.drop(df.index[range(flush_counter)]) 593 | 594 | 595 | # if lenght of packet is greater that MTU w must break it up to multiple packet 596 | else: 597 | number_of_fragmnet_pkt = 0 598 | has_reminder = False 599 | # find number of byte in tcp header and find mtu 600 | # find tcp header 601 | p = binascii.hexlify(bytes(pkt[IP].payload)[12:13]) 602 | # convert hex to binary 603 | binary_length = "{0:08b}".format(int(p, 16)) 604 | # convert binary to decimal and normalize number 605 | decimal_lenght = int(binary_length, 2) 606 | numbe_of_tcp_header_byte = int(decimal_lenght / 4) 607 | mtu = default_mtu - 20 - (numbe_of_tcp_header_byte) 608 | if (int(payload_lenght % mtu) == 0): 609 | number_of_fragmnet_pkt = int(payload_lenght / mtu) 610 | else: 611 | number_of_fragmnet_pkt = int(payload_lenght / mtu) + 1 612 | has_reminder = True 613 | 614 | offset = 0 615 | new_payload = b'' 616 | for index in range(number_of_fragmnet_pkt): 617 | if (has_reminder == False): 618 | new_payload += payload[offset: (index + 1) * mtu] 619 | offset += mtu 620 | else: 621 | if (index == number_of_fragmnet_pkt - 1): 622 | new_payload += payload[offset: offset + int(payload_lenght % mtu)] 623 | else: 624 | new_payload += payload[offset: (index + 1) * mtu] 625 | offset += mtu 626 | 627 | 628 | pad_len = (1500 - (int(len(new_payload) / 2) + 20 + numbe_of_tcp_header_byte)) 629 | # pad_len = (1500 - int(len(new_payload)/2)) 630 | pad = Padding() 631 | pad.load = '\x00' * int(pad_len) 632 | pkt = pkt / pad 633 | tcp_header = binascii.hexlify(bytes(pkt[IP].payload)[:numbe_of_tcp_header_byte]) 634 | # pad_payload = binascii.hexlify(bytes(pkt[Raw].payload)) 635 | pad_payload = binascii.hexlify(bytes(pad.payload)) 636 | 637 | tcp_header += new_payload 638 | tcp_header += pad_payload 639 | new_payload = tcp_header 640 | ip_header = ip_masking(pkt) 641 | df.loc[len(df.index)] = [pckt_ip_source, pckt_ip_dest, pckt_src_port, 642 | pckt_dest_port, pckt_protocol, src_mac, dst_mac, 643 | pckt_ttl, 644 | new_payload, ip_header, packet_lenght, payload_lenght, 645 | pkt_number, v] 646 | new_payload = b'' 647 | 648 | # in this section flush dataframe data to csv file 649 | total_processed_packet += 1 650 | temp_processed_packet += 1 651 | # check if csv file has been created or not 652 | if (total_processed_packet == flush_counter): 653 | processed_file_name = extracted_packet_root_dir + os.path.basename( 654 | k) + '.' 
+ 'csv' 655 | df.to_csv(processed_file_name,index = False) 656 | # empty df dataframe 657 | # Delete the first flush_counter rows 658 | temp_processed_packet = 0 659 | has_flushed = True 660 | df = df.drop(df.index[range(flush_counter)]) 661 | # csv file exist and must flush processed packet to it 662 | else: 663 | if (temp_processed_packet == flush_counter): 664 | # Write the new data to the CSV file in append mode 665 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 666 | temp_processed_packet = 0 667 | # empty df dataframe 668 | # Delete the first flush_counter rows 669 | df = df.drop(df.index[range(flush_counter)]) 670 | 671 | 672 | 673 | else: 674 | if (pkt.haslayer(Ether) and pkt.haslayer(IP) and pkt.haslayer(UDP) and len(pkt) >= 60): 675 | res_list = [i for i, value in enumerate(protocol) if 676 | (pkt.haslayer(value) == True and value != 'UDP' and 677 | value != 'IP' and value != 'Ether' and value != 'Raw')] 678 | if pkt[IP].proto == 17 and len(res_list) == 0: 679 | if pkt.haslayer(Raw): 680 | src_mac = pkt[Ether].src 681 | dst_mac = pkt[Ether].dst 682 | pckt_ip_dest = pkt[IP].dst 683 | pckt_ip_source = pkt[IP].src 684 | pckt_ttl = pkt[IP].ttl 685 | pckt_protocol = 'UDP' 686 | pckt_dest_port = pkt[UDP].dport 687 | pckt_src_port = pkt[UDP].sport 688 | payload = binascii.hexlify(bytes((pkt[Raw]))) 689 | payload_lenght = int(len(payload) / 2) 690 | packet_lenght = len(pkt) 691 | # zero padding do here 692 | #if (pkt[IP].len) < 1500: 693 | if (len(pkt[IP])) < 1500: 694 | pad_len = (1500 - len(pkt[IP])) 695 | pad = Padding() 696 | pad.load = '\x00' * int(pad_len) 697 | pkt = pkt / pad 698 | udp_header = binascii.hexlify(bytes(pkt[IP].payload)[:8]) 699 | pad_payload = binascii.hexlify(bytes(pkt[Raw])) 700 | udp_header += pad_payload 701 | payload = udp_header 702 | ip_header = ip_masking(pkt) 703 | df.loc[len(df.index)] = [pckt_ip_source, pckt_ip_dest, pckt_src_port, pckt_dest_port, 704 | pckt_protocol, src_mac, dst_mac, pckt_ttl, payload, ip_header,packet_lenght,payload_lenght,pkt_number, v] 705 | 706 | # in this section flush dataframe data to csv file 707 | total_processed_packet += 1 708 | temp_processed_packet += 1 709 | # check if csv file has been created or not 710 | if (total_processed_packet == flush_counter): 711 | processed_file_name = extracted_packet_root_dir + os.path.basename(k) + '.' + 'csv' 712 | df.to_csv(processed_file_name,index = False) 713 | # empty df dataframe 714 | # Delete the first flush_counter rows 715 | temp_processed_packet = 0 716 | has_flushed = True 717 | df = df.drop(df.index[range(flush_counter)]) 718 | # csv file exist and must flush processed packet to it 719 | else: 720 | if (temp_processed_packet == flush_counter): 721 | # Write the new data to the CSV file in append mode 722 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 723 | temp_processed_packet = 0 724 | # empty df dataframe 725 | # Delete the first flush_counter rows 726 | df = df.drop(df.index[range(flush_counter)]) 727 | 728 | else: 729 | pass 730 | 731 | print("finished") 732 | if(temp_processed_packet > 0): 733 | if(has_flushed == True): 734 | # Write the new data to the CSV file in append mode 735 | df.to_csv(processed_file_name, mode='a', header=False, index=False) 736 | else: 737 | processed_file_name = extracted_packet_root_dir + os.path.basename(k) + '.' 
+ 'csv' 738 | df.to_csv(processed_file_name) 739 | return extracted_packet_root_dir -------------------------------------------------------------------------------- /fc_build_model.py: -------------------------------------------------------------------------------- 1 | def fc_build_model(params): 2 | 3 | model = Sequential() 4 | 5 | # First layer 6 | model.add(Dense(params['first_layer'], input_shape=(params['input_shape'],))) 7 | model.add(Dropout(params['dropout_rate'])) 8 | model.add(BatchNormalization()) 9 | 10 | # Second layer 11 | model.add(Dense(params['second_layer'], activation='relu')) 12 | model.add(Dropout(params['dropout_rate'])) 13 | model.add(BatchNormalization()) 14 | 15 | # Third layer 16 | model.add(Dense(params['third_layer'], activation='relu')) 17 | model.add(Dropout(params['dropout_rate'])) 18 | model.add(BatchNormalization()) 19 | 20 | # Fourth layer 21 | model.add(Dense(params['fourth_layer'], activation='relu')) 22 | model.add(Dropout(params['dropout_rate'])) 23 | model.add(BatchNormalization()) 24 | 25 | # Last layer 26 | if params['num_classes'] == 12: 27 | output_units = 12 28 | elif params['num_classes'] == 17: 29 | output_units = 17 30 | else: 31 | raise ValueError("Invalid number of classes!") 32 | 33 | model.add(Dense(output_units, activation='softmax')) 34 | # Compile the model 35 | #model.compile(optimizer=params['optimizer'], loss=params['loss_function']) 36 | 37 | return model -------------------------------------------------------------------------------- /gausian-compare-accuracy-code.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from scipy.signal import savgol_filter 5 | from google.colab import files 6 | 7 | # Upload Excel file 8 | uploaded = files.upload() 9 | 10 | # Read Excel file into a DataFrame 11 | df = pd.read_excel(next(iter(uploaded))) 12 | 13 | # Define line styles and symbols for each column 14 | line_styles = ['-', '--', ':'] 15 | #symbols = ['s', 'o', '^'] 16 | columns = ['CBS', 'CSCNN-[79]', 'Datanet-[19]'] 17 | 18 | # Plotting 19 | plt.figure(figsize=(10, 6)) 20 | 21 | for i, column in enumerate(columns): 22 | x = df['Epoch'] 23 | y = df[column] 24 | 25 | # Apply smoothing filter 26 | #y_smooth = savgol_filter(y, window_length=5, polyorder=2) 27 | # Apply Gaussian smoothing filter 28 | y_smooth = gaussian_filter1d(y, sigma=2) 29 | 30 | # Plot line with specific style and symbol 31 | plt.plot(x, y_smooth, linestyle=line_styles[i],linewidth=3,label=column) 32 | 33 | # Add legend and labels 34 | plt.legend() 35 | plt.xlabel('Epoch') 36 | plt.ylabel('Accuracy [%]') 37 | # Set x-axis spacing to 2 38 | #plt.xticks(np.arange(min(x), max(x)+1, 2)) 39 | # Set x-axis spacing to 2 40 | plt.xticks(np.arange(min(x), 21, 1)) 41 | #plt.title('Line Plot') 42 | # Save the plot as SVG 43 | output_file = input("Please provide the output file name (e.g., plot.svg): ") 44 | plt.savefig(output_file, format='svg') 45 | print("Line plot saved as SVG.") 46 | 47 | # Download the saved SVG file 48 | files.download(output_file) 49 | 50 | # Show the plot 51 | plt.show() -------------------------------------------------------------------------------- /gausian-validation-train-acc.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from scipy.signal import savgol_filter 5 | from google.colab import files 6 | 7 | # Upload 
Excel file 8 | uploaded = files.upload() 9 | 10 | # Read Excel file into a DataFrame 11 | df = pd.read_excel(next(iter(uploaded))) 12 | 13 | # Define line styles and symbols for each column 14 | line_styles = ['-', '--'] 15 | #symbols = ['s', 'o', '^'] 16 | columns = ['Val acc', 'Train acc'] 17 | 18 | # Plotting 19 | plt.figure(figsize=(10, 6)) 20 | 21 | for i, column in enumerate(columns): 22 | x = df['Epoch'] 23 | y = df[column] 24 | 25 | # Apply smoothing filter 26 | #y_smooth = savgol_filter(y, window_length=5, polyorder=2) 27 | # Apply Gaussian smoothing filter 28 | y_smooth = gaussian_filter1d(y, sigma=2) 29 | 30 | # Plot line with specific style and symbol 31 | plt.plot(x, y_smooth, linestyle=line_styles[i],linewidth=3,label=column) 32 | 33 | # Add legend and labels 34 | plt.legend() 35 | plt.xlabel('Epoch') 36 | plt.ylabel('Accuracy [%]') 37 | # Set x-axis spacing to 2 38 | #plt.xticks(np.arange(min(x), max(x)+1, 2)) 39 | # Set x-axis spacing to 2 40 | plt.xticks(np.arange(min(x), 21, 1)) 41 | #plt.title('Line Plot') 42 | # Save the plot as SVG 43 | output_file = input("Please provide the output file name (e.g., plot.svg): ") 44 | plt.savefig(output_file, format='svg') 45 | print("Line plot saved as SVG.") 46 | 47 | # Download the saved SVG file 48 | files.download(output_file) 49 | 50 | # Show the plot 51 | plt.show() -------------------------------------------------------------------------------- /gausian-validation-training-loss.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from scipy.signal import savgol_filter 5 | from google.colab import files 6 | 7 | # Upload Excel file 8 | uploaded = files.upload() 9 | 10 | # Read Excel file into a DataFrame 11 | df = pd.read_excel(next(iter(uploaded))) 12 | 13 | # Define line styles and symbols for each column 14 | line_styles = ['-', '--'] 15 | #symbols = ['s', 'o', '^'] 16 | columns = ['Val Loss', 'Train Loss'] 17 | 18 | # Plotting 19 | plt.figure(figsize=(10, 6)) 20 | 21 | for i, column in enumerate(columns): 22 | x = df['Epoch'] 23 | y = df[column] 24 | 25 | # Apply smoothing filter 26 | #y_smooth = savgol_filter(y, window_length=5, polyorder=2) 27 | # Apply Gaussian smoothing filter 28 | y_smooth = gaussian_filter1d(y, sigma=2) 29 | 30 | # Plot line with specific style and symbol 31 | plt.plot(x, y_smooth, linestyle=line_styles[i],linewidth=3,label=column) 32 | 33 | # Add legend and labels 34 | plt.legend() 35 | plt.xlabel('Epoch') 36 | plt.ylabel('Loss [%]') 37 | # Set x-axis spacing to 2 38 | #plt.xticks(np.arange(min(x), max(x)+1, 2)) 39 | # Set x-axis spacing to 2 40 | plt.xticks(np.arange(min(x), 21, 1)) 41 | #plt.title('Line Plot') 42 | # Save the plot as SVG 43 | output_file = input("Please provide the output file name (e.g., plot.svg): ") 44 | plt.savefig(output_file, format='svg') 45 | print("Line plot saved as SVG.") 46 | 47 | # Download the saved SVG file 48 | files.download(output_file) 49 | 50 | # Show the plot 51 | plt.show() -------------------------------------------------------------------------------- /histogram_Dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a function called histogram_Dataset(). This function plots a histogram of the packet lengths in a set of pcap files. 3 | 4 | The function first defines two lists, pkt_numebr_list and pkt_length_list. 
The pkt_numebr_list stores the packet numbers in the pcap files. The pkt_length_list stores the lengths of the packets in the pcap files. 5 | 6 | The function then iterates over the pcap files in the files list. For each pcap file, the function reads the packets in the file and adds the packet numbers and lengths to the pkt_numebr_list and pkt_length_list lists, respectively. 7 | 8 | The function then calculates the probability mass function (PMF) of the packet lengths. The PMF is a function that gives the probability of a packet having a particular length. 9 | 10 | The function then plots a histogram of the PMF. The histogram is a bar chart that shows the number of packets with each length. 11 | 12 | The function finally returns the plot. 13 | 14 | The first few lines define the pkt_numebr_list and pkt_length_list lists. 15 | 16 | The next few lines iterate over the pcap files in the files list. For each pcap file, the function reads the packets in the file and adds the packet numbers and lengths to the pkt_numebr_list and pkt_length_list lists, respectively. 17 | 18 | The next few lines calculate the PMF of the packet lengths. 19 | 20 | The next few lines plot the histogram of the PMF. 21 | 22 | The final line returns the plot. 23 | """ 24 | 25 | 26 | def histogram_Dataset(files,path): 27 | 28 | pkt_numebr_list = [] 29 | pkt_length_list =[] 30 | n_200 = 0 31 | n_400 = 0 32 | n_600 = 0 33 | n_800 = 0 34 | n_1000 = 0 35 | n_1200 = 0 36 | n_1500 = 0 37 | bigger_1500 = 0 38 | for f in files: 39 | print('befor reading time is :{}'.format(datetime.now().time())) 40 | packets = rdpcap(os.path.join(path, f)) 41 | print('after reading time is :{}'.format(datetime.now().time())) 42 | for i in range(len(packets)): 43 | pkt_numebr_list.append(i) 44 | pkt_length_list.append(len(packets[i])) 45 | # in this section we calculate PMF of packet lenght 46 | packet_lenght_data = dict((x, pkt_length_list.count(x)) for x in set(pkt_length_list)) 47 | key_max = max(packet_lenght_data.keys(), key=(lambda k: packet_lenght_data[k])) 48 | key_min = min(packet_lenght_data.keys(), key=(lambda k: packet_lenght_data[k])) 49 | pkt_lenght = list(packet_lenght_data.keys()) 50 | values = list(packet_lenght_data.values()) 51 | total_packets = sum(values) 52 | 53 | # naming the x-axis 54 | plt.xlabel('Packet Lenght') 55 | # naming the y-axis 56 | plt.ylabel('PMF') 57 | # plot title 58 | plt.title('Packet Lenght Distribution Map') 59 | 60 | for i in range(len(pkt_lenght)): 61 | if(pkt_lenght[i]>=0 and pkt_lenght[i]<=200): 62 | n_200 = n_200 + values[i] 63 | elif(pkt_lenght[i]>200 and pkt_lenght[i]<=400): 64 | n_400 = n_400 + values[i] 65 | elif(pkt_lenght[i]>400 and pkt_lenght[i]<=600): 66 | n_600 = n_600 + values[i] 67 | elif(pkt_lenght[i]>600 and pkt_lenght[i]<=800): 68 | n_800 = n_800 + values[i] 69 | elif(pkt_lenght[i]>800 and pkt_lenght[i]<=1000): 70 | n_1000 = n_1000 + values[i] 71 | elif(pkt_lenght[i]>1000 and pkt_lenght[i]<=1200): 72 | n_1200 = n_1200 + values[i] 73 | elif(pkt_lenght[i]>1200 and pkt_lenght[i]<=1500): 74 | n_1500 = n_1500 + values[i] 75 | else: 76 | bigger_1500 = bigger_1500 + values[i] 77 | # setting ticks for x-axis 78 | x_tickes = [n_200,n_400,n_600,n_800,n_1000,n_1200,n_1500,bigger_1500] 79 | x_ticklabels = ['0-200', '200-400', '400-600', '600-800','800-1000' 80 | ,'1000-1200','1200-1500','1500-bigger'] 81 | y_tickes = [round((x/total_packets)) for x in x_tickes ] 82 | y_tickLabels = [0,15,30,45,60,75,90,100] 83 | 84 | plt.bar(x_ticklabels, y_tickes, color ='maroon',width =0.4) 85 | 
plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1)) 86 | 87 | plt.show() 88 | 89 | return -------------------------------------------------------------------------------- /ip_masking.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a function called ip_masking(). This function takes a packet as input and masks the source and destination IP addresses of the packet. 3 | 4 | The function first gets the IP header of the packet. The IP header is a 20-byte header that contains information about the source and destination IP addresses of the packet, as well as other information. 5 | 6 | The function then splits the source and destination IP addresses into their individual octets. An octet is a group of 8 bits, which is equivalent to one byte. 7 | 8 | The function then randomly selects one of the octets from the source and destination IP addresses. It then sets the value of this octet to 0. 9 | 10 | The function then updates the source and destination IP addresses of the packet with the masked octets. 11 | 12 | Finally, the function returns the masked IP header of the packet. 13 | """ 14 | 15 | def ip_masking(packet): 16 | #hex_packet = binascii.hexlify(bytes(packet)) 17 | ip_header_hex_packet = binascii.hexlify(bytes(packet[Ether].payload))[:(packet[IP].ihl*4)*2] 18 | # ip masking 19 | src_addr = str(packet[IP].src) 20 | dst_addr = str(packet[IP].dst) 21 | src = src_addr.split('.') 22 | dst = dst_addr.split('.') 23 | src_index = src.index(random.choice(src)) 24 | dst_index = dst.index(random.choice(dst)) 25 | src[src_index] = '0' 26 | dst[dst_index] = '0' 27 | msk_scr = '.'.join(src) 28 | msk_dst = '.'.join(dst) 29 | packet[IP].src = msk_scr 30 | packet[IP].dst = msk_dst 31 | msk_ip_header_hex_packet = binascii.hexlify(bytes(packet[Ether].payload))[:(packet[IP].ihl*4)*2] 32 | return msk_ip_header_hex_packet -------------------------------------------------------------------------------- /load_pcap_datatype.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a function called load_pcap_datatype(). This function loads the packets from a set of pcap files, extracts the header and payload information, and saves it to a CSV file. 3 | 4 | The function first defines a variable called root_must_normalized_dir that stores the directory where the normalized pcap files will be saved. 5 | 6 | The function then defines a variable called normalized_files_name that stores the names of the normalized pcap files. 7 | 8 | The function then defines a variable called chunk_size that specifies the size of the chunks that the pcap files will be broken into. 9 | 10 | The function then loads the TLS and SSL layers. 11 | 12 | The function then iterates over the files in the file_name_dict dictionary. For each file, the function breaks the file into chunks and saves the chunks to the root_must_normalized_dir directory. 13 | 14 | The function then extracts the header and payload information from the packets in the normalized_files_name list and saves it to a CSV file. 15 | 16 | The function finally returns the normalized packets. 
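
A minimal usage sketch (the path and label are illustrative):

    extracted_dir = load_pcap_datatype({'/media/mehdi/linux/data/vpn_skype_chat.pcap': 3})

Each key is a pcap file path and each value is the numeric class label for that capture. Captures
larger than about 20 MB are first split into roughly 10 MB chunks, each chunk is read with
rdpcap(), and the packets are handed to extract_header_payload_packets(), which writes the
extracted CSV files into the directory returned here.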
17 | 18 | """ 19 | from Break_Data_File import Break_Data_File 20 | 21 | def load_pcap_datatype(file_name_dict): 22 | 23 | 24 | normalized_files_name = [] 25 | # must be deleted 26 | test_n_filename = [] 27 | chunk_size = 40000 28 | load_layer("tls") 29 | load_layer("ssl") 30 | 31 | 32 | 33 | for k,v in file_name_dict.items(): 34 | chunk_size_file = 10 35 | # get file size in MB 36 | size = get_file_size(k, SIZE_UNIT.BYTES) 37 | print('Size of file is : ', size, 'Byte') 38 | # breake file based on pcap size file 39 | if(size > 2*chunk_size_file*1024*1024 ): 40 | Break_Data_File(k,chunk_size_file) 41 | 42 | working_directory = os.path.splitext(os.path.basename(k))[0] 43 | directory = os.path.splitext(os.path.basename(k))[0] 44 | # Parent Directory path 45 | parent_dir = os.path.dirname(k) 46 | # Path 47 | path = os.path.join(parent_dir, directory) 48 | files = os.listdir(path) 49 | # in this section we show histogram of packet length 50 | histogram_Dataset(files,path) 51 | for f in files: 52 | print('befor reading time is :{}'.format(datetime.now().time())) 53 | packets = rdpcap(os.path.join(path,f)) 54 | print('after reading time is :{}'.format(datetime.now().time())) 55 | print("file:{} has been read".format((os.path.join(path,f)))) 56 | extracted_packet_root_dir = extract_header_payload_packets(packets,os.path.join(path,f) , v) 57 | return extracted_packet_root_dir 58 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # This is a sample Python script. 2 | 3 | # Press Shift+F10 to execute it or replace it with your code. 4 | # Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings. 5 | from __future__ import absolute_import, division, print_function, unicode_literals 6 | 7 | import base64 8 | import binascii 9 | import enum 10 | from typing import List, Any, Union 11 | 12 | import numpy as np 13 | import pandas as pd 14 | import tensorflow as tf 15 | import tensorflow_datasets as tfds 16 | import matplotlib.pyplot as plt 17 | import seaborn as sns 18 | import matplotlib.ticker as mtick 19 | from matplotlib.ticker import AutoMinorLocator 20 | import plotly 21 | from docutils.nodes import date 22 | from pycodestyle import BaseReport 23 | from pytest import collect 24 | import os 25 | import csv 26 | import glob 27 | import time 28 | import datetime 29 | 30 | from scapy.layers.dns import DNS 31 | from scapy.layers.inet import IP, TCP, UDP 32 | from scapy.layers.l2 import Ether 33 | from scapy.layers.tls.handshake import TLSClientHello, TLSServerHello, TLSCertificateVerify 34 | from scapy.layers.tls.handshake_sslv2 import SSLv2ClientHello, SSLv2ServerHello 35 | from scapy.layers.tls.record import TLS 36 | from scapy.layers.tls.record import * 37 | from scapy.layers.tls.record_sslv2 import SSLv2 38 | from scapy.layers.tls.record_sslv2 import * 39 | 40 | 41 | #from scapy.layers.tls.record_sslv2 import SSLv3 42 | from sphinx.testing.path import path 43 | from tensorflow import keras as ks 44 | #from tensorflow.keras.layers import la 45 | #from keras import layers 46 | #from keras import models 47 | 48 | #new import for test 49 | from tensorflow.keras import layers 50 | from tensorflow.keras import models 51 | #new import for test 52 | 53 | from scipy.io import arff 54 | from scapy.all import * 55 | from scapy.utils import RawPcapReader 56 | from sklearn.model_selection import train_test_split 57 | from collections import Counter 
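# NOTE (illustrative sketch): several project modules have '-' or spaces in their file names
# (e.g. "define_Bi-LSTM_model_params.py", "memory usage-execution time.py"), so the plain
# "from ... import ..." statements a few lines below are not valid import syntax. One way to
# load such a file from its path is importlib; the helper name here is an assumption.
import importlib.util

def load_module_from_file(py_file):
    # build a module object directly from the .py file path instead of an import statement
    module_name = os.path.splitext(os.path.basename(py_file))[0].replace('-', '_').replace(' ', '_')
    spec = importlib.util.spec_from_file_location(module_name, py_file)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module

# e.g. measure_execution_memory = load_module_from_file("memory usage-execution time.py").measure_execution_memory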
58 | from contextlib import redirect_stdout 59 | import b_colors 60 | 61 | import pktDirection 62 | 63 | from define_FC_model_params import define_FC_model_params 64 | from define_SAE_model_params import define_SAE_model_params 65 | from define_BiLSTM_model_params import define_BiLSTM_model_params # module assumed renamed from define_Bi-LSTM_model_params.py (hyphens are not valid in Python module or function names) 66 | from network_parameters_initializer import network_parameters_initializer 67 | from load_pcap_datatype import load_pcap_datatype 68 | from read_pcap_files import read_pcap_files 69 | from packet_normalization import packet_normalization 70 | from define_GAN_model import process_csv_files 71 | from preprocessing_traffic_label import read_app_pcap_files # module assumed renamed from preprocessing-traffic-label.py; an import takes no call parentheses 72 | # the remaining entry points used below are imported here; their source files are likewise assumed renamed to valid module names 73 | from defin_1D_CNN_model_params import defin_1D_CNN_model_params; from cnn_Traffic_classification import cnn_Traffic_classification 74 | from BiLSTM_Traffic_classification import BiLSTM_Traffic_classification; from SAE_Traffic_classification import SAE_Traffic_classification; from FC_traffic_classification import FC_traffic_classification 75 | # Enum for size units 76 | class SIZE_UNIT(enum.Enum): 77 | BYTES = 1 78 | KB = 2 79 | MB = 3 80 | GB = 4 81 | def convert_unit(size_in_bytes, unit): 82 | """ Convert the size from bytes to other units like KB, MB or GB""" 83 | if unit == SIZE_UNIT.KB: 84 | return round(size_in_bytes/1024,3) 85 | elif unit == SIZE_UNIT.MB: 86 | return size_in_bytes/(1024*1024) 87 | elif unit == SIZE_UNIT.GB: 88 | return round(size_in_bytes/(1024*1024*1024),3) 89 | else: 90 | return size_in_bytes 91 | 92 | def get_file_size(file_name, size_type = SIZE_UNIT.BYTES ): 93 | """ Get the file size in the given unit (BYTES, KB, MB or GB)""" 94 | size = os.path.getsize(file_name) 95 | return convert_unit(size, size_type) 96 | 97 | def print_hi(name): 98 | # Use a breakpoint in the code line below to debug your script. 99 | print(f'Hi, {name}') # Press Ctrl+F8 to toggle the breakpoint. 100 | 101 | 102 | # Press the green button in the gutter to run the script. 103 | if __name__ == '__main__': 104 | print_hi('PyCharm') 105 | 106 | # See PyCharm help at https://www.jetbrains.com/help/pycharm/ 107 | 108 | 109 | "===============================version of tensorflow and keras=======================" 110 | 111 | print("TensorFlow version: {}".format(tf.__version__)) 112 | print("Eager execution is: {}".format(tf.executing_eagerly())) 113 | print("Keras version: {}".format(tf.keras.__version__)) 114 | 115 | "====================test on cpu or gpu =======================" 116 | if tf.test.is_gpu_available(): 117 | print('Running on GPU') 118 | print('GPU #0?') 119 | 120 | else: 121 | print('Running on CPU') 122 | print(b_colors.bcolors.warning("This is dangerous")) 123 | my_macs = [get_if_hwaddr(i) for i in get_if_list()] 124 | print('mac address:{}'.format(str(my_macs))) 125 | print(Ether().src) 126 | 127 | 128 | 129 | root_normalized_dir = 'media/mehdi/linux/normalized_data/' 130 | root_app_normalized_dir = 'media/mehdi/linux/normalized_app_data/' 131 | sae_extracted_feature_file = 'mehdi/linux' # renamed: hyphens are not valid in a Python variable name 132 | # read the PCAP files; the class label is extracted from each file name
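# read_pcap_files() assigns traffic-type labels (chat, email, audio/video streaming, FTP, P2P, Tor) from the file names; read_app_pcap_files() assigns per-application labels (Skype, Netflix, Gmail, ...).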
133 | extracted_packet_root_dir = read_pcap_files() 134 | extracted_app_root_dir = read_app_pcap_files() 135 | files = [] 136 | 137 | for file in os.listdir(extracted_packet_root_dir): 138 | if os.path.isfile(os.path.join(extracted_packet_root_dir, file)): 139 | files.append(file) 140 | app_files = [] 141 | for file in os.listdir(extracted_app_root_dir): 142 | if os.path.isfile(os.path.join(extracted_app_root_dir, file)): 143 | app_files.append(file) # append to app_files, not files 144 | # Do packet normalization 145 | packet_normalization(files,1) 146 | packet_normalization(app_files,0) 147 | # define network model parameters (hyperparameters) 148 | net_params = network_parameters_initializer() 149 | # GAN model for producing synthesized data 150 | process_csv_files() 151 | # Define the 1D-CNN model parameters 152 | cnn_model_params = defin_1D_CNN_model_params() 153 | #====================== for traffic type classification ==================== 154 | cnn_output,cnn_saved_model,cnn_saved_weights,cnn_path = cnn_Traffic_classification(root_normalized_dir,net_params,cnn_model_params) 155 | # ============for application classification ============================== 156 | cnn1_output,cnn1_saved_model,cnn1_saved_weights,app_cnn_path = cnn_Traffic_classification(root_app_normalized_dir,net_params,cnn_model_params) 157 | # Define the Bi-LSTM model parameters 158 | #======================= for traffic type classification ==================== 159 | bilstm_model_params = define_BiLSTM_model_params() 160 | bilstm_output,bilstm_saved_model,bilstm_saved_weights,bilstm_path = BiLSTM_Traffic_classification(root_normalized_dir,net_params,bilstm_model_params) 161 | #======================= for application classification ==================== 162 | bilstm1_output,bilstm1_saved_model,bilstm1_saved_weights,bilstm1_path = BiLSTM_Traffic_classification(root_app_normalized_dir,net_params,bilstm_model_params) 163 | # Define the SAE model parameters 164 | SAE_model_params = define_SAE_model_params() 165 | sae_output,sae_saved_models,sae_saved_weights,sae_path = SAE_Traffic_classification(sae_extracted_feature_file,net_params,SAE_model_params) 166 | # Combine the outputs into a single input array 167 | fc_model_params = define_FC_model_params() 168 | #====================== for traffic type classification ====================================== 169 | mlp_model = FC_traffic_classification(root_normalized_dir,net_params,fc_model_params, cnn_path,cnn_saved_model,bilstm_path,bilstm_saved_model,sae_path,sae_saved_models,sae_extracted_feature_file) 170 | # =============================== for application classification ================================= 171 | mlp_model2 = FC_traffic_classification(root_app_normalized_dir,net_params,fc_model_params, app_cnn_path,cnn1_saved_model,bilstm1_path,bilstm1_saved_model,sae_path,sae_saved_models,sae_extracted_feature_file) 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /memory usage-execution time.py: -------------------------------------------------------------------------------- 1 | import time 2 | import psutil 3 | 4 | # Function to get current memory usage 5 | def get_memory_usage(): 6 | process = psutil.Process() 7 | mem_info = process.memory_info() 8 | return mem_info.rss # Resident Set Size (memory usage) 9 | 10 | # Function to measure execution time and memory consumption of another function 11 | def measure_execution_memory(func): 12 | def wrapper(*args, **kwargs): 13 | # Record
start time 14 | start_time = time.time() 15 | 16 | # Measure memory usage before execution 17 | start_memory = get_memory_usage() 18 | 19 | # Execute the wrapped function 20 | result = func(*args, **kwargs) 21 | 22 | # Measure memory usage after execution 23 | end_memory = get_memory_usage() 24 | 25 | # Calculate execution time 26 | execution_time = time.time() - start_time 27 | 28 | # Calculate memory consumption 29 | memory_consumption_bytes = end_memory - start_memory 30 | memory_consumption_megabytes = memory_consumption_bytes / (1024 * 1024) # Convert to megabytes 31 | 32 | print(f"Function '{func.__name__}' executed in {execution_time:.2f} seconds.") 33 | print(f"Memory consumption during execution: {memory_consumption_megabytes:.2f} MB") 34 | 35 | return result # Return the result of the wrapped function 36 | 37 | return wrapper 38 | -------------------------------------------------------------------------------- /metric-evaluation.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | 3 | def recall_m(y_true, y_pred): 4 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 5 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 6 | recall = true_positives / (possible_positives + K.epsilon()) 7 | return recall 8 | 9 | def precision_m(y_true, y_pred): 10 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 11 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 12 | precision = true_positives / (predicted_positives + K.epsilon()) 13 | return precision 14 | 15 | def f1_m(y_true, y_pred): 16 | precision = precision_m(y_true, y_pred) 17 | recall = recall_m(y_true, y_pred) 18 | return 2*((precision*recall)/(precision+recall+K.epsilon())) 19 | 20 | -------------------------------------------------------------------------------- /network_parameters_initializer.py: -------------------------------------------------------------------------------- 1 | def network_parameters_initializer(): 2 | 3 | network_parameters_dict = {} 4 | network_parameters_dict['BATCH_SIZE'] = 64 5 | network_parameters_dict['LEARNING_RATE'] = 0.0001 6 | network_parameters_dict['EPOCH'] = 50 7 | network_parameters_dict['VERBOSE'] = 1 8 | network_parameters_dict['VALIDATION_SPLIT'] = 0.1 9 | network_parameters_dict['NUM_CLASSES'] = 3 10 | network_parameters_dict['OPTIMIZER'] = 'Adam' 11 | network_parameters_dict['LOSS_FUNCTION'] = 'categorical_crossentropy' 12 | network_parameters_dict['METRICS'] = ['accuracy',recall_m,precision_m,f1_m] 13 | network_parameters_dict['DROPOUT'] = 0.30 14 | network_parameters_dict['HIDEN_ACTIVATION_FUNCTION'] = 'relu' 15 | network_parameters_dict['OUTPUT_ACTIVATION_FUNCTION'] = 'relu' 16 | network_parameters_dict['DENSE_LAYER_ACTIVATION_FUNCTION'] = ('relu','relu') 17 | network_parameters_dict['SOFTMAX_LAYER_ACTIVATION_FUNCTION'] = 'softmax' 18 | 19 | return network_parameters_dict 20 | -------------------------------------------------------------------------------- /packet_normalization.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a function called packet_normalization(). This function takes a list of normalized packet files as input and normalizes the packets in each file. 3 | 4 | The function first declares a chunk size of 50000 bytes. This is the size of each chunk that the function will break the normalized packet files into. 5 | 6 | The function then reads the normalized packet files one by one. 
For each file, it reads the payload, IP header and class label columns. 7 | 8 | The payload and IP header hex strings are split into bytes; each byte is converted to an integer, divided by 255 and truncated to decPlace (4) decimal places, so every value falls in [0, 1]. 9 | 10 | The function then merges the header and payload values into a single vector, converts the vector to a comma-separated string and stores it in the packet_normalized_data column of a Pandas DataFrame. 11 | 12 | The function then saves the DataFrame to a new CSV file and breaks that file into smaller chunks of chunk_size_file (50000). 13 | 14 | Finally, the function returns the directory where the normalized and split files are saved. 15 | 16 | 17 | 18 | """ 19 | from Break_CSV_File import Break_CSV_File 20 | def packet_normalization(normalized_files_name, type): 21 | # chunk size used when splitting the output CSV 22 | chunk_size_file = 50000 23 | 24 | # choose the output directory based on the label type (1 = traffic type, 0 = application) 25 | 26 | if type== 1: 27 | normalized_dir = 'media/mehdi/linux/normalized_data/' 28 | else: 29 | normalized_dir = 'media/mehdi/linux/normalized_app_data/' 30 | df_normalized = pd.DataFrame(columns=['packet_normalized_data', 'class_label']) 31 | binary = "{0:08b}".format(int("1a", 16)) # example hex-to-binary conversion, not used below 32 | col_list = ["payload","ip_header", "class_label"] 33 | n = 2 # two hex characters per byte 34 | decPlace = 4 35 | payload_list = [] 36 | header_list = [] 37 | final_packet_vector = [] 38 | for index1 in range(len(normalized_files_name)): 39 | 40 | # df = pd.read_csv("packet.csv", usecols=col_list) 41 | df = pd.read_csv(normalized_files_name[index1], usecols=col_list) 42 | print("file:{} has been read".format(normalized_files_name[index1])) 43 | for index, row in df.iterrows(): 44 | print("index row is :{}".format(index)) 45 | if index == 65533: # debug checkpoint near the CSV chunk boundary 46 | print("reached row 65533") 47 | payload = row["payload"].replace("'", "")[1:] 48 | header = row["ip_header"].replace("'", "")[1:] 49 | # convert hex format to binary format and binary format to decimal 50 | 51 | for i in range(0, len(payload), n): 52 | payload_list.append(payload[i:i + n]) 53 | for j in range(0, len(header), n): 54 | header_list.append(header[j:j + n]) 55 | for i in range(len(payload_list)): 56 | # convert hex to binary 57 | payload_list[i] = "{0:08b}".format(int(payload_list[i], 16)) 58 | # convert binary to decimal and normalize the value into [0, 1] 59 | payload_list[i] = int(int(payload_list[i], 2) / 255.0 * 10 ** decPlace) / 10 ** decPlace 60 | for j in range(len(header_list)): 61 | # convert hex to binary 62 | header_list[j] = "{0:08b}".format(int(header_list[j], 16)) 63 | # convert binary to decimal and normalize the value into [0, 1] 64 | header_list[j] = int(int(header_list[j], 2) / 255.0 * 10 ** decPlace) / 10 ** decPlace 65 | 66 | # append the payload values after the header values 67 | final_packet_vector = header_list 68 | for data in payload_list: 69 | final_packet_vector.append(data) 70 | 71 | # convert list to string 72 | normalized_packet = ','.join([str(elem) for elem in final_packet_vector]) 73 | df_normalized.loc[len(df_normalized.index)] = [normalized_packet,row["class_label"]] 74 | payload_list = [] 75 | header_list = [] 76 | final_packet_vector = [] 77 | normalized_packet = '' 78 | base_filename = os.path.basename(normalized_files_name[index1]) 79 | new_filename = normalized_dir +'normalized_'+ base_filename 80 | df_normalized.to_csv(new_filename) 81 | # in this section we break the large csv file into smaller ones 82 | 
Break_CSV_File(new_filename,chunk_size_file,normalized_dir) 83 | df_normalized = df_normalized[0:0] 84 | return normalized_dir 85 | -------------------------------------------------------------------------------- /packet_zero_pading.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | The code you have provided is a function called packet_zero_padding(). This function takes a list of packets as input and pads the packets with zeros if their length is less than 1500 bytes. 4 | 5 | The function first iterates over the list of packets. For each packet, the function gets the IP header and the payload length. 6 | 7 | If the packet length is less than 1500 bytes, the function creates a padding object with a length of 1500 - packet length. The padding object is then added to the packet. 8 | 9 | The function then prints the length of the padding object. 10 | """ 11 | 12 | def packet_zero_pading (packets): 13 | namey = 'mehdi' 14 | print(namey.ljust(8,'0')) 15 | for pkt in packets: 16 | print(len(pkt)) 17 | header_length = pkt[IP].ihl 18 | payload_length = pkt[IP].len - (header_length * 32)/8 19 | print(binascii.hexlify(struct.pack('i', 00))) 20 | print('{:x}'.format(123)) 21 | if (pkt[IP].len) < 1500: 22 | pad_len = 1500 - len(pkt[IP]) 23 | pad_str_len = int(pad_len)*2 24 | pad = Padding() 25 | pad.load = '\x00' * int(pad_len) 26 | firstdata = binascii.hexlify(bytes(pkt[Raw])) 27 | pkt = pkt / pad 28 | layer = pkt.getlayer(1) 29 | if layer.haslayer(Raw) and layer.haslayer(IP): 30 | print(b_colors.bcolors.OKBLUE + '\n[Info] Found the following (' + layer.name + ' layer): ' + layer.src + " -> " + layer.dst + b_colors.bcolors.ENDC) 31 | tcpdata = layer.getlayer(Raw).load 32 | padding = binascii.hexlify(bytes(layer.getlayer(Padding).load)) 33 | #padding2 = binascii.hexlify(bytes(b'\x00\x00\x00\00')) 34 | print(hexdump(pkt[Raw].load)) 35 | lastdata = binascii.hexlify(bytes(pkt[Raw])) 36 | mydata = lastdata.decode() 37 | print('before padding len is:{}'.format(len(mydata))) 38 | print(mydata) 39 | mydata = mydata.ljust(300, '0') 40 | print('after padding len is:{}'.format(len(mydata))) 41 | print(mydata) 42 | #print(hexdump(pkt[Padding].load)) 43 | 44 | 45 | print(len(pad)) 46 | if len(pkt[IP]) == 1500: 47 | print(pkt.show()) 48 | """ 49 | if not isinstance(packet[TCP].payload, scapy.packet.NoPayload): 50 | payload = json.loads(bytes(packet[TCP].payload).decode('utf-8')) 51 | p.update(payload) 52 | p['_data'] = base64.b64decode(payload['data']).decode('utf-8') 53 | p.__delitem__('data') 54 | arr.append(p) 55 | """ -------------------------------------------------------------------------------- /plot_heatmap_result.py: -------------------------------------------------------------------------------- 1 | def plot_heatmap(): 2 | uniform_data = np.random.rand(10, 12) 3 | ax = sns.heatmap(uniform_data, linewidth=0.5,cmap='winter',annot=True) 4 | plt.show() 5 | return 6 | 7 | def plot_heatmap_result(): 8 | rootDir = '/home/mehdi/PycharmProjects/pythonProject/result-heatmap/' 9 | csv_files = os.listdir(rootDir) 10 | for f in csv_files: 11 | # read the csv file 12 | path = os.path.join(rootDir, f) 13 | data = pd.read_csv(path) 14 | fig, ax = plt.subplots(figsize=(11,9)) 15 | sns.heatmap(data.corr(), cmap='winter') 16 | plt.show() 17 | return -------------------------------------------------------------------------------- /pmf_Dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a 
function called pmf_Dataset(). This function calculates the probability mass function (PMF) of the packet lengths in a set of CSV files. 3 | 4 | The function first defines a list called csv_files that stores the names of the CSV files in the specified directory. 5 | 6 | The function then defines a list called my_temp_list that stores the number of packets in each length range. The my_packet_length_list list is initialized with 0s. 7 | 8 | The function then iterates over the csv_files list. For each CSV file, the function reads the file and stores the number of packets in each length range in the my_temp_list list. 9 | 10 | The function then calculates the PMF of the packet lengths by dividing the number of packets in each length range by the total number of packets. 11 | 12 | The function finally returns the PMF of the packet lengths. 13 | 14 | The first few lines define the csv_files list and the my_temp_list list. 15 | 16 | The next few lines iterate over the csv_files list. For each CSV file, the function reads the file and stores the number of packets in each length range in the my_temp_list list. 17 | 18 | The next few lines calculate the PMF of the packet lengths by dividing the number of packets in each length range by the total number of packets. 19 | 20 | The final few lines format the PMF as strings and return it. 21 | 22 | """ 23 | 24 | def pmf_Dataset(rootDir): 25 | csv_files = os.listdir(rootDir) 26 | my_temp_list = [] 27 | my_packet_length_list = [0]*9 28 | my_packet_length_list = [float(x) for x in my_packet_length_list] 29 | # loop over the list of csv files 30 | for f in csv_files: 31 | # read the csv file 32 | path = os.path.join(rootDir, f) 33 | df = pd.read_csv(path,usecols=['Topic / Item','Count']) 34 | 35 | # print the location and filename 36 | print('Location:', path) 37 | df_percent= df.iloc[1:10,1:2] 38 | my_temp_list = df_percent['Count'].values.tolist() 39 | for item in range(len(my_temp_list)): 40 | #my_temp_list[item] = str(my_temp_list[item]).replace("%","") 41 | my_packet_length_list[item] = my_packet_length_list[item] + my_temp_list[item] 42 | print(my_temp_list) 43 | # print the content 44 | print('Content:') 45 | print(df) 46 | print() 47 | total_packet = sum(my_packet_length_list) 48 | packet_lenght_percent = [(float(x/total_packet)*100) for x in my_packet_length_list] 49 | print(packet_lenght_percent) 50 | final_packet_lenght = ['{:.2f}'.format(x) for x in packet_lenght_percent] 51 | 52 | return -------------------------------------------------------------------------------- /preprocessing-traffic-label.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from scapy.all import * 4 | def get_application_from_filename(filename): 5 | """Gets the application of a packet from the filename. 6 | 7 | Args: 8 | filename: The filename of the packet. 9 | 10 | Returns: 11 | The application of the packet, or None if the application cannot be determined. 
12 | """ 13 | # Create a dictionary to map filename prefixes to application names 14 | prefix_to_application = { 15 | 'icq': 'icq', 16 | 'aim-chat': 'aim-chat', 17 | 'chat_facebook': 'chat_facebook', 18 | 'chat_hangout': 'chat_hangout', 19 | 'chat_gmail': 'chat_gmail', 20 | 'chat_skype': 'chat_skype', 21 | 'email': 'email', 22 | 'gmail': 'gmail', 23 | 'ftps': 'ftps', 24 | 'sftp': 'sftp', 25 | 'scp': 'scp', 26 | 'ftp_skype': 'ftp_skype', 27 | 'torrent': 'torrent', 28 | 'tor': 'tor', 29 | 'youtube': 'youtube', 30 | 'netflix': 'netflix', 31 | 'spotify': 'spotify', 32 | 'vimeo': 'vimeo', 33 | 'streaming_skype': 'streaming_skype', 34 | 'voip_skype': 'voip_skype', 35 | 'voipbuster': 'voipbuster', 36 | 'voip_hangout': 'voip_hangout', 37 | 'voip_facebook': 'voip_facebook' 38 | } 39 | 40 | # Get the prefix of the filename and convert it to lowercase 41 | prefix = filename.split(".")[0].lower() 42 | 43 | # Use the dictionary to determine the application 44 | application = prefix_to_application.get(prefix, None) 45 | 46 | return application 47 | 48 | def get_label(filename): 49 | """Gets the label of a packet from the filename. 50 | 51 | Args: 52 | filename: The filename of the packet. 53 | 54 | Returns: 55 | The label of the packet, or None if the label cannot be determined. 56 | """ 57 | 58 | # Get the prefix of the filename. 59 | prefix = filename.split(".")[0].lower() 60 | 61 | # Determine the label based on the prefix. 62 | if prefix.startswith("vpn_"): 63 | label = "VPN" 64 | else: 65 | label = "Non-VPN" 66 | 67 | return label 68 | # Function to mask the IP layer header 69 | def ip_mask(packet): 70 | if IP in packet: 71 | packet[IP].src = '0.0.0.0' 72 | packet[IP].dst = '0.0.0.0' 73 | 74 | # Function to scale a packet chunk's byte values to [0, 1] 75 | def normalize_packet(chunk): 76 | raw_data = bytes(chunk) 77 | # a bytes object cannot hold floats, so the scaled values are returned as a list 78 | normalized_data = [byte / 255.0 for byte in raw_data] 79 | return normalized_data 80 | 81 | # Function to split a packet into 1500-byte chunks, zero-padding the last chunk 82 | def split_and_pad(packet): 83 | data = bytes(packet) 84 | if len(data) > 1500: 85 | split_packets = [data[i:i + 1500] for i in range(0, len(data), 1500)] 86 | if len(split_packets[-1]) < 1500: 87 | split_packets[-1] += b'\x00' * (1500 - len(split_packets[-1])) 88 | return split_packets 89 | else: 90 | return [data] 91 | 92 | 93 | # Function to categorize file types based on the filename 94 | def categorize_file_type(filename): 95 | if "vpn" in filename: 96 | if "chat" in filename: 97 | return 1 98 | elif "email" in filename: 99 | return 3 100 | elif any(word in filename for word in ["facebook_audio", "hangouts_audio", "skype_audio", "voip"]): 101 | return 5 102 | elif any(word in filename for word in ["ftp", "file", "scp", "sftp"]): 103 | return 7 104 | elif any(word in filename for word in ["vimeo", "youtube", "netflix", "hangouts_video", "facebook_video", "skype_video"]): 105 | return 9 106 | elif "Torrent01" in filename: 107 | return 11 108 | elif "tor" in filename: 109 | return 13 110 | else: # Non-VPN traffic 111 | if "chat" in filename: 112 | return 0 113 | elif "email" in filename: 114 | return 2 115 | elif any(word in filename for word in ["facebook_audio", "hangouts_audio", "skype_audio", "voip"]): 116 | return 4 117 | elif any(word in filename for word in ["ftp", "file", "scp", "sftp"]): 118 | return 6 119 | elif any(word in filename for word in ["vimeo", "youtube", "netflix", "hangouts_video", "facebook_video", "skype_video"]): 120 | return 8
121 | elif "Torrent01" in filename: 122 | return 10 123 | elif "tor" in filename: 124 | return 14 125 | return None 126 | def read_app_pcap_files(): 127 | #file_list = [x for x in os.listdir('/home/mehdi') if x.endswith(".pcap")] 128 | #print(file_list) 129 | root_dir = '/media/mehdi/linux/data/CompletePCAPs' 130 | file_name_list_full_path = [] 131 | file_name_list = [] 132 | file_name_dict = {} 133 | for path in os.listdir(root_dir): 134 | full_path = os.path.join(root_dir, path) 135 | if os.path.isfile(full_path) and (path.endswith(".pcap") or path.endswith("pcapng")): 136 | print(full_path) 137 | file_name_list_full_path.append(full_path) 138 | file_name_list.append(path) 139 | 140 | # find category of ISCX VPN-NONVPN DATASET 141 | for i in range(len(file_name_list_full_path)): 142 | app_name = get_application_from_filename(file_name_list[i]) or '' # use the base name and guard against unknown prefixes (None) 143 | if "icq" in app_name: 144 | file_name_dict[file_name_list_full_path[i]] = 1 #"icq" 145 | elif "chat_facebook" in app_name: 146 | file_name_dict[file_name_list_full_path[i]] = 2 #"chat_facebook" 147 | elif "chat_hangout" in app_name: 148 | file_name_dict[file_name_list_full_path[i]] = 3 #"chat_hangout" 149 | elif "chat_gmail" in app_name: 150 | file_name_dict[file_name_list_full_path[i]] = 4 #"chat_gmail" 151 | elif "chat_skype" in app_name: 152 | file_name_dict[file_name_list_full_path[i]] = 5 #"chat_skype" 153 | elif "email" in app_name: 154 | file_name_dict[file_name_list_full_path[i]] = 6 #"email" 155 | elif "gmail" in app_name: 156 | file_name_dict[file_name_list_full_path[i]] = 4 #"gmail" 157 | elif "ftps" in app_name: 158 | file_name_dict[file_name_list_full_path[i]] = 7 #"ftps" 159 | elif "sftp" in app_name: 160 | file_name_dict[file_name_list_full_path[i]] = 8 #"sftp" 161 | elif "scp" in app_name: 162 | file_name_dict[file_name_list_full_path[i]] = 9 #"scp" 163 | elif "ftp_skype" in app_name: 164 | file_name_dict[file_name_list_full_path[i]] = 5 #"ftp_skype" 165 | elif "torrent" in app_name: 166 | file_name_dict[file_name_list_full_path[i]] = 10 #"torrent" 167 | elif "youtube" in app_name: 168 | file_name_dict[file_name_list_full_path[i]] = 11 #"youtube" 169 | elif "netflix" in app_name: 170 | file_name_dict[file_name_list_full_path[i]] = 12 #"netflix" 171 | elif "spotify" in app_name: 172 | file_name_dict[file_name_list_full_path[i]] = 13 #"spotify" 173 | elif "vimeo" in app_name: 174 | file_name_dict[file_name_list_full_path[i]] = 14 #"vimeo" 175 | elif "streaming_skype" in app_name: 176 | file_name_dict[file_name_list_full_path[i]] = 5 #"streaming_skype" 177 | elif "voip_skype" in app_name: 178 | file_name_dict[file_name_list_full_path[i]] = 5 #"voip_skype" 179 | elif "voipbuster" in app_name: 180 | file_name_dict[file_name_list_full_path[i]] = 15 #"voipbuster" 181 | elif "voip_hangout" in app_name: 182 | file_name_dict[file_name_list_full_path[i]] = 3 #"voip_hangout" 183 | elif "voip_facebook" in app_name: 184 | file_name_dict[file_name_list_full_path[i]] = 2 #"voip_facebook" 185 | elif "aim-chat" in app_name: 186 | file_name_dict[file_name_list_full_path[i]] = 16 #"AIM-Chat" 187 | elif "tor" in app_name: 188 | file_name_dict[file_name_list_full_path[i]] = 17 #"tor" 189 | else: 190 | pass 191 | 192 | 193 | print(file_name_dict) 194 | extracted_root_dir = load_pcap_datatype(file_name_dict) 195 | 196 | return extracted_root_dir 197 | def read_app_pcap_load(): 198 | # Path to the directory containing PCAP files 199 | pcap_directory = '/path/to/pcap/files' 200 | 201 | # Create a CSV file for saving normalized packets
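# For each packet kept by the filter below, the IP addresses are masked, the packet is split into zero-padded 1500-byte chunks, every byte is scaled into [0, 1], and one CSV row is written per chunk together with the traffic-type label and the application label.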
202 | csv_file = open('normalized_packets.csv', 'w', newline='') 203 | csv_writer = csv.writer(csv_file) 204 | 205 | # Iterate through each PCAP file in the directory 206 | for pcap_file in os.listdir(pcap_directory): 207 | if pcap_file.endswith('.pcap'): 208 | label = categorize_file_type(pcap_file) 209 | application = get_application_from_filename(pcap_file) or '' # guard against unknown prefixes 210 | app_label = get_label(pcap_file) 211 | if label is not None: 212 | packets = rdpcap(os.path.join(pcap_directory, pcap_file)) 213 | 214 | for packet in packets: 215 | if Raw in packet and packet.haslayer(Ether) and packet[Ether].type == 0x800: 216 | ip_mask(packet) 217 | normalized_packets = split_and_pad(packet) 218 | for normalized_packet in normalized_packets: 219 | normalized_values = normalize_packet(normalized_packet) # byte values scaled to [0, 1] 220 | csv_writer.writerow([normalized_values, label, application + app_label]) 221 | 222 | # Close the CSV file 223 | csv_file.close() -------------------------------------------------------------------------------- /print_summary.py: -------------------------------------------------------------------------------- 1 | """ 2 | print_summary() takes a packet as input and prints the source and destination IP addresses and the source and destination TCP ports of the packet. 3 | 4 | The function first checks whether the packet contains an IP layer. If it does, it reads the source and destination IP addresses. 5 | 6 | The function then checks whether the packet contains a TCP layer. If it does, it reads the source and destination TCP ports. 7 | 8 | Finally, the function prints the addresses and ports it has found. 9 | 10 | The first check tests for the IP layer; if the packet has no IP layer, nothing is printed. 11 | 12 | The src attribute of the IP layer gives the source IP address of the packet. 13 | 14 | The dst attribute of the IP layer gives the destination IP address of the packet. 15 | 16 | The second check tests for the TCP layer, which is not present in every IP packet; if the packet has no TCP layer, the ports are not printed. 17 | 18 | The sport attribute of the TCP layer gives the source TCP port of the packet. 19 | 20 | The dport attribute of the TCP layer gives the destination TCP port of the packet. 21 | 22 | Finally, the print statements output the source and destination IP addresses and the source and destination TCP ports of the packet.
23 | 24 | """ 25 | def print_summary(pkt): 26 | if IP in pkt: 27 | ip_src=pkt[IP].src 28 | ip_dst=pkt[IP].dst 29 | print(' IP src is: {}'.format(str(ip_src))) 30 | print(' IP dst is: {}'.format(str(ip_dst))) 31 | if TCP in pkt: 32 | tcp_sport=pkt[TCP].sport 33 | tcp_dport=pkt[TCP].dport 34 | print(' TCP sport is: {}'.format(str(tcp_sport))) 35 | print(' TCP dport is: {}'.format(str(tcp_dport))) -------------------------------------------------------------------------------- /process_metadata_pcap.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a function called process_metadata_pcap(). This function reads a pcap file and prints the metadata of the first and the last packets in the connection between the two hosts specified by the client and server parameters. 3 | 4 | The function first opens the pcap file and gets the number of packets in the file. It then iterates over the packets in the file. For each packet, the function first creates an Ether object from the packet data. The Ether class in the packet library represents an Ethernet packet. The function then checks if the type field of the Ether object is equal to 0x0800. If it is, then the packet is an IPv4 packet. 5 | 6 | The function then checks if the proto field of the IP object is equal to 6. If it is, then the packet is a TCP packet. 7 | 8 | The function then checks if the source or destination IP address of the packet matches the client or server parameter. If it does, then the function increments the interesting_packet_count variable. 9 | 10 | If the packet is an interesting packet, then the function checks if it is the first or the last packet in the connection. If it is, then the function stores the timestamp and ordinal number of the packet. 11 | 12 | Finally, the function prints the total number of packets in the file, the number of interesting packets, and the timestamps and ordinal numbers of the first and the last packets in the connection. 13 | 14 | 15 | """ 16 | 17 | # In this code iteration, we’ll access the packet’s metadata; 18 | # in particular the timestamps and ordinal numbers (i.e. packet number within the packet capture) of the first and the last packets of the connection that we’re interested in. 19 | def process_metadata_pcap(file_name): 20 | print('Opening {}...'.format(file_name)) 21 | 22 | client = '192.168.43.75:54732' 23 | server = '172.217.22.78:443' 24 | 25 | (client_ip, client_port) = client.split(':') 26 | (server_ip, server_port) = server.split(':') 27 | 28 | count = 0 29 | interesting_packet_count = 0 30 | first_pkt_timestamp = 0 31 | first_pkt_ordinal = 0 32 | first_pkt_timestamp_resolution= 0 33 | last_pkt_ordinal = 0 34 | last_pkt_timestamp_resolution = 0 35 | 36 | for (pkt_data, pkt_metadata,) in RawPcapReader(file_name): 37 | count += 1 38 | ether_pkt = Ether(pkt_data) 39 | if 'type' not in ether_pkt.fields: 40 | # LLC frames will have 'len' instead of 'type'. 
41 | # We disregard those 42 | continue 43 | 44 | if ether_pkt.type != 0x0800: 45 | # disregard non-IPv4 packets 46 | continue 47 | 48 | ip_pkt = ether_pkt[IP] 49 | 50 | if ip_pkt.proto != 6: 51 | # Ignore non-TCP packet 52 | continue 53 | 54 | if (ip_pkt.src != server_ip) and (ip_pkt.src != client_ip): 55 | # Uninteresting source IP address 56 | continue 57 | 58 | if (ip_pkt.dst != server_ip) and (ip_pkt.dst != client_ip): 59 | # Uninteresting destination IP address 60 | continue 61 | 62 | tcp_pkt = ip_pkt[TCP] 63 | 64 | if (tcp_pkt.sport != int(server_port)) and \ 65 | (tcp_pkt.sport != int(client_port)): 66 | # Uninteresting source TCP port 67 | continue 68 | 69 | if (tcp_pkt.dport != int(server_port)) and \ 70 | (tcp_pkt.dport != int(client_port)): 71 | # Uninteresting destination TCP port 72 | continue 73 | 74 | interesting_packet_count += 1 75 | if interesting_packet_count == 1: 76 | first_pkt_timestamp = (pkt_metadata.tshigh << 32) | pkt_metadata.tslow 77 | first_pkt_timestamp_resolution = pkt_metadata.tsresol 78 | first_pkt_ordinal = count 79 | 80 | last_pkt_timestamp = (pkt_metadata.tshigh << 32) | pkt_metadata.tslow 81 | last_pkt_timestamp_resolution = pkt_metadata.tsresol 82 | last_pkt_ordinal = count 83 | # --- 84 | 85 | print('{} contains {} packets ({} interesting)'. 86 | format(file_name, count, interesting_packet_count)) 87 | 88 | print('First packet in connection: Packet #{} {}'. 89 | format(first_pkt_ordinal, 90 | printable_timestamp(first_pkt_timestamp, 91 | first_pkt_timestamp_resolution))) 92 | print(' Last packet in connection: Packet #{} {}'. 93 | format(last_pkt_ordinal, 94 | printable_timestamp(last_pkt_timestamp, 95 | last_pkt_timestamp_resolution))) 96 | def printable_timestamp(ts, resol): 97 | ts_sec = ts // resol 98 | ts_subsec = ts % resol 99 | ts_sec_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts_sec)) 100 | return '{}.{}'.format(ts_sec_str, ts_subsec) -------------------------------------------------------------------------------- /process_pcap.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a function called process_pcap(). 3 | This function reads a pcap file and prints some information about each packet in the file. 4 | 5 | The function first opens the pcap file and gets the number of packets in the file. 6 | It then iterates over the packets and prints the following information for each packet: 7 | 8 | The packet number 9 | The source IP address 10 | The destination IP address 11 | The source TCP port 12 | The destination TCP port 13 | Finally, the function prints the total number of packets in the file. 14 | 15 | 16 | The first line opens the pcap file and gets the number of packets in the file. 17 | The RawPcapReader() function takes the name of the pcap file as input and returns an iterator that yields tuples of (packet data, packet metadata). 18 | The count variable keeps track of the number of packets processed. 19 | 20 | The next line starts an iteration over the packets in the file. For each packet, the function first creates an Ether object from the packet data. 21 | The Ether class in the packet library represents an Ethernet packet. The function then creates an IP object and a TCP object from the Ether object. 22 | The IP class represents an IP packet and the TCP class represents a TCP packet. 
23 | 24 | The function then prints the following information for the packet: 25 | 26 | The packet number 27 | The source IP address 28 | The destination IP address 29 | The source TCP port 30 | The destination TCP port 31 | Finally, the function prints the total number of packets in the file. 32 | 33 | Here is a breakdown of the code: 34 | 35 | """ 36 | def process_pcap(file_name): 37 | print('Opening {}...'.format(file_name)) 38 | count = 0 39 | for (pkt_data, pkt_metadata,) in RawPcapReader(file_name): 40 | count += 1 41 | ether_pkt = Ether(pkt_data) 42 | ip_pkt = ether_pkt[IP] 43 | tcp_pkt = ip_pkt[TCP] 44 | print('packet number is: {}'.format(count)) 45 | print(' IP src is: {}'.format(ip_pkt.src)) 46 | print(' IP dst is: {}'.format(str(ip_pkt.dst))) 47 | print(' TCP sport is: {}'.format(str(tcp_pkt.sport))) 48 | print(' TCP dport is: {}'.format(str(tcp_pkt.dport))) 49 | 50 | 51 | 52 | print('{} contains {} packets'.format(file_name, count)) 53 | -------------------------------------------------------------------------------- /read_pcap_files.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a function called read_pcap_files(). This function reads all the pcap files in a directory and creates a dictionary that maps the file name to the corresponding category. 3 | 4 | The function first gets the list of all the files in the directory. It then iterates over the files and checks if the file is a pcap file. If it is, the function adds the file name and its category to the dictionary. 5 | 6 | The function then calls the load_pcap_datatype() function to load the data from the pcap files. 7 | 8 | Finally, the function prints the dictionary and returns it. 9 | 10 | Here is a breakdown of the code: 11 | """ 12 | def read_pcap_files(): 13 | #file_list = [x for x in os.listdir('/home/mehdi') if x.endswith(".pcap")] 14 | #print(file_list) 15 | root_dir = '/media/mehdi/linux/data/CompletePCAPs' 16 | file_name_list_full_path = [] 17 | file_name_list = [] 18 | file_name_dict = {} 19 | for path in os.listdir(root_dir): 20 | full_path = os.path.join(root_dir, path) 21 | if os.path.isfile(full_path) and (path.endswith(".pcap") or path.endswith("pcapng")): 22 | print(full_path) 23 | file_name_list_full_path.append(full_path) 24 | file_name_list.append(path) 25 | 26 | # find category of ISCX VPN-NONVPN DATASET 27 | for i in range(len(file_name_list_full_path)): 28 | if "vpn" in file_name_list[i]: 29 | if "chat" in file_name_list[i]: 30 | file_name_dict[file_name_list_full_path[i]] = 1 #"vpn chat" 31 | elif "email" in file_name_list[i]: 32 | file_name_dict[file_name_list_full_path[i]] = 3 #"vpn email" 33 | elif ("facebook_audio" in file_name_list[i]) or ("hangouts_audio" in file_name_list[i])\ 34 | or ("skype_audio" in file_name_list[i]) or ("voip" in file_name_list[i]): 35 | file_name_dict[file_name_list_full_path[i]] = 5 #"vpn audio streaming" 36 | elif ("ftp" in file_name_list[i]) or ("file" in file_name_list[i]) \ 37 | or ("scp" in file_name_list[i]) or ("sftp" in file_name_list[i]): 38 | file_name_dict[file_name_list_full_path[i]] = 7 #"vpn ftp" 39 | elif ("vimeo" in file_name_list[i]) or ("youtube" in file_name_list[i]) or ("netflix" in file_name_list[i]) \ 40 | or ("hangouts_video" in file_name_list[i]) or ("facebook_video" in file_name_list[i]) or ("skype_video" in file_name_list[i]): 41 | file_name_dict[file_name_list_full_path[i]] = 9 #"vpn video streaming" 42 | elif "Torrent01" in file_name_list[i]: 43 | 
file_name_dict[file_name_list_full_path[i]] = 11 #"vpn p2p" 44 | elif "tor" in file_name_list[i]: 45 | file_name_dict[file_name_list_full_path[i]] = 13 # "vpn tor" 46 | else: 47 | pass 48 | else: 49 | if "chat" in file_name_list[i]: 50 | file_name_dict[file_name_list_full_path[i]] = 0 #"chat" 51 | elif "email" in file_name_list[i]: 52 | file_name_dict[file_name_list_full_path[i]] = 2 #"email" 53 | elif ("facebook_audio" in file_name_list[i]) or ("hangouts_audio" in file_name_list[i]) \ 54 | or ("skype_audio" in file_name_list[i]) or ("voip" in file_name_list[i]): 55 | file_name_dict[file_name_list_full_path[i]] = 4 #"audio streaming" 56 | elif ("ftp" in file_name_list[i]) or ("file" in file_name_list[i]) \ 57 | or ("scp" in file_name_list[i]) or ("sftp" in file_name_list[i]): 58 | file_name_dict[file_name_list_full_path[i]] = 6 #"ftp" 59 | elif ("vimeo" in file_name_list[i]) or ("youtube" in file_name_list[i]) or ("netflix" in file_name_list[i]) \ 60 | or ("hangouts_video" in file_name_list[i]) or ("facebook_video" in file_name_list[i]) or ("skype_video" in file_name_list[i]): 61 | file_name_dict[file_name_list_full_path[i]] = 8 #"video streaming" 62 | elif "Torrent01" in file_name_list[i]: 63 | file_name_dict[file_name_list_full_path[i]] = 10 #"p2p" 64 | elif "tor" in file_name_list[i]: 65 | file_name_dict[file_name_list_full_path[i]] = 14 # "tor" 66 | else: 67 | pass 68 | print(file_name_dict) 69 | exctracted_root_dir = load_pcap_datatype(file_name_dict) 70 | 71 | return exctracted_root_dir 72 | -------------------------------------------------------------------------------- /transform_pcap_to_dataframe.py: -------------------------------------------------------------------------------- 1 | """ 2 | The code you have provided is a function called transform_pcap_to_dataframe(). This function transforms a set of packets into a Pandas DataFrame. 3 | 4 | The function first defines a list of fields for the IP and TCP layers. 5 | 6 | The function then creates a blank DataFrame with the specified fields. 7 | 8 | The function then iterates over the packets. For each packet, the function reads the values of the IP and TCP fields and appends them to a row in the DataFrame. 9 | 10 | The function finally saves the DataFrame to a CSV file. 11 | 12 | The function first defines the list of fields for the IP and TCP layers. This list is used to create the DataFrame. 13 | 14 | The function then creates a blank DataFrame with the specified fields. 15 | 16 | The function then iterates over the packets. For each packet, the function reads the values of the IP and TCP fields and appends them to a row in the DataFrame. 17 | 18 | The function finally saves the DataFrame to a CSV file. 
19 | """ 20 | 21 | 22 | def transform_pcap_to_dataframe(packets): 23 | 24 | # Store the pre-defined field names of the IP, TCP and UDP layers 25 | f_ip = [field.name for field in IP().fields_desc] 26 | f_tcp = [field.name for field in TCP().fields_desc] 27 | f_udp = [field.name for field in UDP().fields_desc] 28 | print(f_ip) # field names of the IP layer 29 | print(f_tcp) # field names of the TCP layer 30 | print(f_udp) # field names of the UDP layer 31 | f_all = f_ip + ['time'] + f_tcp + ['payload'] 32 | # Blank DataFrame 33 | df_field = pd.DataFrame(columns=f_all) 34 | # store data for each row of the DataFrame 35 | for pkt in packets: 36 | field_values = [] 37 | # Read values of IP fields 38 | if pkt.haslayer(TCP) and pkt.haslayer(IP): 39 | for field in f_ip: 40 | try: 41 | if field == 'options': 42 | # we only store the number of options defined in the IP header 43 | field_values.append(len(pkt[IP].fields[field])) 44 | else: 45 | field_values.append(pkt[IP].fields[field]) 46 | except Exception: 47 | # the field value may not exist 48 | field_values.append(None) 49 | 50 | # Read the packet timestamp 51 | field_values.append(pkt.time) # was packet.time, which is undefined in this scope 52 | # Read values of TCP fields 53 | layer_type = type(pkt[IP].payload) 54 | for field in f_tcp: 55 | try: 56 | 57 | if field == 'options': 58 | field_values.append(len(pkt[layer_type].fields[field])) 59 | else: 60 | field_values.append(pkt[layer_type].fields[field]) 61 | except Exception: 62 | # the field value may not exist 63 | field_values.append(None) 64 | # Read the payload length 65 | field_values.append(len(pkt[layer_type].payload)) 66 | # Fill the data of one row 67 | df_append = pd.DataFrame([field_values], columns=f_all) 68 | # Append the row to the DataFrame 69 | df_field = pd.concat([df_field, df_append], axis=0) 70 | df_field.to_csv('packet1.csv') 71 | """ 72 | src_addr = df_field.groupby("src")['payload'].sum() # show the sum of payload for each src ip 73 | src_addr.plot(kind='barh', figsize=(8, 2)) # plot figure 74 | plt.show() 75 | """ 76 | plt.hist(df_field['payload'],bins = 20) 77 | plt.show() 78 | return 79 | -------------------------------------------------------------------------------- /var_function_initializer.py: -------------------------------------------------------------------------------- 1 | # Parameter Initialization 2 | def var_function_initializer(): 3 | size_list = [4,1,5,1] 4 | stride_list = [3,1,1,1] 5 | parameters_dict = {} 6 | parameters_dict['BATCH_SIZE'] = 64 7 | parameters_dict['EPOCH'] = 10 8 | parameters_dict['VERBOSE'] = 1 9 | parameters_dict['VALIDATION_SPLIT'] = 0.16 10 | parameters_dict['NUM_CLASSES'] = 3 11 | parameters_dict['OPTIMIZER'] = 'Adam' 12 | parameters_dict['LOSS_FUNCTION'] = 'categorical_crossentropy' 13 | parameters_dict['METRICS'] = ['accuracy',recall_m,precision_m,f1_m] # recall_m, precision_m and f1_m come from metric-evaluation.py and must be in scope 14 | parameters_dict['DROPOUT'] = 0.12 15 | parameters_dict['KERNEL_SIZE'] = [] 16 | parameters_dict['FILTERS'] = 2 17 | parameters_dict['STRIDES'] = [] 18 | parameters_dict['PADDING'] = 'same' 19 | parameters_dict['POOL_SIZE'] = (2,1) 20 | parameters_dict['POOL_STRIDE'] = (2,1) 21 | parameters_dict['HIDEN_ACTIVATION_FUNCTION'] = 'relu' 22 | parameters_dict['OUTPUT_ACTIVATION_FUNCTION'] = 'relu' 23 | parameters_dict['INPUT_SHAPE'] = (1500,1) 24 | parameters_dict['CNN_LAYER_SPEC'] = (2,200,200) 25 | parameters_dict['DENSE_LAYER'] = (2,300,200) 26 | parameters_dict['DENSE_LAYER_ACTIVATION_FUNCTION'] = ('relu','relu') 27 | parameters_dict['SOFTMAX_LAYER'] = 3 28 | parameters_dict['SOFTMAX_LAYER_ACTIVATION_FUNCTION'] = 'softmax' 29 | for i in range(parameters_dict['FILTERS']): 30 | 
parameters_dict['KERNEL_SIZE'].append(size_list[2*i]) 31 | parameters_dict['KERNEL_SIZE'].append(size_list[2*i+1]) 32 | parameters_dict['STRIDES'].append(stride_list[2*i]) 33 | parameters_dict['STRIDES'].append(stride_list[2*i+1]) 34 | 35 | 36 | 37 | 38 | return parameters_dict --------------------------------------------------------------------------------
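As a closing usage illustration (not part of the repository), the sketch below shows how a parameter dictionary such as the one returned by var_function_initializer() could be consumed when building and compiling a small Keras model. The filter count and pooling size are placeholder values, and it assumes recall_m, precision_m and f1_m from metric-evaluation.py are in scope where the dictionary is built; this is a minimal sketch, not the repository's actual model builder.

import tensorflow as tf

def build_demo_cnn(params):
    # Hypothetical demo model: values not read from params are arbitrary placeholders.
    model = tf.keras.Sequential([
        tf.keras.layers.Conv1D(filters=32,                              # placeholder filter count
                               kernel_size=params['KERNEL_SIZE'][0],    # 4
                               strides=params['STRIDES'][0],            # 3
                               padding=params['PADDING'],               # 'same'
                               activation=params['HIDEN_ACTIVATION_FUNCTION'],
                               input_shape=params['INPUT_SHAPE']),      # (1500, 1) packet vector
        tf.keras.layers.MaxPooling1D(pool_size=2),                      # placeholder pool size
        tf.keras.layers.Dropout(params['DROPOUT']),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(params['DENSE_LAYER'][1],
                              activation=params['DENSE_LAYER_ACTIVATION_FUNCTION'][0]),
        tf.keras.layers.Dense(params['SOFTMAX_LAYER'],
                              activation=params['SOFTMAX_LAYER_ACTIVATION_FUNCTION']),
    ])
    model.compile(optimizer=params['OPTIMIZER'],
                  loss=params['LOSS_FUNCTION'],
                  metrics=params['METRICS'])  # ['accuracy', recall_m, precision_m, f1_m]
    return model

# Example: model = build_demo_cnn(var_function_initializer()); model.summary()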