├── .gitignore
├── ProblemStatement.pdf
├── FinalsPresentation.pptx
├── Feature_Preparation.py
├── Feature_Selection_Results.txt
├── Feature_Selection.py
├── List of Features.txt
├── Train_Model.py
├── README.md
├── botnetdetect.py
└── Feature_Extraction.py


/.gitignore:
--------------------------------------------------------------------------------
1 | Feature_Selection_Data.zip


--------------------------------------------------------------------------------
/ProblemStatement.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ramneet-Singh/BotNet-Detection-ML/HEAD/ProblemStatement.pdf


--------------------------------------------------------------------------------
/FinalsPresentation.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ramneet-Singh/BotNet-Detection-ML/HEAD/FinalsPresentation.pptx


--------------------------------------------------------------------------------
/Feature_Preparation.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | 
 4 | # 0,1,3,8,11,14,15,16,18,19
 5 | df = pd.read_csv("Data_Features.csv").drop(
 6 |     columns=["Unnamed: 0", "Unnamed: 0.1"])
 7 | data_test = df.iloc[:250000, np.asarray(
 8 |     [0, 1, 2, 3, 5, 10, 13, 16, 17, 18, 20, 21, -1])]
 9 | data_test.to_csv("Selected_Features_Test.csv", index=False)
10 | print(data_test.head)
11 | data_train = df.iloc[250000:, np.asarray(
12 |     [0, 1, 2, 3, 5, 10, 13, 16, 17, 18, 20, 21, -1])]
13 | data_train.to_csv("Selected_Features_Train.csv", index=False)
14 | 


--------------------------------------------------------------------------------
/Feature_Selection_Results.txt:
--------------------------------------------------------------------------------
 1 | Type	m	features
 2 | FPR	5k	0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25
 3 | FPR	10k	0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 25
 4 | FPR	100k	0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25
 5 | FPR	1M	0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 25
 6 | 
 7 | RFE	5k	0  1  3  9 10 13 14 15 16 17 18 19
 8 | RFE	10k	0  1  2  3  8 10 14 15 16 17 18 19
 9 | RFE	100k	0  1  2  3 10 13 14 15 16 17 18 19
10 | RFE	1M	0  1  2  3  8 10 13 14 16 17 18 19
11 | 
12 | KBest	100k	0  1  6  8  9 11 14 15 16 18 19 20
13 | KBest	1M	0  1  6  8  9 11 14 15 16 18 19 20
14 | 
15 | Common : 0, 1, 3, 8, 11, 14, 15, 16, 18, 19


--------------------------------------------------------------------------------
/Feature_Selection.py:
--------------------------------------------------------------------------------
 1 | from sklearn.feature_selection import RFE, SelectKBest, chi2, SelectFpr
 2 | from sklearn.ensemble import RandomForestClassifier
 3 | import pandas as pd
 4 | import numpy as np
 5 | 
 6 | df = pd.read_csv('Train_CV_Data.csv')
 7 | X_train = np.asarray(df.loc[:2000000, 'srcPort':'HTTPM4'])
 8 | Y_train = np.asarray(df.loc[:2000000, 'malicious'], dtype=np.int32)
 9 | print(np.sum(Y_train == 1))
10 | 
11 | kBest = SelectKBest(chi2, k=12)
12 | kBest.fit(X_train, Y_train)
13 | mask1 = kBest.get_support(indices=True)
14 | 
15 | fpr = SelectFpr(chi2, alpha=0.0001)
16 | fpr.fit(X_train, Y_train)
17 | mask2 = fpr.get_support(indices=True)
18 | 
19 | rf = RandomForestClassifier(n_estimators=50)
20 | 
21 | rfe = RFE(rf, n_features_to_select=12, step=1)
22 | rfe.fit(X_train, Y_train)
23 | mask3 = rfe.get_support(indices=True)
24 | 
25 | print('K-Best Feat :', mask1)
26 | print('False Positive based :', mask2)
27 | print('RFE based :', mask3)
28 | 


--------------------------------------------------------------------------------
/List of Features.txt:
--------------------------------------------------------------------------------
 1 | 1. Source IP (Source = Originating IP)
 2 | 2. Destination IP
 3 | 3. Source Port
 4 | 4. Destination Port
 5 | 5. Protocol (TCP,UDP,ICMP)
 6 | 6. PX (total number of packets exchanged) *IMP
 7 | 7. NNP (number of null packets (0 length payload) exchanged)
 8 | 8. NSP (number of small packets (length 63-400) exchanged)
 9 | 9. PSP (percentage of small packets exchanged)
10 | 10. PNP (percentage of null packets exchanged)
11 | 11. IOPR (ratio between the number of incoming packets over the number of outgoing packets)
12 | 12. FPS (length of the first packets)
13 | 13. TBT (total number of bytes)
14 | 14. APL (average payload packet length)
15 | 15. PV (standard deviation of payload packet length)
16 | 16. BS (average bits-per-second)
17 | 17. AIT (average inter-arrival time of packets)
18 | 18. PPS (average-packets-per-second)
19 | 19. HTTP Method (GET,POST,PUT,DELETE,Null)
20 | 	19-1. HTTPM0
21 | 	19-2. HTTPM1
22 | 	19-3. HTTPM2
23 | 	19-4. HTTPM3
24 | 	19-5. HTTPM4
25 | 


--------------------------------------------------------------------------------
/Train_Model.py:
--------------------------------------------------------------------------------
 1 | import lightgbm as lg
 2 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
 3 | from sklearn.model_selection import train_test_split
 4 | import pandas as pd
 5 | import numpy as np
 6 | 
 7 | df = pd.read_csv('Selected_Features_Train.csv', chunksize=15000)
 8 | clf = None
 9 | params = {'boosting': 'gbdt',
10 |           'objective': 'binary',
11 |           'learning_rate': 0.01,
12 |           'num_leaves': 31,
13 |           'is_unbalance': True,
14 |           'verbosity': 100,
15 |           'bagging_freq': 5,
16 |           'bagging_fraction': 0.8}
17 | 
18 | for data in df:
19 |     X = np.asarray(data.iloc[:, 2:-1])
20 |     Y = np.asarray(data.iloc[:, -1])
21 |     X_train, X_cv, Y_train, Y_cv = train_test_split(
22 |         X, Y, test_size=0.333, random_state=1)
23 |     clf = lg.train(params=params, train_set=lg.Dataset(X_train, Y_train), num_boost_round=2,
24 |                    init_model=clf, valid_sets=lg.Dataset(X_cv, Y_cv), keep_training_booster=True)
25 | 
26 | del df
27 | 
28 | df = pd.read_csv('Selected_Features_Test.csv')
29 | X_test = np.asarray(df.iloc[:, 2:-1])
30 | Y_test = np.asarray(df.iloc[:, -1])
31 | pred_prob = clf.predict(X_test)
32 | 
33 | pred = pred_prob >= 0.5
34 | acc = accuracy_score(Y_test, pred)
35 | prec = precision_score(Y_test, pred)
36 | rec = recall_score(Y_test, pred)
37 | f1 = f1_score(Y_test, pred)
38 | 
39 | print('Accuracy = ', acc)
40 | print('Precision = ', prec)
41 | print('Recall = ', rec)
42 | print('F1 = ', f1)
43 | 
44 | clf.save_model('Model_LightGBM.txt')
45 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Machine Learning based P2P Bot-Net Detection through Network Flow Analysis
 2 | 
 3 | This repository contains Team SaRaNi's (with me as **Team Leader**) submission to the **HCL Hack IITK 2020 Cybersecurity Hackathon**. We won the **Third Prize** globally for developing this tool. Note that the actual submission contained some preprocessed data as well, which is way too large to upload. We have included our **Finals Presentation**, please check it out at [Presentation](https://drive.google.com/file/d/1Y2dgCFZ0uKDx9qqxJ4AYFhegfl_lPpSA/view?usp=sharing) for a summary of our contributions as well as future directions to improve this tool.
 4 | 
 5 | ## Index
 6 | - [**Motivation**](#motivation)
 7 | - [**Machine Learning Pipeline**](#machine-learning-pipeline)
 8 |     1. [**Feature Extraction**](#1-feature-extraction)
 9 |     2. [**Feature Selection**](#2-feature-selection)
10 |     3. [**Model Building**](#3-model-building)
11 |     4. [**Model Testing**](#4-model-testing)
12 | - [**Unique Contributions**](#unique-contributions)
13 | - [**Execution Instructions**](#execution-instructions)
14 | 
15 | ## Motivation
16 | 
 17 | A bot-net is a network of infected hosts (bots) that works independently under the control of a Botmaster (Bot herder), who issues commands to the bots through command and control (C&C) servers. Traditionally, bot-nets used a centralized client-server architecture, which had a single point of failure, but with the advent of peer-to-peer technology that problem seems to have been resolved. Taking advantage of the decentralized nature of the P2P architecture, botmasters started using P2P-based communication mechanisms. P2P bot-nets are **highly resilient** against detection even after some bots are identified or taken down. P2P bot-nets provide central frameworks for different cyber-crimes, which include DDoS (Distributed Denial of Service), email spam, phishing, password sniffing, etc.  
18 | 
 19 | The objective was to develop a tool for **identifying P2P bot-nets using network traffic analysis**. We also detect the hosts involved in P2P traffic, and the detected hosts are then analyzed further to detect bot-nets. We formulated the underlying problem as a **Classification** problem: the input is a Flow, identified by the 5-Tuple (srcAddr, sPort, dstAddr, dPort, Protocol), and the output is a label classifying the flow as malicious or benign. The overall tool took as input a .pcap file capturing the traffic over a network, parsed the file to identify flows, and then used our trained model to classify each flow on-the-fly as malicious/benign. Below we give a brief overview of our machine learning pipeline.
20 | 
21 | ## Machine Learning Pipeline
22 | 
23 | ### 1. Feature Extraction
24 | 
25 | - We manually tested and examined the data, for getting a comprehensive understanding of the dataset provided.
26 | - We consulted multiple research papers to get an understanding of the different methods that we could use to approach the problem and create an exhaustive set of features.
27 | - Raw data files were parsed and the previously decided features were extracted.
28 | - We had a dataset with 2.57 Million examples in total, which we then split into Train, Validation and Test Sets (the sizes are mentioned in subsequent steps).
29 | 
30 | 
31 | ### 2. Feature Selection
32 | 
33 | - We ran 2 Feature Selection algorithms on the train set, namely **Select-K-Best** and **Recursive Feature Elimination (RFE)**.
34 | - Through the results from these two, we selected the 10 best features out of 23 initial features that we had identified.
35 | 
36 | ### 3. Model Building
37 | 
38 | - We used a Gradient Boosting Decision Tree framework called LightGBM, which is efficient and capable of handling large-scale data.
39 | - We tuned the Hyperparameters like max_depth to make our model robust and prevent overfitting.
40 | - The Cross-Validation Set Size was 33.33% of our training set.
41 | - Cross-Validation Accuracy was used as the evaluation metric.
42 | - Batch-learning was used with a Batch-size of 10000.
43 | 
44 | ### 4. Model Testing
45 | 
46 | - We tested the model with 10% of our total data.
47 | - Some of the results that we obtained are:  
48 | 
49 | Accuracy | Precision | Recall | F1 Score
50 | ---------|-----------|--------|---------
51 | 99.90%   | 99.93%    | 99.95% | 99.94%
52 | 	
53 | ## Unique Contributions
54 | 
55 | - All the research papers we consulted used a subset of possible features for classification. We combined all of them to create an **exhaustive** feature set, and then selected the best out of them.
56 | - Most papers only used direct flow-based features. However, we also **hand-engineered certain statistical features**, which we intuitively felt could be useful for classification after manual inspection of sample files.
57 | - The majority of previous approaches were aimed at detecting General Botnets. We focused on **P2P Botnets only**, and did not consider features pertaining to IRC Botnets. 
58 | - Past work that we saw either used basic ML Algorithms like Naive Bayes and Random Forest or computation intensive methods like Neural Networks. We used the **Fast and Sophisticated LightGBM** model, based on Gradient Boosted Decision Trees. This not only **reduced the Training Time**, but also **increased Accuracy**.
59 | 
60 | ## Execution Instructions
61 | 
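 62 | 1. Packages required :
 63 | 	1. numpy
 64 | 	2. sklearn (installed from PyPI as `scikit-learn`)
 65 | 	3. pandas
 66 | 	4. lightgbm
 67 | 	5. os (Python standard library, no installation needed)
 68 | 	6. sys (Python standard library, no installation needed)
 69 | 	7. csv (Python standard library, no installation needed)
 70 | 	8. scapy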
71 | 
72 | 2. How to install packages :  
73 | 	From Terminal : ```$ pip install <package name>```
74 | 
75 | 3. This folder contains a python program "botnetdetect.py". This program takes as command line input (path to) a .pcap file and outputs in the format  
76 | 
77 | Flow= (srcAddr, sPort, dstAddr, dPort, Protocol) |	Prediction
78 | ------------------------------------------------|------------------
79 |   \<Flow-5-Tuple\>				|	malicious/benign
80 | 


--------------------------------------------------------------------------------
/botnetdetect.py:
--------------------------------------------------------------------------------
  1 | import csv
  2 | import os
  3 | import sys
  4 | from scapy.all import *
  5 | from scapy.layers.http import *
  6 | import lightgbm as lg
  7 | import numpy as np
  8 | 
  9 | model = lg.Booster(model_file=os.path.join(os.getcwd(), "Model_LightGBM.txt"))
 10 | 
 11 | 
 12 | def create_features(srcAddr, dstAddr, srcPort, dstPort, fps=0, byte_size=0, payl=0, dur=0, incoming=False):
 13 |     features = {
 14 |         'srcAddr': srcAddr,
 15 |         'dstAddr': dstAddr
 16 |     }
 17 |     features['srcPort'] = srcPort
 18 |     features['dstPort'] = dstPort
 19 |     features['PX'] = 1
 20 |     if 63 <= byte_size and byte_size <= 400:
 21 |         features['NSP'] = 1
 22 |     else:
 23 |         features['NSP'] = 0
 24 |     if(features['PX'] > 0):
 25 |         features['PSP'] = (features['NSP']/features['PX'])*100
 26 |     else:
 27 |         features['PSP'] = 0
 28 |     if not incoming:
 29 |         features['in'] = 0
 30 |     else:
 31 |         features['in'] = 1
 32 |     features['dur'] = dur
 33 |     features['FPS'] = fps
 34 |     features['TBT'] = byte_size
 35 |     features['APL'] = payl
 36 |     if(features['dur'] > 0):
 37 |         features['BS'] = features['TBT']/features['dur']
 38 |         features['PPS'] = features['PX']/features['dur']
 39 |     else:
 40 |         features['BS'] = 0
 41 |         features['PPS'] = 0
 42 | 
 43 |     return features
 44 | 
 45 | 
 46 | def update_features(features, nsp=False, incoming=False, byte_size=0, payl=0, dur=0):
 47 | 
 48 |     features['PX'] = features['PX'] + 1
 49 |     if nsp:
 50 |         features['NSP'] = features['NSP'] + 1
 51 |         features['PSP'] = (features['NSP']/features['PX'])*100
 52 |     if incoming:
 53 |         features['in'] = features['in'] + 1
 54 |     features['TBT'] = features['TBT'] + byte_size
 55 |     av = features['APL']
 56 |     n = features['PX'] - 1
 57 |     if(n+1 > 0):
 58 |         features['APL'] = ((av * n) + payl)/(n + 1)
 59 |     features['dur'] = features['dur'] + 1
 60 |     if(features['dur'] > 0):
 61 |         features['BS'] = features['TBT']/features['dur']
 62 |         features['PPS'] = features['PX']/features['dur']
 63 | 
 64 |     return features
 65 | 
 66 | # srcAddr         dstAddr  srcPort  dstPort           PPX    PSP  in  FPS  TBT    APL          BS       PPS
 67 | 
 68 | 
 69 | def flows_from_pcap(filePath):
 70 |     flows = {}
 71 |     fpcap = PcapReader(filePath)
 72 |     f_dup = PcapReader(filePath)
 73 |     pkt_nxt = next(f_dup)
 74 |     c = 0
 75 |     num = 0
 76 |     for pkt in fpcap:
 77 |         num = num+1
 78 |         dur = 0
 79 |         try:
 80 |             pkt_nxt = next(f_dup)
 81 |             dur = pkt_nxt.time - pkt.time
 82 |         except:
 83 |             dur = 0.0001
 84 | 
 85 |         srcAddr, dstAddr, sport, dport, proto = '', '', 0, 0, 3
 86 |         pload = 0
 87 |         tcp_close = False
 88 |         try:
 89 |             bs = pkt.len + 14
 90 |         except:
 91 |             continue
 92 | 
 93 |         if 'Ethernet' in pkt or 'cooked linux' in pkt:
 94 |             flag = False
 95 |             if('Ethernet' in pkt):
 96 |                 eth = pkt['Ethernet']
 97 |                 if eth.type == 2048:
 98 |                     flag = True
 99 |             else:
100 |                 lin = pkt['cooked linux']
101 |                 flag = lin.proto == 2048
102 |             if flag is True:
103 |                 ip = pkt['IP']
104 |                 srcAddr = ip.src
105 |                 dstAddr = ip.dst
106 |                 proto = ip.proto
107 | 
108 |                 if proto == 17 and pkt.haslayer('UDP'):
109 |                     sport = pkt['UDP'].sport
110 |                     dport = pkt['UDP'].dport
111 |                     pload = len(pkt['UDP'].payload)
112 |                 elif proto == 6 and pkt.haslayer('TCP'):
113 |                     sport = pkt['TCP'].sport
114 |                     dport = pkt['TCP'].dport
115 |                     pload = len(pkt['TCP'].payload)
116 |                     tcp_close = ('F' in pkt['TCP'].flags) or (
117 |                         'R' in pkt['TCP'].flags)
118 |                 elif proto == 1 and pkt.haslayer('ICMP'):
119 |                     if pkt['ICMP'].haslayer('IP') and pkt['ICMP']['IP'].proto == 17 and pkt['ICMP'].haslayer('UDP'):
120 |                         sport = pkt['ICMP']['UDP'].sport
121 |                         dport = pkt['ICMP']['UDP'].dport
122 |                         pload = len(pkt['ICMP']['UDP'].payload)
123 |                     elif pkt['ICMP'].haslayer('IP') and pkt['ICMP']['IP'].proto == 6 and pkt['ICMP'].haslayer('TCP'):
124 |                         sport = pkt['ICMP']['TCP'].sport
125 |                         dport = pkt['ICMP']['TCP'].dport
126 |                         pload = len(pkt['ICMP']['TCP'].payload)
127 |                         tcp_close = ('F' in pkt['ICMP']['TCP'].flags) or (
128 |                             'R' in pkt['ICMP']['TCP'].flags)
129 |                     else:
130 |                         continue
131 |                 else:
132 |                     continue
133 |             else:
134 |                 continue
135 |         nsp = 63 <= bs and bs <= 400
136 |         tuple5 = (srcAddr, sport, dstAddr, dport, proto, True)
137 |         tuple5_inv = (dstAddr, dport, srcAddr, sport, proto, True)
138 |         if tuple5 in flows:
139 |             features = flows[tuple5]
140 |             flows[tuple5] = update_features(
141 |                 features, nsp, incoming=False, byte_size=bs, payl=pload, dur=dur)
142 |             if tcp_close:
143 |                 temp = flows[tuple5]
144 |                 del flows[tuple5]
145 |                 tuple5 = (srcAddr, sport, dstAddr, dport, proto, False)
146 |                 flows[tuple5] = temp
147 |         elif tuple5_inv in flows:
148 |             features = flows[tuple5_inv]
149 |             flows[tuple5_inv] = update_features(
150 |                 features, nsp, incoming=True, byte_size=bs, payl=pload, dur=dur)
151 |             if tcp_close:
152 |                 temp = flows[tuple5_inv]
153 |                 del flows[tuple5_inv]
154 |                 tuple5_inv = (dstAddr, dport, srcAddr, sport, proto, False)
155 |                 flows[tuple5_inv] = temp
156 |         else:
157 |             flows[tuple5] = create_features(srcAddr, dstAddr, sport, dport, fps=pload, byte_size=bs, payl=pload,
158 |                                             dur=dur, incoming=False)            # features_list
159 | 
160 |     for flow in flows:
161 |         if(num > 0):
162 |             flows[flow]['PPX'] = flows[flow]['PX']/num
163 |         else:
164 |             flows[flow]['PPX'] = 0
165 |         if('PX' in flows[flow]):
166 |             del flows[flow]['PX']
167 |         if('NSP' in flows[flow]):
168 |             del flows[flow]['NSP']
169 |         if('dur' in flows[flow]):
170 |             del flows[flow]['dur']
171 |     return flows
172 | 
173 | 
# Names of the per-flow fields, in column order: the two addresses followed by
# the ten values fed to the model.
field_names = [
    'srcAddr', 'dstAddr',
    'srcPort', 'dstPort',
    'PPX', 'PSP', 'in', 'FPS',
    'TBT', 'APL', 'BS', 'PPS',
]
188 | 
if __name__ == "__main__":
    # Usage: python botnetdetect.py <path-to-pcap-file>
    # Writes one "<flow 5-tuple>\t<malicious|benign>" line per flow to
    # results.txt in the current working directory.
    if len(sys.argv) < 2:
        sys.exit("Usage: python botnetdetect.py <path-to-pcap-file>")
    filePath = str(sys.argv[1])

    csv_delimiter = "\t"
    columns = ["Flow=(srcAddr,srcPort,dstAddr,dstPort,proto)", "Prediction"]
    # Order of the values handed to the model.
    model_inputs = ('srcPort', 'dstPort', 'PPX', 'PSP', 'in',
                    'FPS', 'TBT', 'APL', 'BS', 'PPS')

    # Mode 'x' refuses to overwrite an existing results.txt (FileExistsError).
    # The with-block guarantees the file is flushed and closed even when
    # parsing or prediction fails.
    with open(os.path.join(os.getcwd(), "results.txt"), 'x') as outfile:
        outfile.write(csv_delimiter.join(columns) + "\n")

        # Prepare Dictionary of Flow Features from filePath
        features_dict = flows_from_pcap(filePath)
        for flow, flow_features in features_dict.items():
            # Convert the rates to plain floats before building the array.
            for rate in ('BS', 'PPS'):
                if rate in flow_features:
                    flow_features[rate] = float(flow_features[rate])

            feature_vector = np.asarray(
                [flow_features[name] for name in model_inputs])
            prediction = model.predict(feature_vector[np.newaxis, ...])
            malicious = "malicious" if prediction[0] >= 0.5 else "benign"

            # Only the 5-tuple is reported; the open/closed marker is dropped.
            flow_text = "(" + ",".join(str(part) for part in flow[:5]) + ")"
            outfile.write(flow_text + "\t" + malicious + "\n")
221 | 


--------------------------------------------------------------------------------
/Feature_Extraction.py:
--------------------------------------------------------------------------------
  1 | from scapy.all import *
  2 | from scapy.layers.http import *
  3 | import os
  4 | import csv
  5 | 
  6 | 
  7 | def create_features(label, srcAddr, dstAddr, srcPort, dstPort, proto=4, fps=0, byte_size=0, payl=0, time=0, dur=0, incoming=False, http=4):
  8 |     features = {
  9 |         'malicious': label,
 10 |         'srcAddr': srcAddr,
 11 |         'dstAddr': dstAddr
 12 |     }
 13 |     features['srcPort'] = srcPort
 14 |     features['dstPort'] = dstPort
 15 |     if proto == 2:
 16 |         features['proto'] = 2
 17 |     elif proto == 6:
 18 |         features['proto'] = 0
 19 |     elif proto == 17:
 20 |         features['proto'] = 1
 21 |     else:
 22 |         features['proto'] = 3
 23 |     features['PX'] = 1
 24 |     if byte_size <= 62:
 25 |         features['NNP'] = 1
 26 |     else:
 27 |         features['NNP'] = 0
 28 |     if 63 <= byte_size and byte_size <= 400:
 29 |         features['NSP'] = 1
 30 |     else:
 31 |         features['NSP'] = 0
 32 |     if(features['PX'] > 0):
 33 |         features['PSP'] = (features['NSP']/features['PX'])*100
 34 |         features['PNP'] = (features['NNP']/features['PX'])*100
 35 |     else:
 36 |         features['PSP'] = 0
 37 |         features['PNP'] = 0
 38 |     if not incoming:
 39 |         features['out'] = 1
 40 |         features['in'] = 0
 41 |     else:
 42 |         features['in'] = 1
 43 |         features['out'] = 0
 44 |     if features['out'] > 0:
 45 |         features['IOPR'] = features['in']/features['out']
 46 |     else:
 47 |         features['IOPR'] = 0
 48 |     features['dur'] = dur
 49 |     features['FPS'] = fps
 50 |     features['TBT'] = byte_size
 51 |     features['APL'] = payl
 52 |     features['PV'] = 0.0
 53 |     if(features['dur'] > 0):
 54 |         features['BS'] = features['TBT']/features['dur']
 55 |         features['PPS'] = features['PX']/features['dur']
 56 |     else:
 57 |         features['BS'] = 0
 58 |         features['PPS'] = 0
 59 |     features['AIT'] = 0
 60 |     features['HTTPM'] = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}
 61 |     features['HTTPM'][http] = 1
 62 |     features['time'] = time
 63 | 
 64 |     return features
 65 | 
 66 | 
 67 | def update_features(features, nnp=False, nsp=False, incoming=False, byte_size=0, payl=0, time=0, dur=0, http=4):
 68 | 
 69 |     features['PX'] = features['PX'] + 1
 70 |     if nnp:
 71 |         features['NNP'] = features['NNP'] + 1
 72 |         features['PNP'] = (features['NNP']/features['PX'])*100
 73 |     if nsp:
 74 |         features['NSP'] = features['NSP'] + 1
 75 |         features['PSP'] = (features['NSP']/features['PX'])*100
 76 |     if incoming:
 77 |         features['in'] = features['in'] + 1
 78 |     else:
 79 |         features['out'] = features['out'] + 1
 80 |     if(features['out'] > 0):
 81 |         features['IOPR'] = features['in'] / features['out']
 82 |     features['TBT'] = features['TBT'] + byte_size
 83 |     av = features['APL']
 84 |     sd = features['PV']
 85 |     n = features['PX'] - 1
 86 |     if(n+1 > 0):
 87 |         features['APL'] = ((av * n) + payl)/(n + 1)
 88 |     if(n > 0):
 89 |         features['PV'] = ((((n-1)*sd**2) + (n * av**2) -
 90 |                            ((n + 1)*features['APL']**2) + payl**2)/n)**0.5
 91 |     features['dur'] = features['dur'] + 1
 92 |     if(features['dur'] > 0):
 93 |         features['BS'] = features['TBT']/features['dur']
 94 |         features['PPS'] = features['PX']/features['dur']
 95 |     del_t = time - features['time']
 96 |     features['time'] = time
 97 |     if(n+1 > 0):
 98 |         features['AIT'] = ((features['AIT'] * n) + del_t) / (n + 1)
 99 |     features['HTTPM'][http] = features['HTTPM'][http] + 1
100 | 
101 |     return features
102 | 
103 | 
# Numeric code per HTTP request method; any other method (or none) maps to 4.
_HTTP_METHOD_CODES = {b'GET': 0, b'POST': 1, b'PUT': 2, b'DELETE': 3}

# Capture folders whose packets are filtered by the benign / malicious host lists.
_BENIGN_FOLDERS = ("p2pbox1", "p2pbox2", "torrent")
_MALICIOUS_FOLDERS = ("storm", "vinchuca", "zeus")


def _http_method_code(layer):
    """Return the method code of the HTTP request under ``layer``, or 4."""
    if layer.haslayer('HTTPRequest'):
        return _HTTP_METHOD_CODES.get(layer['HTTPRequest'].Method, 4)
    return 4


def _tcp_is_closing(tcp):
    """Return True when the TCP header carries a FIN or RST flag."""
    return ('F' in tcp.flags) or ('R' in tcp.flags)


def _transport_fields(pkt, proto):
    """Extract transport-level fields from one IPv4 packet.

    Handles UDP (17) and TCP (6) directly, and ICMP (1) packets whose payload
    embeds an IP header followed by UDP or TCP.

    Returns:
        ``(sport, dport, payload_len, http_code, tcp_close)``, or ``None``
        when the packet matches none of these cases.
    """
    if proto == 17 and pkt.haslayer('UDP'):
        udp = pkt['UDP']
        return udp.sport, udp.dport, len(udp.payload), 4, False
    if proto == 6 and pkt.haslayer('TCP'):
        tcp = pkt['TCP']
        return (tcp.sport, tcp.dport, len(tcp.payload),
                _http_method_code(pkt), _tcp_is_closing(tcp))
    if proto == 1 and pkt.haslayer('ICMP'):
        icmp = pkt['ICMP']
        if icmp.haslayer('IP'):
            inner_proto = icmp['IP'].proto
            if inner_proto == 17 and icmp.haslayer('UDP'):
                udp = icmp['UDP']
                return udp.sport, udp.dport, len(udp.payload), 4, False
            if inner_proto == 6 and icmp.haslayer('TCP'):
                tcp = icmp['TCP']
                return (tcp.sport, tcp.dport, len(tcp.payload),
                        _http_method_code(icmp), _tcp_is_closing(tcp))
    return None


def flows_from_pcap(label, filePath):
    """Group the packets of a labelled capture into flows with features.

    A flow is keyed by ``(srcAddr, sport, dstAddr, dport, proto, is_open)``.
    Packets in the reverse direction of an open flow are folded into it as
    incoming packets. When an update packet carries FIN or RST the flow is
    re-keyed with ``is_open=False``, so later packets with the same 5-tuple
    start a new flow.

    When the capture sits in one of the known dataset folders (p2pbox1,
    p2pbox2, torrent, storm, vinchuca, zeus), IPv4 packets are kept only if
    their source or destination is in that folder's host list.

    Args:
        label: value stored as 'malicious' in every flow of this capture.
        filePath: path of the capture file; the name of its parent directory
            selects the host filter.

    Returns:
        Dict mapping flow key to its feature dict, with 'PPX' (share of all
        packets read) and 'PBT' (flow bytes over the summed packet lengths)
        added. An empty capture yields an empty dict.
    """
    # The folder, and therefore the host filter, is the same for every packet.
    folder = os.path.basename(os.path.dirname(filePath))
    if folder in _BENIGN_FOLDERS:
        known_hosts = benign_ip[folder]
    elif folder in _MALICIOUS_FOLDERS:
        known_hosts = malicious_ip[folder]
    else:
        known_hosts = None

    flows = {}
    num = 0
    total_bytes = 0

    # Two readers over the same file: f_dup stays one packet ahead of fpcap so
    # the gap to the following packet is known. Both are closed on exit.
    with PcapReader(filePath) as fpcap, PcapReader(filePath) as f_dup:
        next(f_dup, None)
        for pkt in fpcap:
            num += 1
            pkt_nxt = next(f_dup, None)
            # The last packet has no successor; use a small placeholder gap.
            dur = 0.0001 if pkt_nxt is None else pkt_nxt.time - pkt.time

            try:
                length = pkt.len
                bs = length + 14
                total_bytes += length
            except (AttributeError, TypeError):
                # No usable length field anywhere in the packet.
                continue

            srcAddr, dstAddr, sport, dport, proto = '', '', 0, 0, 3
            pload, http_meth = 0, 4
            tcp_close = False

            # NOTE(review): packets with neither link layer keep the defaults
            # above and are all pooled into the ('', 0, '', 0, 3) flow. This
            # is the original behaviour; confirm it is intended.
            if 'Ethernet' in pkt or 'cooked linux' in pkt:
                if 'Ethernet' in pkt:
                    is_ipv4 = pkt['Ethernet'].type == 2048
                else:
                    is_ipv4 = pkt['cooked linux'].proto == 2048
                # 2048 == 0x0800 (IPv4). Skip frames that announce IPv4 but
                # have no parsed IP layer rather than crash on the lookup.
                if not is_ipv4 or not pkt.haslayer('IP'):
                    continue

                ip = pkt['IP']
                proto = ip.proto
                srcAddr = ip.src
                dstAddr = ip.dst

                # Drop traffic that does not involve one of the folder's hosts.
                if (known_hosts is not None
                        and str(srcAddr) not in known_hosts
                        and str(dstAddr) not in known_hosts):
                    continue

                transport = _transport_fields(pkt, proto)
                if transport is None:
                    continue
                sport, dport, pload, http_meth, tcp_close = transport

            nnp = pload == 0
            nsp = 63 <= bs <= 400
            tuple5 = (srcAddr, sport, dstAddr, dport, proto, True)
            tuple5_inv = (dstAddr, dport, srcAddr, sport, proto, True)

            if tuple5 in flows:
                key, incoming = tuple5, False
            elif tuple5_inv in flows:
                key, incoming = tuple5_inv, True
            else:
                flows[tuple5] = create_features(
                    label, srcAddr, dstAddr, sport, dport, proto, fps=pload,
                    byte_size=bs, payl=pload, time=pkt.time, dur=dur,
                    incoming=False, http=http_meth)
                continue

            flows[key] = update_features(
                flows[key], nnp, nsp, incoming=incoming, byte_size=bs,
                payl=pload, time=pkt.time, dur=dur, http=http_meth)
            if tcp_close:
                # Re-key the flow as closed, keeping its original direction.
                flows[key[:5] + (False,)] = flows.pop(key)

    for features in flows.values():
        features['PPX'] = features['PX'] / num if num > 0 else 0
        features['PBT'] = features['TBT'] / total_bytes if total_bytes > 0 else 0
    return flows
249 | 
250 | 
# IP addresses listed for each benign traffic source, keyed by source name.
benign_ip = dict(
    p2pbox1=["192.168.1.2"],
    p2pbox2=["192.168.2.2"],
    torrent=["172.27.28.106"],
)
256 | 
# IP addresses listed for each botnet family, keyed by family name.
malicious_ip = dict(
    storm=[
        "66.154.80.101", "66.154.80.105", "66.154.80.111", "66.154.80.125",
        "66.154.83.107", "66.154.83.113", "66.154.83.138", "66.154.83.80",
        "66.154.87.39", "66.154.87.41", "66.154.87.57", "66.154.87.58",
        "66.154.87.61",
    ],
    vinchuca=["172.27.22.206"],
    zeus=["10.0.2.15"],
)
274 | 
# Column order of the output CSV (passed to csv.DictWriter as fieldnames).
field_names = (
    # Flow endpoints and protocol.
    ['srcAddr', 'dstAddr', 'srcPort', 'dstPort', 'proto']
    # Per-flow statistics.
    + ['PPX', 'PBT', 'PX', 'NNP', 'NSP', 'PSP', 'PNP', 'out', 'in',
       'IOPR', 'dur', 'FPS', 'TBT', 'APL', 'PV', 'BS', 'PPS', 'AIT']
    # One column per entry of the flow's 'HTTPM' sequence (indices 0-4).
    + ['HTTPM0', 'HTTPM1', 'HTTPM2', 'HTTPM3', 'HTTPM4']
    # Presumably the class label passed to flows_from_pcap -- TODO confirm.
    + ['malicious']
)
306 | 
# Feature values that are converted to plain floats before being written.
_FLOAT_COLUMNS = ('dur', 'BS', 'PPS', 'AIT')

# Files inside the Botnet directories that are not captures and are skipped.
_BOTNET_NON_CAPTURE_FILES = ("storm-IP", "vinchuca_IP", "zeus_IP")


def _flatten_flow(features):
    """Prepare one flow's feature dict for ``csv.DictWriter`` (in place).

    * Values under the keys in ``_FLOAT_COLUMNS`` are converted with
      ``float()`` when present.
    * The ``'HTTPM'`` entry, when present, is expanded into the five scalar
      columns ``HTTPM0`` .. ``HTTPM4`` (taken from indices 0-4) and removed.
    * The ``'time'`` entry is dropped because it is not an output column.

    Returns the same dict object for convenience.
    """
    for column in _FLOAT_COLUMNS:
        if column in features:
            features[column] = float(features[column])
    if 'HTTPM' in features:
        http_values = features.pop('HTTPM')
        for idx in range(5):
            features['HTTPM%d' % idx] = http_values[idx]
    # 'time' has no matching CSV column; DictWriter would reject it.
    features.pop('time', None)
    return features


def _write_pcap_flows(writer, label, pcap_path):
    """Extract the flows of one capture file and append them to the CSV.

    ``label`` is forwarded unchanged to ``flows_from_pcap`` (0 is used for
    the Benign directory, 1 for the Botnet directory).  Returns the number
    of flows written.
    """
    flow_features = flows_from_pcap(label, pcap_path)
    for features in flow_features.values():
        writer.writerow(_flatten_flow(features))
    return len(flow_features)


# Mode 'x' refuses to overwrite an existing results file.  newline='' is
# what the csv module requires so that it controls line endings itself
# (otherwise blank rows appear on platforms that use \r\n).
with open('Results_2.csv', 'x', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()

    i = 0  # running count of capture files processed so far

    # Launch Benign
    for root, dirs, files in os.walk(os.path.join('Botnet_Detection_Dataset', 'Benign')):
        for name in files:
            if name == "ip_details.txt":
                continue
            filePath = os.path.join(root, name)
            flow_count = _write_pcap_flows(writer, 0, filePath)
            i = i + 1
            print(i, "Files Processed:", filePath, flow_count)

    # Launch Botnet
    # NOTE(review): only the 'storm' sub-directory is walked even though the
    # skip list also names the vinchuca and zeus IP files -- confirm whether
    # the other botnet directories are meant to be processed here as well.
    for root, dirs, files in os.walk(os.path.join('Botnet_Detection_Dataset', 'Botnet', 'storm')):
        for name in files:
            if name in _BOTNET_NON_CAPTURE_FILES:
                continue
            filePath = os.path.join(root, name)
            _write_pcap_flows(writer, 1, filePath)
            i = i + 1
            print(i, "Files Processed:", filePath)
381 | 


--------------------------------------------------------------------------------