├── .gitignore ├── ProblemStatement.pdf ├── FinalsPresentation.pptx ├── Feature_Preparation.py ├── Feature_Selection_Results.txt ├── Feature_Selection.py ├── List of Features.txt ├── Train_Model.py ├── README.md ├── botnetdetect.py └── Feature_Extraction.py /.gitignore: -------------------------------------------------------------------------------- 1 | Feature_Selection_Data.zip -------------------------------------------------------------------------------- /ProblemStatement.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ramneet-Singh/BotNet-Detection-ML/HEAD/ProblemStatement.pdf -------------------------------------------------------------------------------- /FinalsPresentation.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ramneet-Singh/BotNet-Detection-ML/HEAD/FinalsPresentation.pptx -------------------------------------------------------------------------------- /Feature_Preparation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # 0,1,3,8,11,14,15,16,18,19 5 | df = pd.read_csv("Data_Features.csv").drop( 6 | columns=["Unnamed: 0", "Unnamed: 0.1"]) 7 | data_test = df.iloc[:250000, np.asarray( 8 | [0, 1, 2, 3, 5, 10, 13, 16, 17, 18, 20, 21, -1])] 9 | data_test.to_csv("Selected_Features_Test.csv", index=False) 10 | print(data_test.head) 11 | data_train = df.iloc[250000:, np.asarray( 12 | [0, 1, 2, 3, 5, 10, 13, 16, 17, 18, 20, 21, -1])] 13 | data_train.to_csv("Selected_Features_Train.csv", index=False) 14 | -------------------------------------------------------------------------------- /Feature_Selection_Results.txt: -------------------------------------------------------------------------------- 1 | Type m features 2 | FPR 5k 0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25 3 | FPR 10k 0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 25 4 | FPR 100k 0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25 5 | FPR 1M 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 25 6 | 7 | RFE 5k 0 1 3 9 10 13 14 15 16 17 18 19 8 | RFE 10k 0 1 2 3 8 10 14 15 16 17 18 19 9 | RFE 100k 0 1 2 3 10 13 14 15 16 17 18 19 10 | RFE 1M 0 1 2 3 8 10 13 14 16 17 18 19 11 | 12 | KBest 100k 0 1 6 8 9 11 14 15 16 18 19 20 13 | KBest 1M 0 1 6 8 9 11 14 15 16 18 19 20 14 | 15 | Common : 0, 1, 3, 8, 11, 14, 15, 16, 18, 19 -------------------------------------------------------------------------------- /Feature_Selection.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_selection import RFE, SelectKBest, chi2, SelectFpr 2 | from sklearn.ensemble import RandomForestClassifier 3 | import pandas as pd 4 | import numpy as np 5 | 6 | df = pd.read_csv('Train_CV_Data.csv') 7 | X_train = np.asarray(df.loc[:2000000, 'srcPort':'HTTPM4']) 8 | Y_train = np.asarray(df.loc[:2000000, 'malicious'], dtype=np.int32) 9 | print(np.sum(Y_train == 1)) 10 | 11 | kBest = SelectKBest(chi2, k=12) 12 | kBest.fit(X_train, Y_train) 13 | mask1 = kBest.get_support(indices=True) 14 | 15 | fpr = SelectFpr(chi2, alpha=0.0001) 16 | fpr.fit(X_train, Y_train) 17 | mask2 = fpr.get_support(indices=True) 18 | 19 | rf = RandomForestClassifier(n_estimators=50) 20 | 21 | rfe = RFE(rf, n_features_to_select=12, step=1) 22 | rfe.fit(X_train, Y_train) 23 | mask3 = rfe.get_support(indices=True) 24 | 25 | print('K-Best Feat :', mask1) 26 | print('False Positive based :', mask2) 27 | print('RFE based :', mask3) 28 | -------------------------------------------------------------------------------- /List of Features.txt: -------------------------------------------------------------------------------- 1 | 1. Source IP (Source = Originating IP) 2 | 2. Destination IP 3 | 3. Source Port 4 | 4. Destination Port 5 | 5. Protocol (TCP,UDP,ICMP) 6 | 6. PX (total number of packets exchanged) *IMP 7 | 7. NNP (number of null packets (0 length payload) exchanged) 8 | 8. NSP (number of small packets (length 63-400) exchanged) 9 | 9. PSP (percentage of small packets exchanged) 10 | 10. PNP (percentage of null packets exchanged) 11 | 11. IOPR (ratio between the number of incoming packets over the number of outgoing packets) 12 | 12. FPS (length of the first packets) 13 | 13. TBT (total number of bytes) 14 | 14. APL (average payload packet length) 15 | 15. PV (standard deviation of payload packet length) 16 | 16. BS (average bits-per-second) 17 | 17. AIT (average inter-arrival time of packets) 18 | 18. PPS (average-packets-per-second) 19 | 19. HTTP Method (GET,POST,PUT,DELETE,Null) 20 | 19-1. HTTPM0 21 | 19-2. HTTPM1 22 | 19-3. HTTPM2 23 | 19-4. HTTPM3 24 | 19-5. HTTPM4 25 | -------------------------------------------------------------------------------- /Train_Model.py: -------------------------------------------------------------------------------- 1 | import lightgbm as lg 2 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 3 | from sklearn.model_selection import train_test_split 4 | import pandas as pd 5 | import numpy as np 6 | 7 | df = pd.read_csv('Selected_Features_Train.csv', chunksize=15000) 8 | clf = None 9 | params = {'boosting': 'gbdt', 10 | 'objective': 'binary', 11 | 'learning_rate': 0.01, 12 | 'num_leaves': 31, 13 | 'is_unbalance': True, 14 | 'verbosity': 100, 15 | 'bagging_freq': 5, 16 | 'bagging_fraction': 0.8} 17 | 18 | for data in df: 19 | X = np.asarray(data.iloc[:, 2:-1]) 20 | Y = np.asarray(data.iloc[:, -1]) 21 | X_train, X_cv, Y_train, Y_cv = train_test_split( 22 | X, Y, test_size=0.333, random_state=1) 23 | clf = lg.train(params=params, train_set=lg.Dataset(X_train, Y_train), num_boost_round=2, 24 | init_model=clf, valid_sets=lg.Dataset(X_cv, Y_cv), keep_training_booster=True) 25 | 26 | del df 27 | 28 | df = pd.read_csv('Selected_Features_Test.csv') 29 | X_test = np.asarray(df.iloc[:, 2:-1]) 30 | Y_test = np.asarray(df.iloc[:, -1]) 31 | pred_prob = clf.predict(X_test) 32 | 33 | pred = pred_prob >= 0.5 34 | acc = accuracy_score(Y_test, pred) 35 | prec = precision_score(Y_test, pred) 36 | rec = recall_score(Y_test, pred) 37 | f1 = f1_score(Y_test, pred) 38 | 39 | print('Accuracy = ', acc) 40 | print('Precision = ', prec) 41 | print('Recall = ', rec) 42 | print('F1 = ', f1) 43 | 44 | clf.save_model('Model_LightGBM.txt') 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning based P2P Bot-Net Detection through Network Flow Analysis 2 | 3 | This repository contains Team SaRaNi's (with me as **Team Leader**) submission to the **HCL Hack IITK 2020 Cybersecurity Hackathon**. We won the **Third Prize** globally for developing this tool. Note that the actual submission contained some preprocessed data as well, which is way too large to upload. We have included our **Finals Presentation**, please check it out at [Presentation](https://drive.google.com/file/d/1Y2dgCFZ0uKDx9qqxJ4AYFhegfl_lPpSA/view?usp=sharing) for a summary of our contributions as well as future directions to improve this tool. 4 | 5 | ## Index 6 | - [**Motivation**](#motivation) 7 | - [**Machine Learning Pipeline**](#machine-learning-pipeline) 8 | 1. [**Feature Extraction**](#1-feature-extraction) 9 | 2. [**Feature Selection**](#2-feature-selection) 10 | 3. [**Model Building**](#3-model-building) 11 | 4. [**Model Testing**](#4-model-testing) 12 | - [**Unique Contributions**](#unique-contributions) 13 | - [**Execution Instructions**](#execution-instructions) 14 | 15 | ## Motivation 16 | 17 | A bot-net is a network of infected hosts (bots) that works independently under the control of a Botmaster (Bot herder), which issues commands to bots using command and control (C&C) servers. Traditionally, bot-nets used a centralized client-server architecture which had a single point of failure but with the advent of peer-to-peer technology, the problem of single point of failure seems to have been resolved. Gaining advantage of the decentralized nature of the P2P architecture, botmasters started using P2P based communication mechanism. P2P bot-nets are **highly resilient** against detection even after some bots are identified or taken down. P2P bot-nets provide central frameworks for different cyber-crimes which include DDoS (Distributed Denial of Service), email spam, phishing, password sniffing, etc. 18 | 19 | The objective was to develop a tool for **identifying P2P bot-nets using network traffic analysis**. We also detect the hosts involved in P2P traffic and then the detected hosts are further analyzed to detect bot-nets. We formulated the underlying problem as a **Classification** problem, which was given as input a Flow, which is a 5-Tuple of (srcAddr, sPort, dstAddr, dPort, Protocol), and had to output a label classifying the flow as malicious or benign. The overall tool then, took as input a .pcap file which captured the traffic over a network, parsed the file to identify flows, and then used our trained model to classify each flow on-the-fly as malware/benign. Below we give a brief overview of our machine learning pipeline. 20 | 21 | ## Machine Learning Pipeline 22 | 23 | ### 1. Feature Extraction 24 | 25 | - We manually tested and examined the data, for getting a comprehensive understanding of the dataset provided. 26 | - We consulted multiple research papers to get an understanding of the different methods that we could use to approach the problem and create an exhaustive set of features. 27 | - Raw data files were parsed and the previously decided features were extracted. 28 | - We had a dataset with 2.57 Million examples in total, which we then split into Train, Validation and Test Sets (the sizes are mentioned in subsequent steps). 29 | 30 | 31 | ### 2. Feature Selection 32 | 33 | - We ran 2 Feature Selection algorithms on the train set, namely **Select-K-Best** and **Recursive Feature Elimination (RFE)**. 34 | - Through the results from these two, we selected the 10 best features out of 23 initial features that we had identified. 35 | 36 | ### 3. Model Building 37 | 38 | - We used a Gradient Boosting Decision Tree framework called LightGBM, which is efficient and capable of handling large-scale data. 39 | - We tuned the Hyperparameters like max_depth to make our model robust and prevent overfitting. 40 | - The Cross-Validation Set Size was 33.33% of our training set. 41 | - Cross-Validation Accuracy was used as the evaluation metric. 42 | - Batch-learning was used with a Batch-size of 10000. 43 | 44 | ### 4. Model Testing 45 | 46 | - We tested the model with 10% of our total data. 47 | - Some of the results that we obtained are: 48 | 49 | Accuracy | Precision | Recall | F1 Score 50 | ---------|-----------|--------|--------- 51 | 99.90% | 99.93% | 99.95% | 99.94% 52 | 53 | ## Unique Contributions 54 | 55 | - All the research papers we consulted used a subset of possible features for classification. We combined all of them to create an **exhaustive** feature set, and then selected the best out of them. 56 | - Most papers only used direct flow-based features. However, we also **hand-engineered certain statistical features**, which we intuitively felt could be useful for classification after manual inspection of sample files. 57 | - The majority of previous approaches were aimed at detecting General Botnets. We focused on **P2P Botnets only**, and did not consider features pertaining to IRC Botnets. 58 | - Past work that we saw either used basic ML Algorithms like Naive Bayes and Random Forest or computation intensive methods like Neural Networks. We used the **Fast and Sophisticated LightGBM** model, based on Gradient Boosted Decision Trees. This not only **reduced the Training Time**, but also **increased Accuracy**. 59 | 60 | ## Execution Instructions 61 | 62 | 1. Packages required : 63 | 1. numpy 64 | 2. sklearn 65 | 3. pandas 66 | 4. lightgbm 67 | 5. os 68 | 6. sys 69 | 7. csv 70 | 8. scapy 71 | 72 | 2. How to install packages : 73 | From Terminal : ```$ pip install ``` 74 | 75 | 3. This folder contains a python program "botnetdetect.py". This program takes as command line input (path to) a .pcap file and outputs in the format 76 | 77 | Flow= (srcAddr, sPort, dstAddr, dPort, Protocol) | Prediction 78 | ------------------------------------------------|------------------ 79 | \ | malicious/benign 80 | -------------------------------------------------------------------------------- /botnetdetect.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import sys 4 | from scapy.all import * 5 | from scapy.layers.http import * 6 | import lightgbm as lg 7 | import numpy as np 8 | 9 | model = lg.Booster(model_file=os.path.join(os.getcwd(), "Model_LightGBM.txt")) 10 | 11 | 12 | def create_features(srcAddr, dstAddr, srcPort, dstPort, fps=0, byte_size=0, payl=0, dur=0, incoming=False): 13 | features = { 14 | 'srcAddr': srcAddr, 15 | 'dstAddr': dstAddr 16 | } 17 | features['srcPort'] = srcPort 18 | features['dstPort'] = dstPort 19 | features['PX'] = 1 20 | if 63 <= byte_size and byte_size <= 400: 21 | features['NSP'] = 1 22 | else: 23 | features['NSP'] = 0 24 | if(features['PX'] > 0): 25 | features['PSP'] = (features['NSP']/features['PX'])*100 26 | else: 27 | features['PSP'] = 0 28 | if not incoming: 29 | features['in'] = 0 30 | else: 31 | features['in'] = 1 32 | features['dur'] = dur 33 | features['FPS'] = fps 34 | features['TBT'] = byte_size 35 | features['APL'] = payl 36 | if(features['dur'] > 0): 37 | features['BS'] = features['TBT']/features['dur'] 38 | features['PPS'] = features['PX']/features['dur'] 39 | else: 40 | features['BS'] = 0 41 | features['PPS'] = 0 42 | 43 | return features 44 | 45 | 46 | def update_features(features, nsp=False, incoming=False, byte_size=0, payl=0, dur=0): 47 | 48 | features['PX'] = features['PX'] + 1 49 | if nsp: 50 | features['NSP'] = features['NSP'] + 1 51 | features['PSP'] = (features['NSP']/features['PX'])*100 52 | if incoming: 53 | features['in'] = features['in'] + 1 54 | features['TBT'] = features['TBT'] + byte_size 55 | av = features['APL'] 56 | n = features['PX'] - 1 57 | if(n+1 > 0): 58 | features['APL'] = ((av * n) + payl)/(n + 1) 59 | features['dur'] = features['dur'] + 1 60 | if(features['dur'] > 0): 61 | features['BS'] = features['TBT']/features['dur'] 62 | features['PPS'] = features['PX']/features['dur'] 63 | 64 | return features 65 | 66 | # srcAddr dstAddr srcPort dstPort PPX PSP in FPS TBT APL BS PPS 67 | 68 | 69 | def flows_from_pcap(filePath): 70 | flows = {} 71 | fpcap = PcapReader(filePath) 72 | f_dup = PcapReader(filePath) 73 | pkt_nxt = next(f_dup) 74 | c = 0 75 | num = 0 76 | for pkt in fpcap: 77 | num = num+1 78 | dur = 0 79 | try: 80 | pkt_nxt = next(f_dup) 81 | dur = pkt_nxt.time - pkt.time 82 | except: 83 | dur = 0.0001 84 | 85 | srcAddr, dstAddr, sport, dport, proto = '', '', 0, 0, 3 86 | pload = 0 87 | tcp_close = False 88 | try: 89 | bs = pkt.len + 14 90 | except: 91 | continue 92 | 93 | if 'Ethernet' in pkt or 'cooked linux' in pkt: 94 | flag = False 95 | if('Ethernet' in pkt): 96 | eth = pkt['Ethernet'] 97 | if eth.type == 2048: 98 | flag = True 99 | else: 100 | lin = pkt['cooked linux'] 101 | flag = lin.proto == 2048 102 | if flag is True: 103 | ip = pkt['IP'] 104 | srcAddr = ip.src 105 | dstAddr = ip.dst 106 | proto = ip.proto 107 | 108 | if proto == 17 and pkt.haslayer('UDP'): 109 | sport = pkt['UDP'].sport 110 | dport = pkt['UDP'].dport 111 | pload = len(pkt['UDP'].payload) 112 | elif proto == 6 and pkt.haslayer('TCP'): 113 | sport = pkt['TCP'].sport 114 | dport = pkt['TCP'].dport 115 | pload = len(pkt['TCP'].payload) 116 | tcp_close = ('F' in pkt['TCP'].flags) or ( 117 | 'R' in pkt['TCP'].flags) 118 | elif proto == 1 and pkt.haslayer('ICMP'): 119 | if pkt['ICMP'].haslayer('IP') and pkt['ICMP']['IP'].proto == 17 and pkt['ICMP'].haslayer('UDP'): 120 | sport = pkt['ICMP']['UDP'].sport 121 | dport = pkt['ICMP']['UDP'].dport 122 | pload = len(pkt['ICMP']['UDP'].payload) 123 | elif pkt['ICMP'].haslayer('IP') and pkt['ICMP']['IP'].proto == 6 and pkt['ICMP'].haslayer('TCP'): 124 | sport = pkt['ICMP']['TCP'].sport 125 | dport = pkt['ICMP']['TCP'].dport 126 | pload = len(pkt['ICMP']['TCP'].payload) 127 | tcp_close = ('F' in pkt['ICMP']['TCP'].flags) or ( 128 | 'R' in pkt['ICMP']['TCP'].flags) 129 | else: 130 | continue 131 | else: 132 | continue 133 | else: 134 | continue 135 | nsp = 63 <= bs and bs <= 400 136 | tuple5 = (srcAddr, sport, dstAddr, dport, proto, True) 137 | tuple5_inv = (dstAddr, dport, srcAddr, sport, proto, True) 138 | if tuple5 in flows: 139 | features = flows[tuple5] 140 | flows[tuple5] = update_features( 141 | features, nsp, incoming=False, byte_size=bs, payl=pload, dur=dur) 142 | if tcp_close: 143 | temp = flows[tuple5] 144 | del flows[tuple5] 145 | tuple5 = (srcAddr, sport, dstAddr, dport, proto, False) 146 | flows[tuple5] = temp 147 | elif tuple5_inv in flows: 148 | features = flows[tuple5_inv] 149 | flows[tuple5_inv] = update_features( 150 | features, nsp, incoming=True, byte_size=bs, payl=pload, dur=dur) 151 | if tcp_close: 152 | temp = flows[tuple5_inv] 153 | del flows[tuple5_inv] 154 | tuple5_inv = (dstAddr, dport, srcAddr, sport, proto, False) 155 | flows[tuple5_inv] = temp 156 | else: 157 | flows[tuple5] = create_features(srcAddr, dstAddr, sport, dport, fps=pload, byte_size=bs, payl=pload, 158 | dur=dur, incoming=False) # features_list 159 | 160 | for flow in flows: 161 | if(num > 0): 162 | flows[flow]['PPX'] = flows[flow]['PX']/num 163 | else: 164 | flows[flow]['PPX'] = 0 165 | if('PX' in flows[flow]): 166 | del flows[flow]['PX'] 167 | if('NSP' in flows[flow]): 168 | del flows[flow]['NSP'] 169 | if('dur' in flows[flow]): 170 | del flows[flow]['dur'] 171 | return flows 172 | 173 | 174 | field_names = [ 175 | 'srcAddr', 176 | 'dstAddr', 177 | 'srcPort', 178 | 'dstPort', 179 | 'PPX', 180 | 'PSP', 181 | 'in', 182 | 'FPS', 183 | 'TBT', 184 | 'APL', 185 | 'BS', 186 | 'PPS' 187 | ] 188 | 189 | if(__name__ == "__main__"): 190 | filePath = str(sys.argv[1]) 191 | # Prepare Dictionary of Flow Features from filePath 192 | outfile = open(os.path.join(os.getcwd(), "results.txt"), 'x') 193 | csv_delimiter = "\t" 194 | columns = ["Flow=(srcAddr,srcPort,dstAddr,dstPort,proto)", "Prediction"] 195 | outfile.write(csv_delimiter.join(columns)+"\n") 196 | features_dict = flows_from_pcap(filePath) 197 | for flow in features_dict: 198 | if('BS' in features_dict[flow]): 199 | features_dict[flow]['BS'] = float( 200 | features_dict[flow]['BS']) 201 | if('PPS' in features_dict[flow]): 202 | features_dict[flow]['PPS'] = float( 203 | features_dict[flow]['PPS']) 204 | feature_vector = np.asarray([features_dict[flow]['srcPort'], 205 | features_dict[flow]['dstPort'], 206 | features_dict[flow]['PPX'], 207 | features_dict[flow]['PSP'], 208 | features_dict[flow]['in'], 209 | features_dict[flow]['FPS'], 210 | features_dict[flow]['TBT'], 211 | features_dict[flow]['APL'], 212 | features_dict[flow]['BS'], 213 | features_dict[flow]['PPS']]) 214 | prediction = model.predict(feature_vector[np.newaxis, ...]) 215 | if(prediction[0] >= 0.5): 216 | malicious = "malicious" 217 | else: 218 | malicious = "benign" 219 | outfile.write("("+str(flow[0])+","+str(flow[1])+","+str(flow[2]) + 220 | ","+str(flow[3])+","+str(flow[4])+")"+"\t"+malicious+"\n") 221 | -------------------------------------------------------------------------------- /Feature_Extraction.py: -------------------------------------------------------------------------------- 1 | from scapy.all import * 2 | from scapy.layers.http import * 3 | import os 4 | import csv 5 | 6 | 7 | def create_features(label, srcAddr, dstAddr, srcPort, dstPort, proto=4, fps=0, byte_size=0, payl=0, time=0, dur=0, incoming=False, http=4): 8 | features = { 9 | 'malicious': label, 10 | 'srcAddr': srcAddr, 11 | 'dstAddr': dstAddr 12 | } 13 | features['srcPort'] = srcPort 14 | features['dstPort'] = dstPort 15 | if proto == 2: 16 | features['proto'] = 2 17 | elif proto == 6: 18 | features['proto'] = 0 19 | elif proto == 17: 20 | features['proto'] = 1 21 | else: 22 | features['proto'] = 3 23 | features['PX'] = 1 24 | if byte_size <= 62: 25 | features['NNP'] = 1 26 | else: 27 | features['NNP'] = 0 28 | if 63 <= byte_size and byte_size <= 400: 29 | features['NSP'] = 1 30 | else: 31 | features['NSP'] = 0 32 | if(features['PX'] > 0): 33 | features['PSP'] = (features['NSP']/features['PX'])*100 34 | features['PNP'] = (features['NNP']/features['PX'])*100 35 | else: 36 | features['PSP'] = 0 37 | features['PNP'] = 0 38 | if not incoming: 39 | features['out'] = 1 40 | features['in'] = 0 41 | else: 42 | features['in'] = 1 43 | features['out'] = 0 44 | if features['out'] > 0: 45 | features['IOPR'] = features['in']/features['out'] 46 | else: 47 | features['IOPR'] = 0 48 | features['dur'] = dur 49 | features['FPS'] = fps 50 | features['TBT'] = byte_size 51 | features['APL'] = payl 52 | features['PV'] = 0.0 53 | if(features['dur'] > 0): 54 | features['BS'] = features['TBT']/features['dur'] 55 | features['PPS'] = features['PX']/features['dur'] 56 | else: 57 | features['BS'] = 0 58 | features['PPS'] = 0 59 | features['AIT'] = 0 60 | features['HTTPM'] = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0} 61 | features['HTTPM'][http] = 1 62 | features['time'] = time 63 | 64 | return features 65 | 66 | 67 | def update_features(features, nnp=False, nsp=False, incoming=False, byte_size=0, payl=0, time=0, dur=0, http=4): 68 | 69 | features['PX'] = features['PX'] + 1 70 | if nnp: 71 | features['NNP'] = features['NNP'] + 1 72 | features['PNP'] = (features['NNP']/features['PX'])*100 73 | if nsp: 74 | features['NSP'] = features['NSP'] + 1 75 | features['PSP'] = (features['NSP']/features['PX'])*100 76 | if incoming: 77 | features['in'] = features['in'] + 1 78 | else: 79 | features['out'] = features['out'] + 1 80 | if(features['out'] > 0): 81 | features['IOPR'] = features['in'] / features['out'] 82 | features['TBT'] = features['TBT'] + byte_size 83 | av = features['APL'] 84 | sd = features['PV'] 85 | n = features['PX'] - 1 86 | if(n+1 > 0): 87 | features['APL'] = ((av * n) + payl)/(n + 1) 88 | if(n > 0): 89 | features['PV'] = ((((n-1)*sd**2) + (n * av**2) - 90 | ((n + 1)*features['APL']**2) + payl**2)/n)**0.5 91 | features['dur'] = features['dur'] + 1 92 | if(features['dur'] > 0): 93 | features['BS'] = features['TBT']/features['dur'] 94 | features['PPS'] = features['PX']/features['dur'] 95 | del_t = time - features['time'] 96 | features['time'] = time 97 | if(n+1 > 0): 98 | features['AIT'] = ((features['AIT'] * n) + del_t) / (n + 1) 99 | features['HTTPM'][http] = features['HTTPM'][http] + 1 100 | 101 | return features 102 | 103 | 104 | def flows_from_pcap(label, filePath): 105 | flows = {} 106 | fpcap = PcapReader(filePath) 107 | f_dup = PcapReader(filePath) 108 | pkt_nxt = next(f_dup) 109 | c = 0 110 | num = 0 111 | total_bytes = 0 112 | for pkt in fpcap: 113 | num = num+1 114 | dur = 0 115 | try: 116 | pkt_nxt = next(f_dup) 117 | dur = pkt_nxt.time - pkt.time 118 | except: 119 | dur = 0.0001 120 | 121 | srcAddr, dstAddr, sport, dport, proto = '', '', 0, 0, 3 122 | pload, http_meth = 0, 4 123 | tcp_close = False 124 | try: 125 | bs = pkt.len + 14 126 | total_bytes += pkt.len 127 | except: 128 | continue 129 | 130 | if 'Ethernet' in pkt or 'cooked linux' in pkt: 131 | flag = False 132 | if('Ethernet' in pkt): 133 | eth = pkt['Ethernet'] 134 | if eth.type == 2048: 135 | flag = True 136 | else: 137 | lin = pkt['cooked linux'] 138 | flag = lin.proto == 2048 139 | if flag is True: 140 | ip = pkt['IP'] 141 | proto = ip.proto 142 | srcAddr = ip.src 143 | dstAddr = ip.dst 144 | if(os.path.basename(os.path.dirname(filePath)) == "p2pbox1"): 145 | if((str(srcAddr) not in benign_ip["p2pbox1"]) and (str(dstAddr) not in benign_ip["p2pbox1"])): 146 | continue 147 | if(os.path.basename(os.path.dirname(filePath)) == "p2pbox2"): 148 | if((str(srcAddr) not in benign_ip["p2pbox2"]) and (str(dstAddr) not in benign_ip["p2pbox2"])): 149 | continue 150 | if(os.path.basename(os.path.dirname(filePath)) == "torrent"): 151 | if((str(srcAddr) not in benign_ip["torrent"]) and (str(dstAddr) not in benign_ip["torrent"])): 152 | continue 153 | if(os.path.basename(os.path.dirname(filePath)) == "storm"): 154 | if((str(srcAddr) not in malicious_ip["storm"]) and (str(dstAddr) not in malicious_ip["storm"])): 155 | continue 156 | if(os.path.basename(os.path.dirname(filePath)) == "vinchuca"): 157 | if((str(srcAddr) not in malicious_ip["vinchuca"]) and (str(dstAddr) not in malicious_ip["vinchuca"])): 158 | continue 159 | if(os.path.basename(os.path.dirname(filePath)) == "zeus"): 160 | if((str(srcAddr) not in malicious_ip["zeus"]) and (str(dstAddr) not in malicious_ip["zeus"])): 161 | continue 162 | if proto == 17 and pkt.haslayer('UDP'): 163 | sport = pkt['UDP'].sport 164 | dport = pkt['UDP'].dport 165 | pload = len(pkt['UDP'].payload) 166 | elif proto == 6 and pkt.haslayer('TCP'): 167 | sport = pkt['TCP'].sport 168 | dport = pkt['TCP'].dport 169 | pload = len(pkt['TCP'].payload) 170 | if pkt.haslayer('HTTPRequest'): 171 | meth = pkt['HTTPRequest'].Method 172 | if meth == b'GET': 173 | http_meth = 0 174 | elif meth == b'POST': 175 | http_meth = 1 176 | elif meth == b'PUT': 177 | http_meth = 2 178 | elif meth == b'DELETE': 179 | http_meth = 3 180 | else: 181 | http_meth = 4 182 | tcp_close = ('F' in pkt['TCP'].flags) or ( 183 | 'R' in pkt['TCP'].flags) 184 | elif proto == 1 and pkt.haslayer('ICMP'): 185 | if pkt['ICMP'].haslayer('IP') and pkt['ICMP']['IP'].proto == 17 and pkt['ICMP'].haslayer('UDP'): 186 | sport = pkt['ICMP']['UDP'].sport 187 | dport = pkt['ICMP']['UDP'].dport 188 | pload = len(pkt['ICMP']['UDP'].payload) 189 | elif pkt['ICMP'].haslayer('IP') and pkt['ICMP']['IP'].proto == 6 and pkt['ICMP'].haslayer('TCP'): 190 | sport = pkt['ICMP']['TCP'].sport 191 | dport = pkt['ICMP']['TCP'].dport 192 | pload = len(pkt['ICMP']['TCP'].payload) 193 | if pkt['ICMP'].haslayer('HTTPRequest'): 194 | meth = pkt['ICMP']['HTTPRequest'].Method 195 | if meth == b'GET': 196 | http_meth = 0 197 | elif meth == b'POST': 198 | http_meth = 1 199 | elif meth == b'PUT': 200 | http_meth = 2 201 | elif meth == b'DELETE': 202 | http_meth = 3 203 | else: 204 | http_meth = 4 205 | tcp_close = ('F' in pkt['ICMP']['TCP'].flags) or ( 206 | 'R' in pkt['ICMP']['TCP'].flags) 207 | else: 208 | continue 209 | else: 210 | continue 211 | else: 212 | continue 213 | nnp = pload == 0 214 | nsp = 63 <= bs and bs <= 400 215 | tuple5 = (srcAddr, sport, dstAddr, dport, proto, True) 216 | tuple5_inv = (dstAddr, dport, srcAddr, sport, proto, True) 217 | if tuple5 in flows: 218 | features = flows[tuple5] 219 | flows[tuple5] = update_features( 220 | features, nnp, nsp, incoming=False, byte_size=bs, payl=pload, time=pkt.time, dur=dur, http=http_meth) 221 | if tcp_close: 222 | temp = flows[tuple5] 223 | del flows[tuple5] 224 | tuple5 = (srcAddr, sport, dstAddr, dport, proto, False) 225 | flows[tuple5] = temp 226 | elif tuple5_inv in flows: 227 | features = flows[tuple5_inv] 228 | flows[tuple5_inv] = update_features( 229 | features, nnp, nsp, incoming=True, byte_size=bs, payl=pload, time=pkt.time, dur=dur, http=http_meth) 230 | if tcp_close: 231 | temp = flows[tuple5_inv] 232 | del flows[tuple5_inv] 233 | tuple5_inv = (dstAddr, dport, srcAddr, sport, proto, False) 234 | flows[tuple5_inv] = temp 235 | else: 236 | flows[tuple5] = create_features(label, srcAddr, dstAddr, sport, dport, proto, fps=pload, byte_size=bs, payl=pload, 237 | time=pkt.time, dur=dur, incoming=False, http=http_meth) # features_list 238 | 239 | for flow in flows: 240 | if(num > 0): 241 | flows[flow]['PPX'] = flows[flow]['PX']/num 242 | else: 243 | flows[flow]['PPX'] = 0 244 | if(total_bytes > 0): 245 | flows[flow]['PBT'] = flows[flow]['TBT']/total_bytes 246 | else: 247 | flows[flow]['PBT'] = 0 248 | return flows 249 | 250 | 251 | benign_ip = { 252 | "p2pbox1": ["192.168.1.2"], 253 | "p2pbox2": ["192.168.2.2"], 254 | "torrent": ["172.27.28.106"] 255 | } 256 | 257 | malicious_ip = { 258 | "storm": ["66.154.80.101", 259 | "66.154.80.105", 260 | "66.154.80.111", 261 | "66.154.80.125", 262 | "66.154.83.107", 263 | "66.154.83.113", 264 | "66.154.83.138", 265 | "66.154.83.80", 266 | "66.154.87.39", 267 | "66.154.87.41", 268 | "66.154.87.57", 269 | "66.154.87.58", 270 | "66.154.87.61"], 271 | "vinchuca": ["172.27.22.206"], 272 | "zeus": ["10.0.2.15"] 273 | } 274 | 275 | field_names = [ 276 | 'srcAddr', 277 | 'dstAddr', 278 | 'srcPort', 279 | 'dstPort', 280 | 'proto', 281 | 'PPX', 282 | 'PBT', 283 | 'PX', 284 | 'NNP', 285 | 'NSP', 286 | 'PSP', 287 | 'PNP', 288 | 'out', 289 | 'in', 290 | 'IOPR', 291 | 'dur', 292 | 'FPS', 293 | 'TBT', 294 | 'APL', 295 | 'PV', 296 | 'BS', 297 | 'PPS', 298 | 'AIT', 299 | 'HTTPM0', 300 | 'HTTPM1', 301 | 'HTTPM2', 302 | 'HTTPM3', 303 | 'HTTPM4', 304 | 'malicious' 305 | ] 306 | 307 | with open('Results_2.csv', 'x') as csvfile: 308 | writer = csv.DictWriter(csvfile, fieldnames=field_names) 309 | writer.writeheader() 310 | 311 | i = 0 312 | # Launch Benign 313 | for root, dirs, files in os.walk(os.path.join('Botnet_Detection_Dataset', 'Benign')): 314 | for name in files: 315 | filePath = os.path.join(root, name) 316 | if(name != "ip_details.txt"): 317 | flow_features = flows_from_pcap(0, filePath) 318 | for flow in flow_features: 319 | if('dur' in flow_features[flow]): 320 | flow_features[flow]['dur'] = float( 321 | flow_features[flow]['dur']) 322 | if('BS' in flow_features[flow]): 323 | flow_features[flow]['BS'] = float( 324 | flow_features[flow]['BS']) 325 | if('PPS' in flow_features[flow]): 326 | flow_features[flow]['PPS'] = float( 327 | flow_features[flow]['PPS']) 328 | if('AIT' in flow_features[flow]): 329 | flow_features[flow]['AIT'] = float( 330 | flow_features[flow]['AIT']) 331 | if('time' in flow_features[flow]): 332 | flow_features[flow]['time'] = float( 333 | flow_features[flow]['time']) 334 | if('HTTPM' in flow_features[flow]): 335 | flow_features[flow]['HTTPM0'] = flow_features[flow]['HTTPM'][0] 336 | flow_features[flow]['HTTPM1'] = flow_features[flow]['HTTPM'][1] 337 | flow_features[flow]['HTTPM2'] = flow_features[flow]['HTTPM'][2] 338 | flow_features[flow]['HTTPM3'] = flow_features[flow]['HTTPM'][3] 339 | flow_features[flow]['HTTPM4'] = flow_features[flow]['HTTPM'][4] 340 | del flow_features[flow]['HTTPM'] 341 | if('time' in flow_features[flow]): 342 | del flow_features[flow]['time'] 343 | writer.writerow(flow_features[flow]) 344 | i = i+1 345 | print(i, "Files Processed:", filePath, len(flow_features)) 346 | 347 | # Launch Botnet 348 | for root, dirs, files in os.walk(os.path.join('Botnet_Detection_Dataset', 'Botnet', 'storm')): 349 | for name in files: 350 | filePath = os.path.join(root, name) 351 | if(name != "storm-IP" and name != "vinchuca_IP" and name != "zeus_IP"): 352 | flow_features = flows_from_pcap(1, filePath) 353 | for flow in flow_features: 354 | if('dur' in flow_features[flow]): 355 | flow_features[flow]['dur'] = float( 356 | flow_features[flow]['dur']) 357 | if('BS' in flow_features[flow]): 358 | flow_features[flow]['BS'] = float( 359 | flow_features[flow]['BS']) 360 | if('PPS' in flow_features[flow]): 361 | flow_features[flow]['PPS'] = float( 362 | flow_features[flow]['PPS']) 363 | if('AIT' in flow_features[flow]): 364 | flow_features[flow]['AIT'] = float( 365 | flow_features[flow]['AIT']) 366 | if('time' in flow_features[flow]): 367 | flow_features[flow]['time'] = float( 368 | flow_features[flow]['time']) 369 | if('HTTPM' in flow_features[flow]): 370 | flow_features[flow]['HTTPM0'] = flow_features[flow]['HTTPM'][0] 371 | flow_features[flow]['HTTPM1'] = flow_features[flow]['HTTPM'][1] 372 | flow_features[flow]['HTTPM2'] = flow_features[flow]['HTTPM'][2] 373 | flow_features[flow]['HTTPM3'] = flow_features[flow]['HTTPM'][3] 374 | flow_features[flow]['HTTPM4'] = flow_features[flow]['HTTPM'][4] 375 | del flow_features[flow]['HTTPM'] 376 | if('time' in flow_features[flow]): 377 | del flow_features[flow]['time'] 378 | writer.writerow(flow_features[flow]) 379 | i = i+1 380 | print(i, "Files Processed:", filePath) 381 | --------------------------------------------------------------------------------