├── .gitignore
├── Security Tasks Evaluation
├── BotnetAnalysis
│ ├── peershark
│ │ ├── __init__.py
│ │ ├── PcapInputFiles.txt
│ │ ├── TsharkOptions.txt
│ │ ├── P2P_CONSTANTS.py
│ │ ├── Packet.py
│ │ ├── createTrainingData.py
│ │ ├── generateSuperFlows.py
│ │ ├── FilterPackets.py
│ │ ├── SuperFlow.py
│ │ ├── GenerateFlows.py
│ │ ├── FilterPacketsHelper.py
│ │ ├── README.md
│ │ └── Flow.py
│ ├── Data
│ │ ├── Storm
│ │ │ └── placeholder.csv
│ │ ├── P2PTraffic
│ │ │ └── placeholder.csv
│ │ └── Waledac
│ │ │ └── placeholder.csv
│ ├── TrafficCaptures
│ │ ├── Storm
│ │ │ └── placeholder.pcap
│ │ ├── Waledac
│ │ │ └── placeholder.pcap
│ │ └── P2PTraffic
│ │ │ └── placeholder.pcap
│ ├── fullRun.sh
│ ├── README.md
│ ├── quantize.py
│ └── runExperiment.py
├── WFAnalysis
│ ├── AllWebsiteAnalysis
│ │ ├── Data
│ │ │ └── placeholder.data
│ │ ├── ParsingUtilities
│ │ │ ├── CSVParseToWeka.py
│ │ │ ├── CSVParseToSimulateHerrman.py
│ │ │ └── CSVParseWebsiteUnbalanced.py
│ │ ├── generateFigures.py
│ │ └── runExperiment.py
│ ├── SingleWebsiteAnalysis
│ │ ├── Data
│ │ │ └── placeholder.data
│ │ ├── ParsingUtilities
│ │ │ ├── CSVParseToWeka.py
│ │ │ └── CSVParseWebsiteUnbalanced.py
│ │ ├── runExperiment.py
│ │ └── generateFigures.py
│ └── README.md
└── MPTAnalysis
│ ├── README.md
│ ├── FacetAnalysis
│ ├── runExperiment.py
│ ├── online_sketching.py
│ ├── generateFigures.py
│ ├── compressive_ta.py
│ └── generateFeatures.py
│ └── DeltaShaperAnalysis
│ ├── runExperiment.py
│ ├── online_sketching.py
│ ├── generateFigures.py
│ └── compressive_ta.py
├── README.md
└── Flow Marker Accumulator
    └── flowlens-v1model.p4
/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/Data/Storm/placeholder.csv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/Data/P2PTraffic/placeholder.csv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/Data/Waledac/placeholder.csv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/TrafficCaptures/Storm/placeholder.pcap: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/TrafficCaptures/Waledac/placeholder.pcap: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/TrafficCaptures/P2PTraffic/placeholder.pcap: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Security Tasks 
Evaluation/WFAnalysis/AllWebsiteAnalysis/Data/placeholder.data: -------------------------------------------------------------------------------- 1 | Place openssh.data here -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/SingleWebsiteAnalysis/Data/placeholder.data: -------------------------------------------------------------------------------- 1 | Place openssh.data here -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/PcapInputFiles.txt: -------------------------------------------------------------------------------- 1 | /Users/dmbb/Desktop/flowscope/BotnetAnalysis/Data/P2PTraffic 2 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/TsharkOptions.txt: -------------------------------------------------------------------------------- 1 | -t e 2 | -T fields 3 | -E separator=, 4 | -e ip.src -e ip.dst -e ip.proto -e frame.time_epoch -e tcp.len -e udp.length 5 | -Y "(ip.proto==6)||(ip.proto==17)" 6 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/fullRun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for BIN_WIDTH in 1 16 32 64 128 256 4 | do 5 | for IPT_BIN_WIDTH in 0 1 10 60 300 900 6 | do 7 | python runExperiment.py $BIN_WIDTH $IPT_BIN_WIDTH 8 | done 9 | done -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/P2P_CONSTANTS.py: -------------------------------------------------------------------------------- 1 | PCAPDATADIR = './pcapdata/' 2 | PCAPFILES = 'PcapInputFiles.txt' 3 | TSHARKOPTIONSFILE = 'TsharkOptions.txt' 4 | TCP_PROTO = '6' 5 | UDP_PROTO = '17' 6 | UDP_HEADERLENGTH = 8 7 | 8 | #utility functions 9 | import os 10 | def getCSVFiles(dirname): 11 | csvfiles = [] 12 | for eachfile in os.listdir(dirname): 13 | if eachfile.endswith('.csv'): 14 | csvfiles.append(dirname + eachfile) 15 | return csvfiles -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/README.md: -------------------------------------------------------------------------------- 1 | ##Dependencies and Data 2 | 3 | 4 | ### Website Fingerprinting 5 | 6 | - Download the OpenSSH dataset (parsed and obtained from the original Herrman MySQL database) available [here](https://turbina.gsd.inesc-id.pt/resources/openSSH_herrman/openssh_data.tar.gz) 7 | - Place it inside `WFAnalysis/AllWebsiteAnalysis/Data` and `WFAnalysis/SingleWebsiteAnalysis/Data`. 8 | 9 | 10 | 11 | ### Running the code 12 | 13 | - Execute the `RunExperiment.py` script in each of the considered website fingerprinting settings -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/README.md: -------------------------------------------------------------------------------- 1 | ##Dependencies and Data 2 | 3 | 4 | ### Multimedia Protocol Tunneling 5 | 6 | - Download the traffic captures of covert channel tools available [here](https://turbina.gsd.inesc-id.pt/resources/mpt_detection/) 7 | - Place them in `MPTAnalysis/DeltaShaperAnalysis/TrafficCaptures` and `MPTAnalysis/FacetAnalysis/TrafficCaptures`, respectively. 
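A minimal sanity-check sketch (not part of the original scripts; assumes it is run from the `MPTAnalysis` folder) to confirm the downloaded captures were extracted into the folders listed above before launching the experiments:

```python
import os

# Capture folders expected by the analysis scripts, as listed above.
capture_dirs = ["DeltaShaperAnalysis/TrafficCaptures", "FacetAnalysis/TrafficCaptures"]

for folder in capture_dirs:
    if os.path.isdir(folder):
        n_files = len([f for f in os.listdir(folder) if not f.startswith(".")])
        print("%s: %d file(s) found" % (folder, n_files))
    else:
        print("%s: missing -- extract the downloaded captures here" % folder)
```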
8 | 9 | 10 | ### Running the code 11 | 12 | - Execute the `RunExperiment.py` script in each of the particular covert channel generating tools folder. Then execute `generateFigures.py` to generate figures from the obtained results. -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/Packet.py: -------------------------------------------------------------------------------- 1 | import socket 2 | #defines properties of a packet 3 | class Packet: 4 | def __init__(self,fields): 5 | if fields == None: 6 | self.source = None 7 | self.dest = None 8 | self.timestamp = None 9 | self.size = 0 10 | self.key = None 11 | else: 12 | self.source = socket.inet_aton(fields[0]) 13 | self.dest = socket.inet_aton(fields[1]) 14 | self.timestamp = float(fields[2]) 15 | self.size = int(fields[3]) 16 | if self.source < self.dest: 17 | self.key = self.source + self.dest 18 | else: 19 | self.key = self.dest + self.source 20 | 21 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/README.md: -------------------------------------------------------------------------------- 1 | ##Dependencies and Data 2 | 3 | 4 | ### Botnets 5 | 6 | - Download the P2P and botnet datasets gathered for PeerRush, available [here](http://peerrush.cs.uga.edu/peerrush/) 7 | - Place them inside `BotnetAnalysis/Data` 8 | - Botnet detection code by Pratik Narang is available [here](https://github.com/pratiknarang/peershark) 9 | 10 | ### Parse Original Captures Used in PeerShark 11 | 12 | - For each dataset (Waledac, Storm, P2P) 13 | - Run `peershark/FilterPackets.py` 14 | - Retrieve original parse of the .pcap at `pcapdata` folder 15 | 16 | *Note: Storm data samples must be appended with ".pcap"* 17 | `for f in *; do mv "$f" "$f.pcap"; done` 18 | 19 | ### Run the FlowLens botnet detection experiment 20 | 21 | Run `fullRun.sh`, which is responsible for applying different quantization parameter combinations on the PL and IPT of P2P packet flows -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/createTrainingData.py: -------------------------------------------------------------------------------- 1 | from P2P_CONSTANTS import * 2 | 3 | 4 | def runTrainingDataGenerator(super_flow_data_dir, training_data_dir, bin_width, ipt_bin_width): 5 | #takes takes 50,000 examples and puts it in necessary format for training 6 | csvfiles = [] 7 | if os.path.isdir(super_flow_data_dir): 8 | csvfiles += getCSVFiles(super_flow_data_dir) 9 | 10 | #print ".csv files to generate training data: %s"%(csvfiles) 11 | 12 | outfile = open(training_data_dir + 'trainingdata_' + str(bin_width) + "_" + str(ipt_bin_width) + '.csv','w') 13 | for filename in csvfiles: 14 | label = filename.split('/')[-2] 15 | inputfile = open(filename) 16 | line = inputfile.readline().strip() 17 | while line!='': 18 | fields = line.split(',') 19 | if float(fields[4])!=0 and float(fields[3])!=0 and float(fields[7])!=0: 20 | outfile.write( 21 | fields[2] + ',' + 22 | fields[3] + ',' + 23 | fields[4] + ',' + 24 | fields[7] + ',' + 25 | label + '\n') 26 | line = inputfile.readline().strip() 27 | inputfile.close() 28 | outfile.close() -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/generateSuperFlows.py: -------------------------------------------------------------------------------- 1 | 
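## Merge the per-capture flow files produced by GenerateFlows.py into superflows:
## flows between the same IP pair whose inter-flow gap is at most 'flowgap' seconds.
## One output CSV (named '<flowgap>.csv') is written per flowgap value.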
from P2P_CONSTANTS import * 2 | import socket 3 | import Flow 4 | import SuperFlow 5 | import sys 6 | 7 | 8 | def runGenerateSuperFlows(flow_data_dir, super_flow_data_dir, flowgap): 9 | #TIMEGAP IN SECONDS 10 | csvfiles = getCSVFiles(flow_data_dir) 11 | #print csvfiles 12 | 13 | flowdata = [] 14 | for filename in csvfiles: 15 | inputfile = open(filename) 16 | data = [line.strip() for line in inputfile] 17 | inputfile.close() 18 | 19 | for eachline in data: 20 | fields = eachline.split(',') 21 | flowdata.append(SuperFlow.SuperFlow(fields)) 22 | print '\tNo. of flows to be processed: ' + str(len(flowdata)) 23 | 24 | 25 | flowdata = Flow.combineFlows(flowdata, flowgap) 26 | print '\tSuperflows (Flows with flowgap = ' + str(flowgap) + ' sec) : ' + str(len(flowdata)) 27 | 28 | outfile = open(super_flow_data_dir + str(flowgap) + '.csv', 'w') 29 | 30 | to_write = [] 31 | for flow in flowdata: 32 | to_write.append( 33 | socket.inet_ntoa(flow.ip1) + ',' + 34 | socket.inet_ntoa(flow.ip2) + ',' + 35 | str(flow.getNoOfPackets()) + ',' + 36 | str(flow.getNoOfBytes()) + ',' + 37 | '%.6f'%flow.getInterArrivaltime() + ',' + 38 | '%.6f'%flow.getStart() + ',' + 39 | '%.6f'%flow.getEnd() + ',' + 40 | '%.6f'%flow.getDurationInSeconds()) 41 | outfile.write("\n".join(to_write)) 42 | outfile.close() -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/FilterPackets.py: -------------------------------------------------------------------------------- 1 | ## Module to obtain packet data from a pcap/dump file 2 | ## and save it in csv format using tshark. 3 | ## Filenames of input pcap files are taken from InputFiles.txt 4 | ## Tshark options are present in TsharkOptions.txt 5 | ## TsharkOptions.txt should not contain the -r option. 
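## With the options in TsharkOptions.txt, each row of the resulting csv holds:
## ip.src, ip.dst, ip.proto, frame.time_epoch, tcp.len, udp.length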
6 | 7 | ## usage: python FilterPackets.py 8 | 9 | #import global constants 10 | from P2P_CONSTANTS import * 11 | from FilterPacketsHelper import * 12 | import multiprocessing as MP 13 | import subprocess 14 | 15 | #execute a shell command as a child process 16 | def executeCommand(command,outfilename): 17 | sem.acquire() 18 | 19 | subprocess.call(command, shell = True) 20 | 21 | infile = open(outfilename, 'r') 22 | data = [eachline.strip() for eachline in infile] 23 | infile.close() 24 | 25 | data = preprocess(data) 26 | 27 | outfile = open(outfilename,'w') 28 | for eachcomponent in data: 29 | outfile.write(eachcomponent) 30 | outfile.close() 31 | 32 | print 'done processing : ' + outfilename 33 | sem.release() 34 | 35 | #obtain input parameters and pcapfilenames 36 | inputfiles = getPCapFileNames() 37 | print "Input Files: " + str(inputfiles) 38 | tsharkOptions = getTsharkOptions() 39 | 40 | #create a semaphore so as not to exceed threadlimit 41 | sem = MP.Semaphore(THREADLIMIT) 42 | 43 | #get tshark commands to be executed 44 | for filename in inputfiles: 45 | print filename 46 | (command,outfilename) = contructTsharkCommand(filename,tsharkOptions) 47 | task = MP.Process(target = executeCommand, args = (command, outfilename,)) 48 | task.start() -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/AllWebsiteAnalysis/ParsingUtilities/CSVParseToWeka.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import csv 4 | 5 | def main(argv): 6 | OutputFile = open(argv[1], 'w') 7 | InputFile = open(argv[0]) 8 | 9 | bin_dict = {} 10 | 11 | OutputFile.write("@relation\'WF\'\n\n") 12 | OutputFile.write("@attribute Text string\n") 13 | OutputFile.write("@attribute class {") 14 | 15 | csv_reader = csv.reader(InputFile, delimiter=',') 16 | 17 | csv_header = "" 18 | website_list = set() 19 | text = [] 20 | 21 | for n, row in enumerate(csv_reader): 22 | if(n == 0): 23 | #Init bin dict 24 | csv_header = row 25 | prefix = "packetLengthBin_" 26 | for i in range(len(csv_header)-1): 27 | parsedBucketSize = csv_header[i][(len(prefix) + 1):] 28 | bin_dict[i] = parsedBucketSize 29 | continue 30 | 31 | currWebsite = row[-1] 32 | website_list.add(currWebsite) 33 | bin_list = row[:-1] 34 | 35 | text.append("\'") 36 | if("Online" in argv[1]): #Fix for online Sketching (Coskun et al.) 
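#Online sketch vectors already store one value per position, so each value is written once (the else branch instead repeats each bin label according to its packet count)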
37 | for i in range(len(bin_list)): 38 | text.append(str(bin_list[i]) + " ") 39 | else: #For the others 40 | for i in range(len(bin_list)): 41 | for _ in range(int(bin_list[i])): 42 | text.append(str(bin_dict[i]) + " ") 43 | 44 | text.append("\'," + currWebsite + "\n") 45 | 46 | #Write classes on header 47 | OutputFile.write(",".join(website_list)) 48 | OutputFile.write("}\n\n") 49 | #Write data 50 | OutputFile.write("@data\n\n") 51 | OutputFile.write("".join(text)) 52 | 53 | 54 | OutputFile.close() 55 | 56 | 57 | if __name__ == "__main__": 58 | main(sys.argv[1:]) -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/SingleWebsiteAnalysis/ParsingUtilities/CSVParseToWeka.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import csv 4 | 5 | def RoundToNearest(n, m): 6 | if (m == 1): 7 | return n 8 | if (n > 0): 9 | r = n % m 10 | return n + m - r if r + r >= m else n - r 11 | else: 12 | if (n < 0): 13 | return RoundToNearest(abs(n), m) * -1 14 | return 0 15 | 16 | def main(argv): 17 | OutputFile = open(argv[1], 'w') 18 | InputFile = open(argv[0], 'rb') 19 | website = argv[2] 20 | 21 | bin_dict = {} 22 | 23 | OutputFile.write("@relation\'WF\'\n\n") 24 | OutputFile.write("@attribute Text string\n") 25 | OutputFile.write("@attribute class {Nope,%s}\n\n"%(website)) 26 | OutputFile.write("@data\n\n") 27 | 28 | 29 | csv_reader = csv.reader(InputFile, delimiter=',') 30 | 31 | csv_header = "" 32 | text = [] 33 | 34 | for n, row in enumerate(csv_reader): 35 | if(n == 0): 36 | #Init bin dict 37 | csv_header = row 38 | prefix = "packetLengthBin_" 39 | for i in range(len(csv_header)-1): 40 | parsedBucketSize = csv_header[i][(len(prefix) + 1):] 41 | bin_dict[i] = parsedBucketSize 42 | continue 43 | 44 | currWebsite = row[-1] 45 | bin_list = row[:-1] 46 | 47 | text.append("\'") 48 | for i in range(len(bin_list)): 49 | for _ in range(int(bin_list[i])): 50 | text.append(str(bin_dict[i]) + " ") 51 | 52 | if (website not in currWebsite): 53 | text.append("\'," + "Nope" + "\n") 54 | else: 55 | text.append("\'," + website + "\n") 56 | 57 | 58 | OutputFile.write("".join(text)) 59 | OutputFile.close() 60 | 61 | if __name__ == "__main__": 62 | main(sys.argv[1:]) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FlowLens 2 | 3 | This repository holds the code for the paper "FlowLens: Enabling Efficient Flow Classification for ML-based Network Security Applications". 4 | If you end up using our code for your experiments, please cite our work as follows: 5 | 6 | ``` 7 | @inproceedings{protozoa, 8 | title={FlowLens: Enabling Efficient Flow Classification for ML-based Network Security Applications}, 9 | author={Barradas, Diogo and Santos, Nuno and Rodrigues, Lu{\'i}s and Signorello, Salvatore and Ramos, Fernando M. V. and Madeira, Andr{\'e}}, 10 | booktitle={Proceedings of the 28th Network and Distributed System Security Symposium}, 11 | year={2021}, 12 | address={San Diego, CA, USA}, 13 | } 14 | ``` 15 | 16 | ##Dependencies and Data 17 | 18 | 19 | ### General Dependencies 20 | 21 | - Install WEKA 22 | - Run `pip install -r requirements.txt` 23 | 24 | ### Datasets 25 | 26 | - Please check the `README.md` in each specific security task folder 27 | 28 | 29 | ## How may I use your code? 
30 | 31 | - The `Security Tasks Evaluation` folder includes the code we used for evaluating different ML-based security tasks when using FlowLens. The code applies different combinations of our quantization and truncation approaches and allows for checking FlowLens flow markers trade-offs between accuracy and memory footprint 32 | 33 | - The `Flow Marker Accumulator` folder includes an adaptation of the P416 code we used for implementing FlowLens' flow marker accumulator in a Barefoot Tofino switch. Due to NDA concerns, we make public this adapted version of our code that can be run on the P4's BMV2 behavioral model. 34 | 35 | 36 | *Todo: Provide a full end-to-end dummy example of FlowLens running in BMV2 - e.g. on P4's tutorial VM.* -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/SuperFlow.py: -------------------------------------------------------------------------------- 1 | from Packet import * 2 | import socket 3 | import Flow 4 | 5 | #get median of interarrival time 6 | def getMedian(vallist): 7 | vallist.sort(key = lambda val:val[0]) 8 | tot = 0 9 | cfreq = [] 10 | for val in vallist: 11 | tot += val[1] 12 | cfreq.append(tot) 13 | medianindex = tot / 2 14 | i = 0 15 | while medianindex > cfreq[i]: 16 | i += 1 17 | return vallist[i][0] 18 | 19 | #defines a superflow 20 | class SuperFlow(Flow.Flow): 21 | 22 | def __init__(self, fields): 23 | if fields == None: 24 | self.ip1 = None 25 | self.ip2 = None 26 | self.key = None 27 | self.n_packet1 = 0 28 | self.n_byte1 = 0 29 | self.t_start1 = 0 30 | self.t_end1 = 0 31 | self.t_interarrival1 = [] 32 | self.n_packet2 = 0 33 | self.n_byte2 = 0 34 | self.t_start2 = 0 35 | self.t_end2 = 0 36 | self.t_interarrival2 = [] 37 | else: 38 | self.ip1 = socket.inet_aton(fields[0]) 39 | self.ip2 = socket.inet_aton(fields[1]) 40 | self.key = self.ip1 + self.ip2 41 | self.n_packet1 = int(fields[2]) 42 | self.n_byte1 = int(fields[3]) 43 | self.t_start1 = float(fields[4]) 44 | self.t_end1 = float(fields[5]) 45 | self.t_interarrival1 = [(float(fields[6]),self.n_packet1)] 46 | self.n_packet2 = int(fields[7]) 47 | self.n_byte2 = int(fields[8]) 48 | self.t_start2 = float(fields[9]) 49 | self.t_end2 = float(fields[10]) 50 | self.t_interarrival2 = [(float(fields[11]),self.n_packet2)] 51 | 52 | #get median of interarrival time irrespective of direction 53 | def getInterArrivaltime(self): 54 | combined = self.t_interarrival1 + self.t_interarrival2 55 | if len(combined) > 0: 56 | return getMedian(combined) 57 | return 0 58 | 59 | #interarrival time for direction1(arbitrary) 60 | def getInterArrivaltime1(self): 61 | if len(self.t_interarrival1) > 0: 62 | return getMedian(self.t_interarrival1) 63 | return 0 64 | 65 | #interarrival time for direction2(arbitrary) 66 | def getInterArrivaltime2(self): 67 | if len(self.t_interarrival2) > 0: 68 | return getMedian(self.t_interarrival2) 69 | return 0 70 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/GenerateFlows.py: -------------------------------------------------------------------------------- 1 | from P2P_CONSTANTS import * 2 | from Packet import * 3 | from Flow import * 4 | import multiprocessing as MP 5 | import socket 6 | import gc 7 | import time 8 | 9 | ## module to read all the files in the data folder of the 10 | ## project, build flow data and store it in a file 11 | 12 | 13 | def generateFlow(filename, flow_data_dir, timegap, sem): 14 | 
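# Parse one quantized packet csv into Packet objects, group packets of the same
# IP pair that are at most 'timegap' seconds apart into flows, and write the flows
# to a same-named file under flow_data_dir ('sem' limits concurrent processes).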
sem.acquire() 15 | 16 | inputfile = open(filename) 17 | data = [line.strip() for line in inputfile] 18 | inputfile.close() 19 | 20 | packetlist = [] 21 | for eachline in data: 22 | fields = eachline.split(',') 23 | fields.pop(2) 24 | packetlist.append(Packet(fields)) 25 | 26 | outflowlist = packetsToFlows(packetlist, timegap) 27 | #print 'flows in ' + filename + ' : ' + str(len(outflowlist)) 28 | 29 | outfilename = flow_data_dir + (filename.split('/')[-1]) 30 | writeFlowsToFile(outflowlist, outfilename) 31 | 32 | #print 'done writing to : ' + outfilename 33 | #start_collect = time.time() 34 | #collected = gc.collect() 35 | #end_collect = time.time() 36 | #print "Time wasted on GC - GenerateFlows: %ss, collected %s objects"%(end_collect-start_collect, collected) 37 | sem.release() 38 | 39 | def runGenerateFlows(quantized_pcap_data_dir, flow_data_dir, n_processes, timegap): 40 | #create a semaphore so as not to exceed n_processes process limit 41 | sem = MP.Semaphore(n_processes) 42 | 43 | csvfiles = getCSVFiles(quantized_pcap_data_dir) 44 | 45 | tasklist = [] 46 | #generate flowdata from each input packet file(not pcap) in parallel and store it in a file 47 | #so we get as many output files as number of input files 48 | for filename in csvfiles: 49 | task = MP.Process(target = generateFlow, args = (filename, flow_data_dir, timegap, sem)) 50 | tasklist.append(task) 51 | 52 | print "Tasklist size = %s"%(len(tasklist)) 53 | 54 | # #execute commands in parallel 55 | for i in range(0, len(tasklist), n_processes): 56 | for k,task in enumerate(tasklist[i:i+n_processes]): 57 | tasklist[i+k].start() 58 | for k, task in enumerate(tasklist[i:i+n_processes]): 59 | tasklist[i+k].join() 60 | #print "Joined task number %s"%(i+k) -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/AllWebsiteAnalysis/generateFigures.py: -------------------------------------------------------------------------------- 1 | import os 2 | from decimal import Decimal 3 | import numpy as np 4 | import csv 5 | 6 | import matplotlib 7 | if os.environ.get('DISPLAY','') == '': 8 | print('no display found. 
Using non-interactive Agg backend') 9 | matplotlib.use('Agg') 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | colors = ["0.8", "0.6", "0.2", "0.0"] 14 | colors = ["salmon", "lightsteelblue", "darkseagreen", "thistle", "wheat", "khaki", "skyblue"] 15 | 16 | """ 17 | Attach a text label above each bar displaying its height 18 | """ 19 | def autolabel(rects, ax): 20 | for rect in rects: 21 | height = rect.get_height() 22 | ax.text(rect.get_x() + rect.get_width()/2., 1.005*height, # original height was 1.005*height 23 | "{0:.2f}".format(float(height)), fontsize=7, ha='center', va='bottom') 24 | 25 | 26 | def PlotNormalAccuracy(): 27 | print "Plotting accuracy for no-sketch" 28 | #Gather results for full distribution 29 | profile_data_full = open("classificationResults/AllVsAll.csv", 'rb') 30 | csv_reader_full = csv.reader(profile_data_full, delimiter=',') 31 | 32 | binWidth_full = [] 33 | acc_full = [] 34 | 35 | for n, row in enumerate(csv_reader_full): 36 | if(n == 0): 37 | continue 38 | binWidth_full.append(row[0]) 39 | acc_full.append(round(Decimal(float(row[1])), 4)) 40 | 41 | 42 | #Generate plot 43 | fig = plt.figure() 44 | ax1 = fig.add_subplot(111) 45 | 46 | print "Current feature set: "+ str(binWidth_full) 47 | print "ACC-Full: " + str(acc_full) 48 | 49 | ind = np.arange(len(binWidth_full)) # the x locations for the groups 50 | width = 0.4 51 | 52 | rects1 = ax1.bar(ind - width/2, acc_full, width, color=colors[0], label='Accuracy') 53 | autolabel(rects1,ax1) 54 | 55 | ax1.yaxis.grid(color='black', linestyle='dotted') 56 | ax1.set_title('Quantization effect on accuracy - WF Multiclass', fontsize = 10) 57 | 58 | ax1.set_xticks(ind) 59 | labels = ["K = " + x + "\nBins = " + str(3000/int(x)) for n, x in enumerate(binWidth_full)] 60 | ax1.set_xticklabels(labels) 61 | ax1.legend() 62 | 63 | ax1.set_ylabel('Accuracy') 64 | ax1.set_xlabel('Quantization') 65 | 66 | plt.xticks(fontsize=7) 67 | plt.tight_layout() 68 | plt.ylim(0, 1) 69 | fig.savefig('Figures/AllVsAll.pdf') # save the figure to file 70 | fig.savefig('Figures/AllVsAll.png') # save the figure to file 71 | plt.close(fig) 72 | profile_data_full.close() 73 | 74 | 75 | 76 | def GenerateFigures(): 77 | if not os.path.exists("Figures"): 78 | os.makedirs("Figures") 79 | 80 | PlotNormalAccuracy() -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/FilterPacketsHelper.py: -------------------------------------------------------------------------------- 1 | from P2P_CONSTANTS import * 2 | import os 3 | #return a list of filenames of pcapfiles taken from InputFiles.txt 4 | #if a directory is found then all *.pcap files in the directory are 5 | #included(non-recursive) 6 | 7 | def getPCapFileNames(): 8 | pcapInputFile = open(PCAPFILES) 9 | lines = [eachline.strip() for eachline in pcapInputFile] 10 | pcapInputFile.close() 11 | 12 | pcapfilenames = [] 13 | for eachline in lines: 14 | if eachline.endswith('.pcap'): 15 | if os.path.exists(eachline): 16 | pcapfilenames.append(eachline) 17 | else: 18 | print eachline + ' does not exist' 19 | exit() 20 | else: 21 | if os.path.isdir(eachline): 22 | for eachfile in os.listdir(eachline): 23 | if eachfile.endswith('.pcap'): 24 | pcapfilenames.append(eachline.rstrip('/') + '/' + eachfile) 25 | else: 26 | print eachline + ' is not a directory' 27 | exit() 28 | return pcapfilenames 29 | 30 | #return a list of options to be used with tshark 31 | def getTsharkOptions(): 32 | optionsFile = open(TSHARKOPTIONSFILE) 33 | options = 
[line.strip() for line in optionsFile] 34 | optionsFile.close() 35 | return options 36 | 37 | #return a tuple (x,y) where 38 | #x = complete tshark command 39 | #y = output csv filename 40 | def contructTsharkCommand(filename,tsharkOptions): 41 | command = 'tshark -r ' + filename + ' ' 42 | for eachstring in tsharkOptions: 43 | command = command + eachstring + ' ' 44 | 45 | #construct output filename 46 | outfilename = filename.split('/') 47 | outfilename = PCAPDATADIR + outfilename[len(outfilename)-1] + '.csv' 48 | 49 | command += '>'+outfilename 50 | return (command,outfilename) 51 | 52 | #remove missing tcp and udp payload lengths and subtract 53 | #8 bytes from udp payload to account for udp header 54 | #returns a list of strings to be printed 55 | def preprocess(data): 56 | outputdata = [] 57 | for eachline in data: 58 | fields = eachline.split(',') 59 | 60 | #sanity check for 6 fields. Has to be changed if tshark options are changed 61 | if len(fields) != 6: 62 | continue 63 | 64 | tcppayload = fields[4].strip() 65 | udppayload = fields[5].strip() 66 | 67 | #subtract udp header length 68 | if udppayload != '': 69 | fields[5] = str(int(udppayload) - UDP_HEADERLENGTH) 70 | if fields[5] == '0': 71 | continue 72 | #ignore packet if both tcp and udp payload lengths are null 73 | elif tcppayload == '' or tcppayload == '0': 74 | continue 75 | 76 | #add all valid fields to output list 77 | for eachfield in fields: 78 | if eachfield.strip() != '': 79 | outputdata.append(eachfield) 80 | outputdata.append(',') 81 | outputdata.pop() 82 | outputdata.append('\n') 83 | return outputdata -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/README.md: -------------------------------------------------------------------------------- 1 | PeerShark 2 | ============================ 3 | Peer-to-Peer botnet detection by tracking conversations 4 | 5 | ### Contributors 6 | * Pratik Narang 7 | * Subhajit Ray 8 | * Chittaranjan Hota 9 | 10 | ###Research papers: 11 | * Narang, P., Ray, S., Hota, C., & Venkatakrishnan, V. (2014, May). Peershark: detecting peer-to-peer botnets by tracking conversations. In Security and Privacy Workshops (SPW), 2014 IEEE (pp. 108-115). IEEE. 12 | * Narang, P., Hota, C., & Venkatakrishnan, V. N. (2014). PeerShark: flow-clustering and conversation-generation for malicious peer-to-peer traffic identification. EURASIP Journal on Information Security, 2014(1), 1-12. 13 | 14 | PeerShark requires Python v2.7.* and Tshark installed, and has been tested only for Linux environment. 15 | 16 | Modules to be used in the following order: 17 | 18 | 1. FilterPackets.py : Take inputdir or input files from PCAPFILES. 19 | The module runs tshark on each file in inputdir and extracts the 20 | fields mention in TsharkOptions.txt such as src-ip,dst-ip, 21 | protocol, payload length. One new file is created per pcap file 22 | which contains only the fields we want for future analysis. The 23 | new files are stored in PCAPDATADIR. 24 | 25 | usage : python FilterPackets.py 26 | 27 | 2. GenerateFlows.py : Take each file from PCAPDATADIR -> generate 28 | flow information -> store processed data for each file in 29 | FLOWDATADIR. 30 | 31 | usage : python GenerateFlows.py 32 | 33 | 3. generateSuperFlows.py : Take each file from FLOWDATADIR -> merge 34 | flows into superflows based on input parameters -> store in 35 | SUPERFLOWDATADIR. 
36 | 37 | usage: python generateSuperFlows.py start(in hrs) increment(in hrs) end(in hrs) 38 | 39 | Number of files generated = (end - start)/increment 40 | 41 | One file is generated for each value of timegap ranging from start to end. 42 | 43 | ####OPTIONAL: 44 | 45 | 46 | 4. createTrainingData.py: use this file to create labelled training data set. 47 | It reads *folders* (not files) residing in SUPERFLOWDATADIR, and creates *one* 48 | labelled file (weka style minus the header) per folder (with required attributes only- 49 | no. of pkts, no. of bytes, iat, duration, label) with the folder name appended as last column. 50 | 51 | After generating a labelled 'training dataset', supervised machine learning algorithms 52 | can be used to generate models for P2P botnet detection. 53 | 54 | 55 | ####Flow structure 56 | 57 | `IP1, IP2, #Packets1, #Bytes1, tFlowStart1, tFlowEnd1, MedianIPT1, #Packets2, #Bytes2, tFlowStart2, tFlowEnd2, MedianIPT2,` 58 | 59 | **Example** 60 | `4.79.17.248,192.168.58.137,3,126,1234920043.252418,1234920049.917001,4.326552,450,18900,1234920045.127448,1234920069.383826,0.000068` 61 | 62 | 63 | ####Superflow structure 64 | 65 | `IP1, IP2, #Packets, #Bytes, MedianIPT, tFlowStart, tFlowEnd, tDuration` 66 | 67 | **Example** 68 | `4.68.25.2, 192.168.58.150, 2, 86, 0.000000, 1234978436.632683, 1234978436.632683, 0.000000` 69 | 70 | ####Training data structure 71 | 72 | `#Packets, #Bytes, MedianIPT, tDuration, label` 73 | 74 | **Example** 75 | `16,672,0.000051,2.108578,Waledac` 76 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/quantize.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import multiprocessing as MP 4 | import socket 5 | import gc 6 | import time 7 | 8 | def RoundToNearest(n, m): 9 | r = n % m 10 | return n + m - r if r + r >= m else n - r 11 | 12 | STORM_IPS = [ 13 | "66.154.80.101", 14 | "66.154.80.105", 15 | "66.154.80.111", 16 | "66.154.80.125", 17 | "66.154.83.107", 18 | "66.154.83.113", 19 | "66.154.83.138", 20 | "66.154.83.80", 21 | "66.154.87.39", 22 | "66.154.87.41", 23 | "66.154.87.57", 24 | "66.154.87.58", 25 | "66.154.87.61" 26 | ] 27 | 28 | WALEDAC_IPS = [ 29 | "192.168.58.136", 30 | "192.168.58.137", 31 | "192.168.58.150" 32 | ] 33 | 34 | 35 | def runQuantization(dataset, traffic_capture, binWidth, ipt_bin_width, sem): 36 | sem.acquire() 37 | 38 | cap_file = open(dataset + "/" + traffic_capture, 'rb') 39 | csv_reader = csv.reader(cap_file, delimiter=',') 40 | 41 | quantized_csv = open('FeatureSets/' + os.path.basename(dataset) + "/" + traffic_capture[:-4] + "_" + str(binWidth) + "_" + str(ipt_bin_width) + ".csv", "w") 42 | 43 | malicious_ips = [] 44 | if(os.path.basename(dataset) == "Storm"): 45 | malicious_ips = STORM_IPS 46 | elif(os.path.basename(dataset) == "Waledac"): 47 | malicious_ips = WALEDAC_IPS 48 | 49 | #print "Malicious IPs = %s"%(malicious_ips) 50 | #print os.path.basename(dataset) 51 | 52 | to_write = [] 53 | #Write modified packets 54 | for row in csv_reader: 55 | #Filter out non-malicious flows from Storm and Waledac datasets 56 | if(("Storm" in os.path.basename(dataset) or "Waledac" in os.path.basename(dataset)) and (row[0] not in malicious_ips and row[1] not in malicious_ips)): 57 | #print "Row not in malicious: %s - %s"%(row[0], row[1]) 58 | continue 59 | else: 60 | new_row = row 61 | 62 | #Quantize packet size 63 | new_row[4] = str(RoundToNearest(int(new_row[4]), binWidth)) 64 | 65 
| #Quantize Timestamp 66 | if(ipt_bin_width > 0): 67 | new_row[3] = str(RoundToNearest(int(float(new_row[3])), ipt_bin_width)) 68 | to_write.append(",".join(new_row)) 69 | 70 | quantized_csv.write("\n".join(to_write)) 71 | 72 | cap_file.close() 73 | quantized_csv.close() 74 | 75 | #start_collect = time.time() 76 | #collected = gc.collect() 77 | #end_collect = time.time() 78 | #print "Time wasted on GC - Quantize: %ss, collected %s objects"%(end_collect-start_collect, collected) 79 | sem.release() 80 | 81 | 82 | def QuantizeDataset(dataset, binWidth, ipt_bin_width, n_processes): 83 | sem = MP.Semaphore(n_processes) 84 | traffic_captures = os.listdir(dataset) 85 | 86 | tasklist = [] 87 | 88 | for traffic_capture in traffic_captures: 89 | task = MP.Process(target = runQuantization, args = (dataset, traffic_capture, binWidth, ipt_bin_width, sem)) 90 | tasklist.append(task) 91 | 92 | print "Tasklist size = %s"%(len(tasklist)) 93 | 94 | # #execute commands in parallel 95 | for i in range(0, len(tasklist), n_processes): 96 | for k,task in enumerate(tasklist[i:i+n_processes]): 97 | tasklist[i+k].start() 98 | for k, task in enumerate(tasklist[i:i+n_processes]): 99 | tasklist[i+k].join() 100 | #print "Joined task number %s"%(i+k) 101 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/AllWebsiteAnalysis/ParsingUtilities/CSVParseToSimulateHerrman.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime, timedelta 4 | from collections import defaultdict, OrderedDict 5 | 6 | def RoundToNearest(n, m): 7 | if (m == 1): 8 | return n 9 | if (n > 0): 10 | r = n % m 11 | return n + m - r if r + r >= m else n - r 12 | else: 13 | if (n < 0): 14 | return RoundToNearest(abs(n), m) * -1 15 | return 0 16 | 17 | def main(argv): 18 | name = str(argv[0]) 19 | BASE_DIR = os.path.dirname(name) 20 | file = open(name,'r') 21 | binWidth = int(argv[1]) 22 | 23 | trainSet = open(BASE_DIR+"/TrainSet_" + str(binWidth) + ".csv", 'w') 24 | testSet = open(BASE_DIR+"/TestSet_" + str(binWidth) + ".csv", 'w') 25 | 26 | minBucket = RoundToNearest(-1500, binWidth) 27 | maxBucket = RoundToNearest(1500, binWidth) + 1 28 | for size in range(minBucket, maxBucket, binWidth): 29 | trainSet.write("packetLengthBin_" + str(size) + ", ") 30 | testSet.write("packetLengthBin_" + str(size) + ", ") 31 | trainSet.write("class\n") 32 | testSet.write("class\n") 33 | 34 | i = 0 35 | 36 | TFlineToWrite = [] 37 | CNlineToWrite = [] 38 | 39 | 40 | lineToWrite = OrderedDict() 41 | 42 | l = file.readline() 43 | l = file.readline() 44 | l = file.readline() 45 | l.rstrip('\n') 46 | 47 | lineNumber = 0 48 | while l: 49 | lineSplit = l.split(" ") 50 | if (lineNumber % 2 == 0): 51 | timestamp = lineSplit[2] 52 | else: 53 | website = lineSplit[0][:-1] 54 | lineToWrite[website+"|"+timestamp] = {} 55 | lineToWrite[website+"|"+timestamp] = defaultdict(lambda:0, lineToWrite[website+"|"+timestamp]) 56 | t = lineToWrite[website+"|"+timestamp] 57 | for x in lineSplit[1:]: 58 | try: 59 | t[str(RoundToNearest(int(x), binWidth))] += 1 60 | except: 61 | continue 62 | lineToWrite[website+"|"+timestamp] = t 63 | lineNumber += 1 64 | l = file.readline() 65 | l.rstrip('\n') 66 | 67 | max = 4 68 | max2 = max + 4 69 | counter = 0 70 | currentWebSite = "" 71 | for j in lineToWrite: 72 | if (currentWebSite != j.split("|")[0]): 73 | counter = 0 74 | 75 | currentWebSite = j.split("|")[0] 76 | 77 | if (counter < max): 78 | for 
s in range(minBucket, maxBucket, binWidth): 79 | trainSet.write(str(lineToWrite[j][str(s)]) + ", ") 80 | trainSet.write(currentWebSite + "\n") 81 | if (counter == 0): 82 | firstTimeStamp = datetime.strptime(j.split("|")[1], "%Y-%m-%d#%H:%M:%S") 83 | secondTimeStamp = firstTimeStamp + timedelta(days=8) 84 | counter += 1 85 | else: 86 | if (datetime.strptime(j.split("|")[1], "%Y-%m-%d#%H:%M:%S") < secondTimeStamp): 87 | lineToWrite[j] = {} 88 | continue 89 | if (counter < max2): 90 | for s in range(minBucket, maxBucket, binWidth): 91 | testSet.write(str(lineToWrite[j][str(s)]) + ", ") 92 | testSet.write(currentWebSite + "\n") 93 | counter += 1 94 | 95 | lineToWrite[j] = {} 96 | 97 | if __name__ == "__main__": 98 | main(sys.argv[1:]) 99 | 100 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/AllWebsiteAnalysis/ParsingUtilities/CSVParseWebsiteUnbalanced.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import sys 4 | from datetime import datetime, timedelta 5 | import collections 6 | import math 7 | from collections import defaultdict, OrderedDict 8 | import numpy as np 9 | 10 | def RoundToNearest(n, m): 11 | if (m == 1): 12 | return n 13 | if (n > 0): 14 | r = n % m 15 | return n + m - r if r + r >= m else n - r 16 | else: 17 | if (n < 0): 18 | return RoundToNearest(abs(n), m) * -1 19 | return 0 20 | 21 | def extractDistributionWithoutTruncation(argv): 22 | BASE_DIR = os.path.dirname(argv[0]) 23 | file = open(argv[0],'r') 24 | 25 | binWidth = int(argv[1]) 26 | websiteToClassify = argv[2] 27 | 28 | if not os.path.exists(BASE_DIR + "/" + websiteToClassify): 29 | os.makedirs(BASE_DIR + "/" + websiteToClassify) 30 | 31 | trainSet = open(BASE_DIR + "/" + websiteToClassify + "/TrainSet_" + str(binWidth) + ".csv", 'w') 32 | testSet = open(BASE_DIR + "/" + websiteToClassify + "/TestSet_" + str(binWidth) + ".csv", 'w') 33 | 34 | 35 | #Set for all possible quantized buckets 36 | binsUsedByWebsite = set() 37 | minBucket = RoundToNearest(-1500, binWidth) 38 | maxBucket = RoundToNearest(1500, binWidth) + 1 39 | for size in range(minBucket, maxBucket, binWidth): 40 | binsUsedByWebsite.add(RoundToNearest(size, binWidth)) 41 | 42 | 43 | websiteTrainInstances = int(argv[3]) 44 | websiteTestInstances = int(argv[4]) 45 | 46 | ################################################ 47 | #Build csv with quantized bins 48 | ################################################ 49 | 50 | # Write CSV datasets header (with bins used by the target website) 51 | for size in range(minBucket, maxBucket, binWidth): 52 | if (size in binsUsedByWebsite): 53 | trainSet.write("packetLengthBin_" + str(size) + ", ") 54 | testSet.write("packetLengthBin_" + str(size) + ", ") 55 | trainSet.write("class\n") 56 | testSet.write("class\n") 57 | 58 | 59 | file = open(argv[0],'r') 60 | l = file.readline() #Take out dataset header 61 | l = file.readline() #Take out dataset header 62 | trainCounter = 0 63 | testCounter = 0 64 | currWebsite = "" 65 | trainData = [] 66 | testData =[] 67 | 68 | for lineNumber, l in enumerate(file.readlines()): 69 | lineSplit = l.rstrip('\n').split(" ") 70 | if (lineNumber % 2 == 1): #Gather website data 71 | website = lineSplit[0][:-1] 72 | if(website != currWebsite): 73 | currWebsite = website 74 | trainCounter = 0 75 | testCounter = 0 76 | 77 | #Build container for sample distribution 78 | website_bin_distribution = OrderedDict() 79 | for i in sorted(binsUsedByWebsite): 80 | 
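#Start every quantized packet-length bin at a zero count for this sample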
website_bin_distribution[i] = 0 81 | 82 | #Add useful bins to the sample distribution 83 | for packet_size in lineSplit[1:-1]: 84 | packet_size_binned = RoundToNearest(int(packet_size), binWidth) 85 | if(packet_size_binned in binsUsedByWebsite): 86 | website_bin_distribution[packet_size_binned] += 1 87 | 88 | 89 | if(trainCounter < websiteTrainInstances): 90 | bin_list = [] 91 | for i in website_bin_distribution: 92 | bin_list.append(str(website_bin_distribution[i])) 93 | trainData.append(",".join(bin_list) + ", " + currWebsite + "\n") 94 | trainCounter += 1 95 | elif(testCounter < websiteTestInstances): 96 | bin_list = [] 97 | for i in website_bin_distribution: 98 | bin_list.append(str(website_bin_distribution[i])) 99 | testData.append(",".join(bin_list) + ", " + currWebsite + "\n") 100 | #Account for processed sample 101 | testCounter += 1 102 | 103 | trainSet.write("".join(trainData)) 104 | testSet.write("".join(testData)) 105 | trainSet.close() 106 | testSet.close() 107 | 108 | if __name__ == "__main__": 109 | extractDistributionWithoutTruncation(sys.argv[1:]) 110 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/AllWebsiteAnalysis/runExperiment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import subprocess as sub 4 | import shutil 5 | import time 6 | import weka.core.jvm as jvm 7 | import weka.core.converters as converters 8 | from weka.core.converters import Loader 9 | from weka.classifiers import Classifier 10 | from weka.classifiers import Evaluation 11 | 12 | 13 | dataset_location = "Data/openssh.data" 14 | 15 | #export JAVA_HOME=/Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home/ 16 | def ClassifyParam(mode, binWidths): 17 | if not os.path.exists("classificationResults"): 18 | os.makedirs("classificationResults") 19 | 20 | if("normal" in mode): 21 | file = open("classificationResults/AllVsAll.csv","w") 22 | 23 | file.write("BinWidth, Accuracy\n") 24 | 25 | for binWidth in binWidths: 26 | 27 | train_set = "Data/arff/TrainSet_%s.arff"%(binWidth) 28 | test_set = "Data/arff/TestSet_%s.arff"%(binWidth) 29 | print "Loading Datasets..." 30 | 31 | train_data = converters.load_any_file(train_set) 32 | test_data = converters.load_any_file(test_set) 33 | #Set class attribute 34 | train_data.class_is_last() 35 | test_data.class_is_last() 36 | print "Dataset Loaded!" 37 | 38 | 39 | classifier_name = "weka.classifiers.meta.FilteredClassifier" 40 | 41 | classifier = Classifier(classname=classifier_name, options=[ 42 | "-F", "weka.filters.unsupervised.attribute.StringToWordVector -R first-last -W 1000 -C -T -N 1 -stemmer weka.core.stemmers.NullStemmer -M 1 -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r\\\\n\\\\t.,;:\\\\\\\'\\\\\\\"()?!\\\"\"", 43 | "-W", "weka.classifiers.bayes.NaiveBayesMultinomial"]) 44 | 45 | 46 | start_train = time.time() 47 | classifier.build_classifier(train_data) 48 | end_train = time.time() 49 | print "Train\t%s\t%s"%(binWidth, end_train-start_train) 50 | 51 | for index, inst in enumerate(test_data): 52 | if(index == 0): 53 | start_sample = time.time() 54 | classifier.classify_instance(inst) 55 | end_sample = time.time() 56 | print "Sample\t%s\t%s"%(binWidth, end_sample-start_sample) 57 | 58 | print "Evaluating w/ Multinomial Naive Bayes classifier. 
BinWidth = %s"%(binWidth) 59 | evaluation = Evaluation(test_data) 60 | start_batch = time.time() 61 | evaluation.test_model(classifier, test_data) 62 | end_batch = time.time() 63 | print "Batch\t%s\t%s"%(binWidth,end_batch-start_batch) 64 | 65 | 66 | print evaluation.summary() 67 | acc = evaluation.percent_correct/100.0 68 | print "Percent correct: " + str(acc) 69 | 70 | file.write("%s, %s\n"%(binWidth, acc)) 71 | file.close() 72 | 73 | 74 | 75 | def QuantizeAndCreateTrainTestDataset(binWidths): 76 | #2/3 train, 1/3 test (150 total, 100 -50) 77 | # Currently 50-50 78 | target_train_instances = 75 79 | target_test_instances = 75 80 | 81 | #Placeholder website for parsing script to work (compatibility issues) 82 | website = "www.flickr.com" 83 | 84 | for binWidth in binWidths: 85 | simArgs = "python ParsingUtilities/CSVParseWebsiteUnbalanced.py %s %s %s %s %s"%(dataset_location, binWidth, website, target_train_instances, target_test_instances) 86 | print "Quantizing dataset. binWidth = %s"%(binWidth) 87 | sub.call(simArgs, shell = True) 88 | 89 | print "Moving files to Data directory root" 90 | src_folder = "Data/www.flickr.com/" 91 | files = os.listdir(src_folder) 92 | for f in files: 93 | shutil.move(src_folder+f, "Data/") 94 | os.rmdir(src_folder) 95 | 96 | 97 | def BuildQuantizedArffDatasets(mode, binWidths): 98 | if not os.path.exists("Data/arff"): 99 | os.makedirs("Data/arff") 100 | 101 | if("normal" in mode): 102 | train_set = "TrainSet" 103 | test_set = "TestSet" 104 | 105 | for binWidth in binWidths: 106 | simArgs = "python ParsingUtilities/CSVParseToWeka.py Data/%s_%s.csv Data/arff/%s_%s.arff"%(train_set, binWidth, train_set, binWidth) 107 | print "Generating train dataset. binWidth = %s"%(binWidth) 108 | sub.call(simArgs, shell = True) 109 | 110 | simArgs = "python ParsingUtilities/CSVParseToWeka.py Data/%s_%s.csv Data/arff/%s_%s.arff"%(test_set, binWidth, test_set, binWidth) 111 | print "Generating test dataset. binWidth = %s"%(binWidth) 112 | sub.call(simArgs, shell = True) 113 | 114 | 115 | 116 | 117 | if __name__ == "__main__": 118 | 119 | #Quantization 120 | BIN_WIDTH = [1, 4, 8, 16, 32, 64, 128, 256] 121 | 122 | QuantizeAndCreateTrainTestDataset(BIN_WIDTH) 123 | 124 | 125 | BuildQuantizedArffDatasets("normal", BIN_WIDTH) 126 | 127 | #Classify 128 | #Start WEKA execution 129 | jvm.start(max_heap_size="4096m") 130 | 131 | #Classify 132 | ClassifyParam("normal", BIN_WIDTH) 133 | 134 | #stop weka execution 135 | jvm.stop() 136 | 137 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/FacetAnalysis/runExperiment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import subprocess as sub 4 | 5 | from generateFeatures import CompressFeatures, SplitDataset, CompressFeaturesBasedOnTrainData, MergeTestData, ExtractPacketSample 6 | from classifier import GenerateFeatureImportanceBasedOnTrainData, ClassifyTestDataBasedOnModel, BuildModelBasedOnTrainData 7 | from generateFigures import GenerateFigures 8 | from online_sketching import CreateBinaryVectorRepresentation 9 | from compressive_ta import CreateCompressiveRepresentation 10 | 11 | 12 | def Experiment(BIN_WIDTH, TOPK, DELTAS, MEMORY_FACTORS, DATASET_SPLIT, COVERT_FLOWS_PERC, N_FLOWS, ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO): 13 | """ 14 | Phase 1a) 15 | Use full information and generate the best buckets. 16 | Datasets are split into half. 
17 | 18 | We use the first half to train/test a classifier with a balanced dataset in HoldOut 90/10 19 | """ 20 | CompressFeatures(BIN_WIDTH, [TOPK[-1]]) 21 | SplitDataset(DATASET_SPLIT, N_FLOWS, 1) 22 | GenerateFeatureImportanceBasedOnTrainData("normal", BIN_WIDTH, [TOPK[-1]]) 23 | 24 | """ 25 | Phase 1b) 26 | Quantize, truncate and classify according to the best buckets found 27 | The first half of each dataset is again used for train/test the classifier with a balanced dataset in HoldOut 90/10 28 | However, only the top-K bins are used for performing classification 29 | 30 | The built model is saved to use in Phase 2. 31 | """ 32 | CompressFeaturesBasedOnTrainData(BIN_WIDTH, TOPK[:-1]) 33 | SplitDataset(DATASET_SPLIT, N_FLOWS, COVERT_FLOWS_PERC) 34 | BuildModelBasedOnTrainData("normal", BIN_WIDTH, TOPK) 35 | 36 | 37 | """ 38 | Phase 2 39 | Classify new flows using quantized/truncated distributions using the previously built model 40 | The second half of each dataset is used for train/test the classifier with an unbalanced dataset 41 | """ 42 | #Quantization + Truncation without sketches 43 | ClassifyTestDataBasedOnModel("normal", BIN_WIDTH, TOPK, N_FLOWS) 44 | 45 | #Generate figures 46 | GenerateFigures(BIN_WIDTH, TOPK, DELTAS, MEMORY_FACTORS, N_FLOWS) 47 | 48 | """ 49 | Online Sketching - Coskun et al. 50 | """ 51 | CreateBinaryVectorRepresentation(BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE) 52 | BuildModelBasedOnTrainData("online", BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE) 53 | ClassifyTestDataBasedOnModel("online", BIN_WIDTH, [TOPK[-1]], N_FLOWS, ONLINE_SKETCH_SIZE) 54 | 55 | """ 56 | Compressive TA adjusted to packet distribution 57 | """ 58 | CreateCompressiveRepresentation("compressive_gaussian", BIN_WIDTH, [TOPK[-1]], SIGMA_PARAM, COMPRESSIVE_RATIO) 59 | BuildModelBasedOnTrainData("compressive_gaussian", BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 60 | ClassifyTestDataBasedOnModel("compressive_gaussian", BIN_WIDTH, [TOPK[-1]], N_FLOWS, ONLINE_SKETCH_SIZE, DELTAS, MEMORY_FACTORS, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 61 | 62 | CreateCompressiveRepresentation("compressive_bernoulli", BIN_WIDTH, [TOPK[-1]], SIGMA_PARAM, COMPRESSIVE_RATIO) 63 | BuildModelBasedOnTrainData("compressive_bernoulli", BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 64 | ClassifyTestDataBasedOnModel("compressive_bernoulli", BIN_WIDTH, [TOPK[-1]], N_FLOWS, ONLINE_SKETCH_SIZE, DELTAS, MEMORY_FACTORS, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 65 | 66 | 67 | if __name__ == "__main__": 68 | 69 | #Quantization 70 | BIN_WIDTH = [1, 4, 8, 16, 32, 64, 128, 256] 71 | 72 | #Truncation Top-K features 73 | TOPK = [5, 10, 20, 30, 40, 50, 1500] 74 | 75 | #Online Sketch 76 | ONLINE_SKETCH_SIZE = [64, 128, 256, 512, 1024, 2048] 77 | 78 | #Proportion of regular flows to input in sketch 79 | COVERT_FLOWS_PERC = 1 80 | 81 | #Proportion to split training phase (1) and testing phase (2) 82 | DATASET_SPLIT = 0.5 83 | 84 | #Total amount of flows per dataset 85 | N_FLOWS = 1000 86 | 87 | #Standard deviation of Gaussian distribution (compressive TA) 88 | SIGMA_PARAM = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100] 89 | 90 | #Number of packets to compute compressive TA representation 91 | NUMBER_OF_PACKETS = [1000, 2000, 4000] 92 | 93 | #Compression Ratio for Compressive TA 94 | COMPRESSIVE_RATIO = [4, 8, 16, 32, 64, 128, 256] 95 | 96 | #Deprecated 97 | DELTAS = [0.95] 98 | MEMORY_FACTORS = [8, 4, 2, 1] 99 | 100 | #Run 
Experiment: 101 | Experiment(BIN_WIDTH, TOPK, DELTAS, MEMORY_FACTORS, DATASET_SPLIT, COVERT_FLOWS_PERC, N_FLOWS, ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 102 | 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/DeltaShaperAnalysis/runExperiment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import subprocess as sub 4 | 5 | from generateFeatures import CompressFeatures, SplitDataset, CompressFeaturesBasedOnTrainData, MergeTestData, ExtractPacketSample 6 | from classifier import GenerateFeatureImportanceBasedOnTrainData, ClassifyTestDataBasedOnModel, BuildModelBasedOnTrainData 7 | from generateFigures import GenerateFigures, GenerateFiguresLines 8 | from online_sketching import CreateBinaryVectorRepresentation 9 | from compressive_ta import CreateCompressiveRepresentation 10 | 11 | def Experiment(BIN_WIDTH, TOPK, DELTAS, MEMORY_FACTORS, DATASET_SPLIT, COVERT_FLOWS_PERC, N_FLOWS, ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO): 12 | """ 13 | Phase 1a) 14 | Use full information and generate the best buckets. 15 | Datasets are split into half. 16 | 17 | We use the first half to train/test a classifier with a balanced dataset in HoldOut 90/10 18 | """ 19 | CompressFeatures(BIN_WIDTH, [TOPK[-1]]) 20 | SplitDataset(DATASET_SPLIT, N_FLOWS, 1) 21 | GenerateFeatureImportanceBasedOnTrainData("normal", BIN_WIDTH, [TOPK[-1]]) 22 | 23 | """ 24 | Phase 1b) 25 | Quantize, truncate and classify according to the best buckets found 26 | The first half of each dataset is again used for train/test the classifier with a balanced dataset in HoldOut 90/10 27 | However, only the top-K bins are used for performing classification 28 | 29 | The built model is saved to use in Phase 2. 30 | """ 31 | CompressFeaturesBasedOnTrainData(BIN_WIDTH, TOPK[:-1]) 32 | SplitDataset(DATASET_SPLIT, N_FLOWS, COVERT_FLOWS_PERC) 33 | BuildModelBasedOnTrainData("normal", BIN_WIDTH, TOPK) 34 | 35 | """ 36 | Phase 2 37 | Classify new flows using quantized/truncated distributions using the previously built model 38 | The second half of each dataset is used for train/test the classifier with an unbalanced dataset 39 | """ 40 | #Quantization + Truncation 41 | ClassifyTestDataBasedOnModel("normal", BIN_WIDTH, TOPK, N_FLOWS, ONLINE_SKETCH_SIZE) 42 | 43 | #Generate figures 44 | GenerateFiguresLines(BIN_WIDTH, TOPK, DELTAS, MEMORY_FACTORS, N_FLOWS) 45 | 46 | 47 | """ 48 | Online Sketching - Coskun et al. 
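Re-encodes the quantized flow features as fixed-size binary vectors (CreateBinaryVectorRepresentation); vector sizes are taken from ONLINE_SKETCH_SIZE.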
49 | """ 50 | CreateBinaryVectorRepresentation(BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE) 51 | BuildModelBasedOnTrainData("online", BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE) 52 | ClassifyTestDataBasedOnModel("online", BIN_WIDTH, [TOPK[-1]], N_FLOWS, ONLINE_SKETCH_SIZE) 53 | 54 | 55 | """ 56 | Compressive TA adjusted to packet distribution 57 | """ 58 | CreateCompressiveRepresentation("compressive_gaussian", BIN_WIDTH, [TOPK[-1]], SIGMA_PARAM, COMPRESSIVE_RATIO) 59 | BuildModelBasedOnTrainData("compressive_gaussian", BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 60 | ClassifyTestDataBasedOnModel("compressive_gaussian", BIN_WIDTH, [TOPK[-1]], N_FLOWS, ONLINE_SKETCH_SIZE, DELTAS, MEMORY_FACTORS, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 61 | 62 | CreateCompressiveRepresentation("compressive_bernoulli", BIN_WIDTH, [TOPK[-1]], SIGMA_PARAM, COMPRESSIVE_RATIO) 63 | BuildModelBasedOnTrainData("compressive_bernoulli", BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 64 | ClassifyTestDataBasedOnModel("compressive_bernoulli", BIN_WIDTH, [TOPK[-1]], N_FLOWS, ONLINE_SKETCH_SIZE, DELTAS, MEMORY_FACTORS, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 65 | 66 | 67 | if __name__ == "__main__": 68 | 69 | #Quantization 70 | BIN_WIDTH = [1, 4, 8, 16, 32, 64, 128, 256] 71 | 72 | #Truncation Top-K features 73 | TOPK = [5, 10, 20, 30, 40, 50, 1500] 74 | 75 | #Online Sketch 76 | ONLINE_SKETCH_SIZE = [64, 128, 256, 512, 1024, 2048] 77 | 78 | #Proportion of regular flows to input in sketch 79 | COVERT_FLOWS_PERC = 1 80 | 81 | #Proportion to split training phase (1) and testing phase (2) 82 | DATASET_SPLIT = 0.5 83 | 84 | #Total amount of flows per dataset 85 | N_FLOWS = 300 86 | 87 | #Standard deviation of Gaussian distribution (compressive TA) 88 | SIGMA_PARAM = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100] 89 | 90 | #Number of packets to compute compressive TA representation 91 | NUMBER_OF_PACKETS = [1000, 2000, 4000] 92 | 93 | #Compression Ratio for Compressive TA 94 | COMPRESSIVE_RATIO = [4, 8, 16, 32, 64, 128, 256] 95 | 96 | #Deprecated 97 | DELTAS = [0.95] 98 | MEMORY_FACTORS = [8, 4, 2, 1] 99 | 100 | #Run Experiment: 101 | Experiment(BIN_WIDTH, TOPK, DELTAS, MEMORY_FACTORS, DATASET_SPLIT, COVERT_FLOWS_PERC, N_FLOWS, ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 102 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/SingleWebsiteAnalysis/runExperiment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import subprocess as sub 4 | import time 5 | import weka.core.jvm as jvm 6 | import weka.core.converters as converters 7 | from weka.core.converters import Loader 8 | from weka.classifiers import Classifier 9 | from weka.classifiers import Evaluation 10 | 11 | from generateFigures import GenerateFigures 12 | 13 | dataset_location = "Data/openssh.data" 14 | 15 | #export JAVA_HOME=/Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home/ 16 | 17 | def ClassifyParam(website, mode, binWidths, truncation_modes=["full", "truncated"]): 18 | if not os.path.exists("classificationResults"): 19 | os.makedirs("classificationResults") 20 | 21 | 22 | if("normal" in mode): 23 | for truncation in truncation_modes: 24 | file = open("classificationResults/SingleWebsite_%s_%s.csv"%(truncation, website),"w") 25 | 
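# One results csv per (website, truncation mode); each row holds the metrics measured for a single bin width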
file.write("BinWidth, Accuracy, FalsePositiveRate, FalseNegativeRate\n") 26 | 27 | for binWidth in binWidths: 28 | 29 | train_set_file = "TrainSet_%s_%s.arff"%(truncation, binWidth) 30 | train_set = "Data/%s/arff/%s"%(website, train_set_file) 31 | test_set = "Data/%s/arff/%s"%(website, train_set_file.replace("TrainSet", "TestSet")) 32 | 33 | print "Loading Datasets..." 34 | print "Train: " + train_set 35 | train_data = converters.load_any_file(train_set) 36 | print "Test: " + test_set 37 | test_data = converters.load_any_file(test_set) 38 | 39 | #Set class attribute 40 | train_data.class_is_last() 41 | test_data.class_is_last() 42 | print "Dataset Loaded!" 43 | 44 | 45 | classifier_name = "weka.classifiers.meta.FilteredClassifier" 46 | 47 | classifier = Classifier(classname=classifier_name, options=[ 48 | "-F", "weka.filters.unsupervised.attribute.StringToWordVector -R first-last -W 1000 -C -T -N 1 -stemmer weka.core.stemmers.NullStemmer -M 1 -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r\\\\n\\\\t.,;:\\\\\\\'\\\\\\\"()?!\\\"\"", 49 | "-W", "weka.classifiers.bayes.NaiveBayesMultinomial"]) 50 | 51 | start_train = time.time() 52 | classifier.build_classifier(train_data) 53 | end_train = time.time() 54 | print "Train\t%s\t%s"%(binWidth, end_train-start_train) 55 | 56 | for index, inst in enumerate(test_data): 57 | if(index == 0): 58 | start_sample = time.time() 59 | classifier.classify_instance(inst) 60 | end_sample = time.time() 61 | print "Sample\t%s\t%s"%(binWidth, end_sample-start_sample) 62 | 63 | print "Evaluating w/ Multinomial Naive Bayes classifier. BinWidth = %s"%(binWidth) 64 | evaluation = Evaluation(test_data) 65 | start_batch = time.time() 66 | evaluation.test_model(classifier, test_data) 67 | end_batch = time.time() 68 | print "Batch\t%s\t%s"%(binWidth,end_batch-start_batch) 69 | 70 | 71 | print evaluation.summary() 72 | print evaluation.matrix() 73 | #Just as an example, we're measuring the fpr and fnr of the website indexed as class 1 74 | 75 | tp = evaluation.num_true_positives(1) 76 | tn = evaluation.num_true_negatives(1) 77 | fp = evaluation.num_false_positives(1) 78 | fn = evaluation.num_false_negatives(1) 79 | 80 | acc = (tp+tn)/float(tp+tn+fp+fn) 81 | fpr = evaluation.false_positive_rate(1) 82 | fnr = evaluation.false_negative_rate(1) 83 | 84 | print "Accuracy: %s"%(acc) 85 | print "False Positive Rate: %s"%(fpr) 86 | print "False Negative Rate: %s"%(fnr) 87 | 88 | file.write("%s, %s, %s, %s\n"%(binWidth, acc, fpr, fnr)) 89 | file.close() 90 | 91 | 92 | 93 | def QuantizeAndCreateUnbalancedTrainTestDataset(truncate, website, binWidths): 94 | #2/3 train, 1/3 test (150 total, 100 -50) 95 | target_train_instances = 75 96 | target_test_instances = 75 97 | 98 | if(truncate): 99 | truncation = 0 100 | 101 | #Init bookeeping of truncated bins 102 | if not os.path.exists("truncationInfo"): 103 | os.makedirs("truncationInfo") 104 | file = open("truncationInfo/" + website + ".csv", "w") 105 | file.write("BinWidth, TruncatedBins\n") 106 | file.close() 107 | else: 108 | truncation = 1 109 | 110 | for binWidth in binWidths: 111 | simArgs = "python ParsingUtilities/CSVParseWebsiteUnbalanced.py %s %s %s %s %s %s"%(dataset_location, binWidth, website, target_train_instances, target_test_instances, truncation) 112 | print "Quantizing dataset. 
binWidth = %s"%(binWidth) + ", truncation = " + str(truncate) + ", website = " + website 113 | sub.call(simArgs, shell = True) 114 | 115 | 116 | 117 | def BuildQuantizedArffDatasets(website, mode): 118 | if not os.path.exists("Data/%s/arff"%(website)): 119 | os.makedirs("Data/%s/arff"%(website)) 120 | 121 | if("normal" in mode): 122 | for f in os.listdir("Data/%s"%(website)): 123 | if(".csv" in f and not f.startswith("CountMin")): 124 | 125 | simArgs = "python ParsingUtilities/CSVParseToWeka.py Data/%s/%s Data/%s/arff/%s %s"%(website, f, website, f[:-3] + "arff", website) 126 | print "Generating dataset. File = " + f[:-3] + "arff" 127 | sub.call(simArgs, shell = True) 128 | 129 | 130 | if __name__ == "__main__": 131 | modes = ["normal", "sketch"] 132 | 133 | TRUNCATION_MODES = [True, False] 134 | 135 | #Quantization 136 | BIN_WIDTH = [1, 4, 8, 16, 32, 64, 128, 256] 137 | 138 | WEBSITES = [ 139 | "www.citibank.de", 140 | "mail.google.com", 141 | "www.youtube.com", 142 | "www.amazon.com", 143 | "www.imdb.com", 144 | "www.flickr.com" 145 | ] 146 | 147 | jvm.start(max_heap_size="4096m") 148 | for website in WEBSITES: 149 | for truncate in TRUNCATION_MODES: 150 | # Generates the train and test dataset 151 | #Proportion should be set inside this function 152 | QuantizeAndCreateUnbalancedTrainTestDataset(truncate, website, BIN_WIDTH) 153 | 154 | BuildQuantizedArffDatasets(website, "normal") 155 | 156 | """#Delete raw datasets 157 | for file in os.listdir("Data/" + website): 158 | if(file.endswith(".csv")): 159 | os.remove("Data/" + website + "/" + file)""" 160 | 161 | #Classify 162 | ClassifyParam(website, "normal", BIN_WIDTH) 163 | 164 | """#Delete arff datasets 165 | for file in os.listdir("Data/"): 166 | if(file.endswith(".arff")): 167 | os.remove("Data/" + file)""" 168 | 169 | #Generate figures 170 | GenerateFigures() 171 | jvm.stop() 172 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/Flow.py: -------------------------------------------------------------------------------- 1 | from Packet import * 2 | 3 | #input: list of packets, timegap - real number 4 | #return val: list of flows 5 | # 6 | #merges collection of packets(objects) into collection of flows(many-to-one) 7 | #Working: group packets with same ip-pair(direction irrelevant) and merge all packets for 8 | #which |packet1.time - packet2.time| < threshold(timegap) 9 | def packetsToFlows(packets,timegap): 10 | #sanity check for 0 packets 11 | if len(packets) == 0: 12 | return None 13 | 14 | outputflows = [] 15 | 16 | #perform a radix-sort to group together packets 17 | #with same ip-pairs(packet.key represents an ip-pair) 18 | #and sort these packets according to timestamp 19 | packets.sort(key = lambda packet:packet.timestamp) 20 | packets.sort(key = lambda packet:packet.key) 21 | 22 | nextflow = Flow(None) 23 | for nextpacket in packets: 24 | #if ip-pairs dont match or time-difference of prev and current packet greater 25 | #than timegap, create a new flow 26 | if (nextflow.key != nextpacket.key) or ((nextpacket.timestamp - nextflow.getEnd()) > timegap): 27 | nextflow = Flow(nextpacket) 28 | outputflows.append(nextflow) 29 | #if not then add packet to previous flow 30 | else: 31 | nextflow.addPacket(nextpacket) 32 | 33 | return outputflows 34 | 35 | #same as function packetsToFlow but merges flows instead of packets 36 | def combineFlows(flows, flowgap): 37 | if len(flows) == 0: 38 | return None 39 | 40 | outputflows = [] 41 | 42 | flows.sort(key = 
lambda flow:flow.getStart()) 43 | flows.sort(key = lambda flow:flow.key) 44 | 45 | nextoutflow = Flow(None) 46 | for nextflow in flows: 47 | if (nextoutflow.key != nextflow.key) or ((nextflow.getStart() - nextoutflow.getEnd()) > flowgap): 48 | nextoutflow = nextflow 49 | outputflows.append(nextoutflow) 50 | else: 51 | nextoutflow.addFlow(nextflow) 52 | 53 | return outputflows 54 | 55 | def getCustomWeightedAvg(n1, w1, n2, w2): 56 | num = 0 57 | den = 0 58 | if w1 > 0: 59 | num += w1 * n1 60 | den += w1 61 | if w2 > 0: 62 | num += w2 * n2 63 | den += w2 64 | if den <= 0: 65 | den = 1 66 | return num / den 67 | 68 | 69 | #write list of flows into file in desired format 70 | def writeFlowsToFile(flowlist, filename): 71 | outfile = open(filename, 'w') 72 | 73 | to_write = [] 74 | for flow in flowlist: 75 | to_write.append( 76 | socket.inet_ntoa(flow.ip1) + ',' + 77 | socket.inet_ntoa(flow.ip2) + ',' + 78 | str(flow.n_packet1) + ',' + 79 | str(flow.n_byte1) + ',' + 80 | '%.6f'%flow.t_start1 + ',' + 81 | '%.6f'%flow.t_end1 + ',' + 82 | '%.6f'%flow.getInterArrivaltime1() + ',' + 83 | str(flow.n_packet2) + ',' + 84 | str(flow.n_byte2) + ',' + 85 | '%.6f'%flow.t_start2 + ',' + 86 | '%.6f'%flow.t_end2 + ',' + 87 | '%.6f'%flow.getInterArrivaltime2()) 88 | 89 | outfile.write("\n".join(to_write)) 90 | outfile.close() 91 | 92 | #class which defines the structure of flows 93 | class Flow: 94 | #constructor of default flow 95 | def __init__(self,firstpacket): 96 | if firstpacket == None: 97 | self.ip1 = None 98 | self.ip2 = None 99 | self.key = None 100 | self.n_packet1 = 0 101 | self.n_byte1 = 0 102 | self.t_start1 = 0 103 | self.t_end1 = 0 104 | self.t_interarrival1 = [] 105 | self.n_packet2 = 0 106 | self.n_byte2 = 0 107 | self.t_start2 = 0 108 | self.t_end2 = 0 109 | self.t_interarrival2 = [] 110 | else: 111 | if firstpacket.source < firstpacket.dest: 112 | self.ip1 = firstpacket.source 113 | self.ip2 = firstpacket.dest 114 | self.n_packet1 = 1 115 | self.n_byte1 = firstpacket.size 116 | self.t_start1 = firstpacket.timestamp 117 | self.t_end1 = firstpacket.timestamp 118 | self.t_interarrival1 = [] 119 | self.n_packet2 = 0 120 | self.n_byte2 = 0 121 | self.t_start2 = 0 122 | self.t_end2 = 0 123 | self.t_interarrival2 = [] 124 | else: 125 | self.ip1 = firstpacket.dest 126 | self.ip2 = firstpacket.source 127 | self.n_packet1 = 0 128 | self.n_byte1 = 0 129 | self.t_start1 = 0 130 | self.t_end1 = 0 131 | self.t_interarrival1 = [] 132 | self.n_packet2 = 1 133 | self.n_byte2 = firstpacket.size 134 | self.t_start2 = firstpacket.timestamp 135 | self.t_end2 = firstpacket.timestamp 136 | self.t_interarrival2 = [] 137 | self.key = firstpacket.key 138 | 139 | #add a flow to the current flow (by changing volume and duration) 140 | def addFlow(self,flow): 141 | self.t_interarrival1 += flow.t_interarrival1 142 | self.t_interarrival2 += flow.t_interarrival2 143 | self.n_packet1 += flow.n_packet1 144 | self.n_packet2 += flow.n_packet2 145 | self.n_byte1 += flow.n_byte1 146 | self.n_byte2 += flow.n_byte2 147 | 148 | temp = min(self.t_start1,flow.t_start1) 149 | if temp == 0: 150 | self.t_start1 = self.t_start1 + flow.t_start1 151 | else: 152 | self.t_start1 = temp 153 | 154 | temp = min(self.t_start2,flow.t_start2) 155 | if temp == 0: 156 | self.t_start2 = self.t_start2 + flow.t_start2 157 | else: 158 | self.t_start2 = temp 159 | 160 | if(self.t_end1 < flow.t_end1): 161 | self.t_end1 = flow.t_end1 162 | if(self.t_end2 < flow.t_end2): 163 | self.t_end2 = flow.t_end2 164 | 165 | #add a packet to the current flow (by 
changing volume and duration) 166 | def addPacket(self,packet): 167 | if packet.source == self.ip1 and packet.dest == self.ip2: 168 | 169 | #initialize flow if not initialized 170 | if self.n_packet1 == 0: 171 | self.t_start1 = packet.timestamp 172 | self.t_end1 = packet.timestamp 173 | self.n_packet1 += 1 174 | self.n_byte1 += packet.size 175 | return 176 | 177 | if self.t_end1 < packet.timestamp: 178 | self.t_interarrival1.append(packet.timestamp-self.t_end1) 179 | self.t_end1 = packet.timestamp 180 | elif self.t_start1 > packet.timestamp: 181 | self.t_interarrival1.append(self.t_start1-packet.timestamp) 182 | self.t_start1 = packet.timestamp 183 | self.n_packet1 += 1 184 | self.n_byte1 += packet.size 185 | 186 | elif packet.source == self.ip2 and packet.dest == self.ip1: 187 | 188 | #initialize flow if not initialized 189 | if self.n_packet2 == 0: 190 | self.t_start2 = packet.timestamp 191 | self.t_end2 = packet.timestamp 192 | self.n_packet2 += 1 193 | self.n_byte2 += packet.size 194 | return 195 | 196 | if self.t_end2 < packet.timestamp: 197 | self.t_interarrival2.append(packet.timestamp-self.t_end2) 198 | self.t_end2 = packet.timestamp 199 | elif self.t_start2 > packet.timestamp: 200 | self.t_interarrival2.append(self.t_start2-packet.timestamp) 201 | self.t_start2 = packet.timestamp 202 | self.n_packet2 += 1 203 | self.n_byte2 += packet.size 204 | 205 | else: 206 | raise Exception('packet does not belong to flow') 207 | 208 | def getDurationInSeconds(self): 209 | return self.getEnd() - self.getStart() 210 | 211 | def getInterArrivaltime(self): 212 | combined = (self.t_interarrival1+self.t_interarrival2).sort() 213 | if len(combined) > 0: 214 | return combined[len(combined)/2] 215 | return 0 216 | 217 | def getInterArrivaltime1(self): 218 | self.t_interarrival1.sort() 219 | if len(self.t_interarrival1) > 0: 220 | return self.t_interarrival1[len(self.t_interarrival1)/2] 221 | return 0 222 | 223 | def getInterArrivaltime2(self): 224 | self.t_interarrival2.sort() 225 | if len(self.t_interarrival2) > 0: 226 | return self.t_interarrival2[len(self.t_interarrival2)/2] 227 | return 0 228 | 229 | def getNoOfBytes(self): 230 | return self.n_byte1 + self.n_byte2 231 | 232 | def getNoOfPackets(self): 233 | return self.n_packet1 + self.n_packet2 234 | 235 | def getStart(self): 236 | temp = min(self.t_start1, self.t_start2) 237 | if temp == 0: 238 | return self.t_start1 + self.t_start2 239 | else: 240 | return temp 241 | 242 | def getEnd(self): 243 | return max(self.t_end1, self.t_end2) 244 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/runExperiment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import math 4 | import subprocess as sub 5 | import shutil 6 | import csv 7 | import numpy as np 8 | import multiprocessing as MP 9 | import time 10 | 11 | import gc 12 | 13 | import warnings 14 | warnings.filterwarnings("ignore", category=FutureWarning) 15 | 16 | from sklearn.metrics import accuracy_score, confusion_matrix 17 | from sklearn.model_selection import train_test_split, StratifiedKFold 18 | from sklearn.ensemble import RandomForestClassifier 19 | from joblib import dump, load 20 | 21 | from peershark.GenerateFlows import runGenerateFlows 22 | from peershark.generateSuperFlows import runGenerateSuperFlows 23 | from peershark.createTrainingData import runTrainingDataGenerator 24 | from quantize import QuantizeDataset 25 | 26 | data_location = "Data/" 
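# Note on the metrics computed in Classify() below: with
# confusion_matrix(y_true, y_pred, labels=[neg, pos]).ravel(), the second label is
# treated as the positive class and the flattened 2x2 matrix is (TN, FP, FN, TP), so
#   precision = TP / (TP + FP)
#   recall    = TP / (TP + FN)
#   FPR       = FP / (FP + TN)
# Calling confusion_matrix twice with the label order swapped yields the per-class
# (benign and malicious) scores appended to classificationResults/results.csv.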
27 | 28 | 29 | def Classify(binWidth, ipt_bin_width): 30 | dataset_path = 'TrainingData/Datasets/Dataset_%s_%s.csv'%(binWidth, ipt_bin_width) 31 | with open(dataset_path, 'rb') as dataset_file: 32 | print "Loading Dataset: %s ..."%(dataset_path) 33 | 34 | attributes = [] 35 | labels = [] 36 | csv_reader = csv.reader(dataset_file) 37 | for n, row in enumerate(csv_reader): 38 | if(n == 0): 39 | continue 40 | else: 41 | attributes.append(row[:-1]) 42 | labels.append(row[-1]) 43 | 44 | #Split data in 66% train, 33% test 45 | train_x, test_x, train_y, test_y = train_test_split(attributes, labels, test_size=0.33, random_state=42, stratify=labels) 46 | 47 | #Define classifier 48 | classifier = RandomForestClassifier(random_state=42) 49 | 50 | #Train classifier 51 | #start_train = time.time() 52 | model = classifier.fit(np.asarray(train_x), np.asarray(train_y)) 53 | #end_train = time.time() 54 | #print "Model trained in %ss"%(end_train-start_train) 55 | 56 | #for sample in test_x: 57 | # start_sample = time.time() 58 | # model.predict(np.asarray(sample).reshape((1,-1))) 59 | # end_sample = time.time() 60 | # print "Sample predicted in %ss"%(end_sample-start_sample) 61 | 62 | #Perform predictions 63 | print "Predicting %s samples"%(len(test_x)) 64 | #start_batch = time.time() 65 | predictions = model.predict(np.asarray(test_x)) 66 | #end_batch = time.time() 67 | #print "Batch predicted in %ss"%(end_batch-start_batch) 68 | 69 | #Generate metrics (benign) 70 | TN, FP, FN, TP = confusion_matrix(np.asarray(test_y), predictions, labels=["malicious","benign"]).ravel() 71 | FPR_BENIGN = float(FP)/(float(FP)+float(TN)) 72 | RECALL_BENIGN = float(TP)/(float(TP) + float(FN)) 73 | PRECISION_BENIGN = float(TP)/(float(TP) + float(FP)) 74 | 75 | print "Model Precision (benign): " + "{0:.3f}".format(PRECISION_BENIGN) 76 | print "Model Recall (benign): " + "{0:.3f}".format(RECALL_BENIGN) 77 | print "Model FPR (benign): " + "{0:.3f}".format(FPR_BENIGN) 78 | 79 | 80 | #Generate metrics (malicious) 81 | TN, FP, FN, TP = confusion_matrix(np.asarray(test_y), predictions, labels=["benign","malicious"]).ravel() 82 | FPR_MALICIOUS = float(FP)/(float(FP)+float(TN)) 83 | RECALL_MALICIOUS = float(TP)/(float(TP) + float(FN)) 84 | PRECISION_MALICIOUS = float(TP)/(float(TP) + float(FP)) 85 | 86 | print "Model Precision (malicious): " + "{0:.3f}".format(PRECISION_MALICIOUS) 87 | print "Model Recall (malicious): " + "{0:.3f}".format(RECALL_MALICIOUS) 88 | print "Model FPR (malicious): " + "{0:.3f}".format(FPR_MALICIOUS) 89 | 90 | results_file = open("classificationResults/results.csv","a") 91 | results_file.write("%s, %s, %s, %s, %s, %s, %s, %s\n"%(binWidth, ipt_bin_width, "{0:.3f}".format(PRECISION_BENIGN), "{0:.3f}".format(RECALL_BENIGN), "{0:.3f}".format(FPR_BENIGN), "{0:.3f}".format(PRECISION_MALICIOUS), "{0:.3f}".format(RECALL_MALICIOUS), "{0:.3f}".format(FPR_MALICIOUS))) 92 | results_file.flush() 93 | results_file.close() 94 | print "" 95 | 96 | 97 | def GenerateDataset(datasets, binWidth, ipt_bin_width): 98 | if not os.path.exists('TrainingData/Datasets'): 99 | os.makedirs('TrainingData/Datasets') 100 | 101 | datasets_to_merge = [] 102 | for dataset in datasets: 103 | dataset = os.path.basename(dataset) 104 | datasets_to_merge.append('TrainingData/%s/trainingdata_%s_%s.csv'%(dataset, binWidth, ipt_bin_width)) 105 | 106 | #Merge datasets in a single file 107 | with open('TrainingData/Datasets/Dataset_%s_%s.csv'%(binWidth, ipt_bin_width), "w") as out_dataset: 108 | 
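# The merged dataset keeps the four PeerShark features per superflow (number of
# packets, bytes transmitted, median inter-packet time, conversation duration) and
# relabels each row by origin: P2PTraffic becomes "benign", Storm and Waledac
# become "malicious".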
out_dataset.write("NumberOfPackets,TotalBytesTransmitted,MedianIPT,ConversationDuration,class\n") 109 | for fname in datasets_to_merge: 110 | with open(fname, 'rb') as infile: 111 | csv_reader = csv.reader(infile) 112 | for row in csv_reader: 113 | new_row = row 114 | if(row[4] == "P2PTraffic"): 115 | new_row[4] = "benign" 116 | else: 117 | new_row[4] = "malicious" 118 | out_dataset.write(",".join(new_row) + "\n") 119 | 120 | 121 | def RunPeerShark(quantized_pcap_data_dir, flow_data_dir, super_flow_data_dir, training_data_dir, bin_width, ipt_bin_width): 122 | #create a semaphore so as not to exceed threadlimit 123 | n_processes = 4 124 | 125 | #Set TIMEGAP 126 | timegap = 2000 127 | 128 | print "Generating Flows with TIMEGAP = %s"%(timegap) 129 | runGenerateFlows(quantized_pcap_data_dir, flow_data_dir, n_processes, timegap) 130 | 131 | #Set FLOWGAP in seconds 132 | flowgap = 3600 133 | 134 | print "Generating SuperFlows with FLOWGAP = %s"%(flowgap) 135 | runGenerateSuperFlows(flow_data_dir, super_flow_data_dir, flowgap) 136 | 137 | print "Generating Training Data..." 138 | runTrainingDataGenerator(super_flow_data_dir, training_data_dir, bin_width, ipt_bin_width) 139 | 140 | 141 | def Experiment(datasets, bin_width, ipt_bin_width): 142 | 143 | if not os.path.exists('FeatureSets'): 144 | os.makedirs('FeatureSets') 145 | 146 | #Quantize datasets according to bin width 147 | #Generate training sets for quantization 148 | for dataset in datasets: 149 | quantized_pcap_data_dir = 'FeatureSets/' + os.path.basename(dataset) + "/" 150 | flow_data_dir = 'FlowData/' + os.path.basename(dataset) + "/" 151 | superflow_data_dir = 'SuperFlowData/' + os.path.basename(dataset) + "/" 152 | training_data_dir = 'TrainingData/' + os.path.basename(dataset) + "/" 153 | 154 | if not os.path.exists('FeatureSets/' + os.path.basename(dataset)): 155 | os.makedirs('FeatureSets/' + os.path.basename(dataset)) 156 | 157 | if not os.path.exists('FlowData/' + os.path.basename(dataset)): 158 | os.makedirs('FlowData/' + os.path.basename(dataset)) 159 | 160 | if not os.path.exists('SuperFlowData/' + os.path.basename(dataset)): 161 | os.makedirs('SuperFlowData/' + os.path.basename(dataset)) 162 | 163 | if not os.path.exists('TrainingData/' + os.path.basename(dataset)): 164 | os.makedirs('TrainingData/' + os.path.basename(dataset)) 165 | 166 | 167 | print "Quantizing %s with BinWidth = %s and IPT_BinWidth = %s"% (dataset, binWidth, ipt_bin_width) 168 | n_processes = 4 169 | QuantizeDataset(dataset, bin_width, ipt_bin_width, n_processes) 170 | RunPeerShark(quantized_pcap_data_dir, flow_data_dir, superflow_data_dir, training_data_dir, bin_width, ipt_bin_width) 171 | 172 | print "Building Dataset..." 173 | GenerateDataset(datasets, binWidth, ipt_bin_width) 174 | 175 | print "Performing Classification..." 
176 | Classify(binWidth, ipt_bin_width) 177 | 178 | start_collect = time.time() 179 | collected = gc.collect() 180 | end_collect = time.time() 181 | print "Time wasted on GC - Classification: %ss, collected %s objects"%(end_collect-start_collect, collected) 182 | 183 | shutil.rmtree('FeatureSets') 184 | shutil.rmtree('FlowData') 185 | shutil.rmtree('SuperFlowData') 186 | shutil.rmtree('TrainingData') 187 | 188 | 189 | 190 | if __name__ == "__main__": 191 | 192 | DATASETS = [ 193 | data_location + "Waledac", 194 | data_location + "Storm", 195 | data_location + "P2PTraffic" 196 | ] 197 | 198 | ### 199 | #The following parameters are now fed by the fullRun.sh shell script 200 | # Please run fullRun.sh instead of this file directly 201 | ### 202 | 203 | #Quantization (packet size) 204 | #BIN_WIDTH = [1, 16, 32, 64, 128, 256] 205 | 206 | #Quantization (IPT in seconds) 207 | #TIMEGAP IS 2000s, FLOWGAP IS 3600s 208 | #IPT_BIN_WIDTH = [0, 1, 10, 60, 300, 900] 209 | 210 | if not os.path.exists("classificationResults"): 211 | os.makedirs("classificationResults") 212 | results_file = open("classificationResults/results.csv","a+") 213 | results_file.write("BinWidth, IPT_BinWidth, Precision_Benign, Recall_Benign, FalsePositiveRate_Benign, Precision_Malicious, Recall_Malicious, FalsePositiveRate_Malicious\n") 214 | results_file.flush() 215 | results_file.close() 216 | 217 | 218 | binWidth = int(sys.argv[1]) 219 | ipt_bin_width = int(sys.argv[2]) 220 | 221 | print "Starting experiment with Bin width %s and IPT Bin Width %s"%(binWidth, ipt_bin_width) 222 | start_time = time.time() 223 | Experiment(DATASETS, binWidth, ipt_bin_width) 224 | end_time = time.time() 225 | time_elapsed_seconds = end_time - start_time 226 | print "Experiment finished in %sh\n"%("{0:.2f}".format(time_elapsed_seconds/60.0/60.0)) 227 | 228 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/SingleWebsiteAnalysis/ParsingUtilities/CSVParseWebsiteUnbalanced.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import sys 4 | from datetime import datetime, timedelta 5 | import collections 6 | import math 7 | from collections import defaultdict, OrderedDict 8 | import numpy as np 9 | 10 | def RoundToNearest(n, m): 11 | if (m == 1): 12 | return n 13 | if (n > 0): 14 | r = n % m 15 | return n + m - r if r + r >= m else n - r 16 | else: 17 | if (n < 0): 18 | return RoundToNearest(abs(n), m) * -1 19 | return 0 20 | 21 | def extractDistributionWithoutTruncation(argv): 22 | BASE_DIR = os.path.dirname(argv[0]) 23 | file = open(argv[0],'r') 24 | 25 | binWidth = int(argv[1]) 26 | websiteToClassify = argv[2] 27 | 28 | if not os.path.exists(BASE_DIR + "/" + websiteToClassify): 29 | os.makedirs(BASE_DIR + "/" + websiteToClassify) 30 | 31 | trainSet = open(BASE_DIR + "/" + websiteToClassify + "/TrainSet_full_" + str(binWidth) + ".csv", 'w') 32 | testSet = open(BASE_DIR + "/" + websiteToClassify + "/TestSet_full_" + str(binWidth) + ".csv", 'w') 33 | 34 | 35 | #Set for all possible quantized buckets 36 | binsUsedByWebsite = set() 37 | minBucket = RoundToNearest(-1500, binWidth) 38 | maxBucket = RoundToNearest(1500, binWidth) + 1 39 | for size in range(minBucket, maxBucket, binWidth): 40 | binsUsedByWebsite.add(RoundToNearest(size, binWidth)) 41 | 42 | 43 | websiteTrainInstances = int(argv[3]) 44 | websiteTestInstances = int(argv[4]) 45 | 46 | ################################################ 47 | #Build csv with quantized 
bins 48 | ################################################ 49 | 50 | # Write CSV datasets header (with bins used by the target website) 51 | for size in range(minBucket, maxBucket, binWidth): 52 | if (size in binsUsedByWebsite): 53 | trainSet.write("packetLengthBin_" + str(size) + ", ") 54 | testSet.write("packetLengthBin_" + str(size) + ", ") 55 | trainSet.write("class\n") 56 | testSet.write("class\n") 57 | 58 | 59 | file = open(argv[0],'r') 60 | l = file.readline() #Take out dataset header 61 | l = file.readline() #Take out dataset header 62 | trainCounter = 0 63 | testCounter = 0 64 | currWebsite = "" 65 | trainData = [] 66 | testData =[] 67 | 68 | for lineNumber, l in enumerate(file.readlines()): 69 | lineSplit = l.rstrip('\n').split(" ") 70 | if (lineNumber % 2 == 1): #Gather website data 71 | website = lineSplit[0][:-1] 72 | if(website != currWebsite): 73 | currWebsite = website 74 | trainCounter = 0 75 | testCounter = 0 76 | 77 | #Build container for sample distribution 78 | website_bin_distribution = OrderedDict() 79 | for i in sorted(binsUsedByWebsite): 80 | website_bin_distribution[i] = 0 81 | 82 | #Add useful bins to the sample distribution 83 | for packet_size in lineSplit[1:-1]: 84 | packet_size_binned = RoundToNearest(int(packet_size), binWidth) 85 | if(packet_size_binned in binsUsedByWebsite): 86 | website_bin_distribution[packet_size_binned] += 1 87 | 88 | 89 | if(trainCounter < websiteTrainInstances): 90 | bin_list = [] 91 | for i in website_bin_distribution: 92 | bin_list.append(str(website_bin_distribution[i])) 93 | trainData.append(",".join(bin_list) + ", " + currWebsite + "\n") 94 | trainCounter += 1 95 | elif(testCounter < websiteTestInstances): 96 | bin_list = [] 97 | for i in website_bin_distribution: 98 | bin_list.append(str(website_bin_distribution[i])) 99 | testData.append(",".join(bin_list) + ", " + currWebsite + "\n") 100 | #Account for processed sample 101 | testCounter += 1 102 | 103 | trainSet.write("".join(trainData)) 104 | testSet.write("".join(testData)) 105 | trainSet.close() 106 | testSet.close() 107 | 108 | 109 | def extractDistributionWithTruncation(argv): 110 | BASE_DIR = os.path.dirname(argv[0]) 111 | file = open(argv[0],'r') 112 | 113 | binWidth = int(argv[1]) 114 | websiteToClassify = argv[2] 115 | 116 | if not os.path.exists(BASE_DIR + "/" + websiteToClassify): 117 | os.makedirs(BASE_DIR + "/" + websiteToClassify) 118 | 119 | trainSet = open(BASE_DIR + "/" + websiteToClassify + "/TrainSet_truncated_" + str(binWidth) + ".csv", 'w') 120 | testSet = open(BASE_DIR + "/" + websiteToClassify + "/TestSet_truncated_" + str(binWidth) + ".csv", 'w') 121 | 122 | 123 | websiteTrainInstances = int(argv[3]) 124 | websiteTestInstances = int(argv[4]) 125 | 126 | trainInstancesCounter = 0 127 | binsUsedByWebsite = set() 128 | minBucket = RoundToNearest(-1500, binWidth) 129 | maxBucket = RoundToNearest(1500, binWidth) + 1 130 | 131 | ################################################ 132 | #Gather list of quantized buckets used by the target website in the training set (1st pass) 133 | ################################################ 134 | 135 | l = file.readline() #Take out dataset header 136 | l = file.readline() #Take out dataset header 137 | for lineNumber, l in enumerate(file.readlines()): 138 | lineSplit = l.rstrip('\n').split(" ") 139 | if (lineNumber % 2 == 1): #Gather website data 140 | website = lineSplit[0][:-1] 141 | if (website == websiteToClassify): 142 | if(trainInstancesCounter < websiteTrainInstances): 143 | for packet_size in lineSplit[1:-1]: 144 | 
binsUsedByWebsite.add(RoundToNearest(int(packet_size), binWidth)) 145 | trainInstancesCounter += 1 146 | else: 147 | break #We've analysed all training websiteToClassify samples 148 | 149 | 150 | #Get to know the amount of buckets used for measuring a given website 151 | print "Total number of buckets: " + str(int(math.floor(3000.0/binWidth))) 152 | print "Number of buckets after truncation: " + str(len(binsUsedByWebsite)) 153 | #Write these stats to a file 154 | file = open("truncationInfo/" + websiteToClassify + ".csv", "a") 155 | file.write("%s, %s\n"%(binWidth, len(binsUsedByWebsite))) 156 | file.close() 157 | 158 | ################################################ 159 | #Build csv with truncated bins (2nd pass) 160 | ################################################ 161 | 162 | # Write CSV datasets header (with bins used by the target website) 163 | for size in range(minBucket, maxBucket, binWidth): 164 | if (size in binsUsedByWebsite): 165 | trainSet.write("packetLengthBin_" + str(size) + ", ") 166 | testSet.write("packetLengthBin_" + str(size) + ", ") 167 | trainSet.write("class\n") 168 | testSet.write("class\n") 169 | 170 | 171 | file = open(argv[0],'r') 172 | l = file.readline() #Take out dataset header 173 | l = file.readline() #Take out dataset header 174 | trainCounter = 0 175 | testCounter = 0 176 | currWebsite = "" 177 | trainData = [] 178 | testData =[] 179 | 180 | for lineNumber, l in enumerate(file.readlines()): 181 | lineSplit = l.rstrip('\n').split(" ") 182 | if (lineNumber % 2 == 1): #Gather website data 183 | website = lineSplit[0][:-1] 184 | if(website != currWebsite): 185 | currWebsite = website 186 | trainCounter = 0 187 | testCounter = 0 188 | 189 | #Build container for sample distribution 190 | website_bin_distribution = OrderedDict() 191 | for i in sorted(binsUsedByWebsite): 192 | website_bin_distribution[i] = 0 193 | 194 | #Add useful bins to the sample distribution 195 | for packet_size in lineSplit[1:-1]: 196 | packet_size_binned = RoundToNearest(int(packet_size), binWidth) 197 | if(packet_size_binned in binsUsedByWebsite): 198 | website_bin_distribution[packet_size_binned] += 1 199 | 200 | 201 | if(trainCounter < websiteTrainInstances): 202 | bin_list = [] 203 | for i in website_bin_distribution: 204 | bin_list.append(str(website_bin_distribution[i])) 205 | trainData.append(",".join(bin_list) + ", " + currWebsite + "\n") 206 | trainCounter += 1 207 | elif(testCounter < websiteTestInstances): 208 | bin_list = [] 209 | for i in website_bin_distribution: 210 | bin_list.append(str(website_bin_distribution[i])) 211 | testData.append(",".join(bin_list) + ", " + currWebsite + "\n") 212 | #Account for processed sample 213 | testCounter += 1 214 | 215 | trainSet.write("".join(trainData)) 216 | testSet.write("".join(testData)) 217 | trainSet.close() 218 | testSet.close() 219 | 220 | if __name__ == "__main__": 221 | if (int(sys.argv[-1]) == 1): 222 | extractDistributionWithoutTruncation(sys.argv[1:-1]) 223 | else: 224 | extractDistributionWithTruncation(sys.argv[1:-1]) -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/FacetAnalysis/online_sketching.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | 4 | 5 | def CreateBinaryVectorRepresentation(BIN_WIDTH, TOPK, SKETCH_SIZE): 6 | 7 | for sketch_size in SKETCH_SIZE: 8 | for binWidth in BIN_WIDTH: 9 | for topk in TOPK: 10 | 11 | """ 12 | Generate random base vectors 13 | """ 14 | 15 
| if(topk != 1500): 16 | real_bucket_number = topk 17 | else: 18 | real_bucket_number = 1500/binWidth 19 | 20 | random_base_vectors = [] 21 | for i in range(0, sketch_size): 22 | random_base_vector = (2*np.random.randint(0,2,size=(real_bucket_number))-1) 23 | random_base_vectors.append(random_base_vector) 24 | 25 | n_bits = range(0, sketch_size) 26 | 27 | """ 28 | Process Phase 1 Data 29 | """ 30 | 31 | feature_set = 'PL_60_' + str(binWidth) + '_' + str(topk) 32 | data_folder = 'FeatureSets/' + feature_set + '/' 33 | 34 | #Regular Traffic 35 | print "Online_Sketch: Phase 1, Regular - " + feature_set + "/Online_regularTraffic_phase1_" + str(sketch_size) + "_dataset.csv" 36 | output = open(data_folder + "Online_regularTraffic_phase1_" + str(sketch_size) + "_dataset.csv", "w") 37 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r') 38 | reader = csv.reader(f, delimiter=',') 39 | 40 | #Process data row 41 | for n, row in enumerate(reader): 42 | if(n == 0): 43 | output.write(",".join(str(x) for x in n_bits) + "\n") 44 | else: 45 | #Gather the packet vector array (v_f) 46 | packet_count_vector = [] 47 | for i in row[:-1]: 48 | packet_count_vector.append(int(i)) 49 | 50 | #Compute the integer array (c_f) 51 | integer_array = [] 52 | for i in range(0, sketch_size): 53 | c_f_i = 0 54 | for j in range(0, real_bucket_number): 55 | #print "Random_base_vector: " + str(random_base_vectors[i]) 56 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j] 57 | integer_array.append(c_f_i) 58 | 59 | #Compute the binary array (s_f) 60 | binary_array = [] 61 | for i in integer_array: 62 | if(i <= 0): 63 | binary_array.append(0) 64 | else: 65 | binary_array.append(1) 66 | 67 | #print "Binary array: " + str(binary_array) 68 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n") 69 | output.close() 70 | 71 | 72 | #Facet Traffic 73 | print "Online_Sketch: Phase 1, Facet - " + feature_set + "/Online_facetTraffic_phase1_" + str(sketch_size) + "_dataset.csv" 74 | output = open(data_folder + "Online_facetTraffic_phase1_" + str(sketch_size) + "_dataset.csv", "w") 75 | f = open(data_folder + "FacetTraffic_50_phase1_dataset.csv", 'r') 76 | reader = csv.reader(f, delimiter=',') 77 | 78 | #Process data row 79 | for n, row in enumerate(reader): 80 | if(n == 0): 81 | output.write(",".join(str(x) for x in n_bits) + "\n") 82 | else: 83 | #Gather the packet vector array (v_f) 84 | packet_count_vector = [] 85 | for i in row[:-1]: 86 | packet_count_vector.append(int(i)) 87 | 88 | #Compute the integer array (c_f) 89 | integer_array = [] 90 | for i in range(0, sketch_size): 91 | c_f_i = 0 92 | for j in range(0, real_bucket_number): 93 | #print "Random_base_vector: " + str(random_base_vectors[i]) 94 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j] 95 | integer_array.append(c_f_i) 96 | 97 | #Compute the binary array (s_f) 98 | binary_array = [] 99 | for i in integer_array: 100 | if(i <= 0): 101 | binary_array.append(0) 102 | else: 103 | binary_array.append(1) 104 | 105 | #print "Binary array: " + str(binary_array) 106 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n") 107 | output.close() 108 | 109 | ######################################################################################## 110 | ######################################################################################## 111 | ######################################################################################## 112 | 113 | 114 | """ 115 | Process Phase 2 Data 116 | """ 117 | 118 | 
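# Phase 2 reuses the random +/-1 base vectors drawn above, so the flows used to build
# the model (phase 1) and the flows classified later (phase 2) are mapped into the
# same projected space. For each flow, the loops below compute the binary sketch
# sign(R · v) from its per-bucket packet-count vector v. A minimal numpy equivalent
# (illustrative only; R and v are stand-ins for the random_base_vectors matrix and
# one data row):
#
#   R = 2 * np.random.randint(0, 2, size=(sketch_size, real_bucket_number)) - 1
#   s = (R.dot(v) > 0).astype(int)   # 1 where the projection is positive, else 0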
feature_set = 'PL_60_' + str(binWidth) + '_' + str(topk) 119 | data_folder = 'FeatureSets/' + feature_set + '/' 120 | 121 | #Regular Traffic 122 | print "Online_Sketch: Phase 2, Regular - " + feature_set + "/Online_regularTraffic_phase2_" + str(sketch_size) + "_dataset.csv" 123 | output = open(data_folder + "Online_regularTraffic_phase2_" + str(sketch_size) + "_dataset.csv", "w") 124 | f = open(data_folder + "RegularTraffic_phase2_dataset.csv", 'r') 125 | reader = csv.reader(f, delimiter=',') 126 | 127 | #Process data row 128 | for n, row in enumerate(reader): 129 | if(n == 0): 130 | output.write(",".join(str(x) for x in n_bits) + "\n") 131 | else: 132 | #Gather the packet vector array (v_f) 133 | packet_count_vector = [] 134 | for i in row[:-1]: 135 | packet_count_vector.append(int(i)) 136 | 137 | #Compute the integer array (c_f) 138 | integer_array = [] 139 | for i in range(0, sketch_size): 140 | c_f_i = 0 141 | for j in range(0, real_bucket_number): 142 | #print "Random_base_vector: " + str(random_base_vectors[i]) 143 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j] 144 | integer_array.append(c_f_i) 145 | 146 | #Compute the binary array (s_f) 147 | binary_array = [] 148 | for i in integer_array: 149 | if(i <= 0): 150 | binary_array.append(0) 151 | else: 152 | binary_array.append(1) 153 | 154 | #print "Binary array: " + str(binary_array) 155 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n") 156 | output.close() 157 | 158 | 159 | #Facet Traffic 160 | print "Online_Sketch: Phase 2, Facet - " + feature_set + "/Online_facetTraffic_phase2_" + str(sketch_size) + "_dataset.csv" 161 | output = open(data_folder + "Online_facetTraffic_phase2_" + str(sketch_size) + "_dataset.csv", "w") 162 | f = open(data_folder + "FacetTraffic_50_phase2_dataset.csv", 'r') 163 | reader = csv.reader(f, delimiter=',') 164 | 165 | #Process data row 166 | for n, row in enumerate(reader): 167 | if(n == 0): 168 | output.write(",".join(str(x) for x in n_bits) + "\n") 169 | else: 170 | #Gather the packet vector array (v_f) 171 | packet_count_vector = [] 172 | for i in row[:-1]: 173 | packet_count_vector.append(int(i)) 174 | 175 | #Compute the integer array (c_f) 176 | integer_array = [] 177 | for i in range(0, sketch_size): 178 | c_f_i = 0 179 | for j in range(0, real_bucket_number): 180 | #print "Random_base_vector: " + str(random_base_vectors[i]) 181 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j] 182 | integer_array.append(c_f_i) 183 | 184 | #Compute the binary array (s_f) 185 | binary_array = [] 186 | for i in integer_array: 187 | if(i <= 0): 188 | binary_array.append(0) 189 | else: 190 | binary_array.append(1) 191 | 192 | #print "Binary array: " + str(binary_array) 193 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n") 194 | output.close() 195 | 196 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/DeltaShaperAnalysis/online_sketching.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | 4 | 5 | def CreateBinaryVectorRepresentation(BIN_WIDTH, TOPK, SKETCH_SIZE): 6 | 7 | for sketch_size in SKETCH_SIZE: 8 | for binWidth in BIN_WIDTH: 9 | for topk in TOPK: 10 | 11 | """ 12 | Generate random base vectors 13 | """ 14 | 15 | if(topk != 1500): 16 | real_bucket_number = topk 17 | else: 18 | real_bucket_number = 1500/binWidth 19 | 20 | random_base_vectors = [] 21 | for i in range(0, sketch_size): 22 | 
random_base_vector = (2*np.random.randint(0,2,size=(real_bucket_number))-1) 23 | random_base_vectors.append(random_base_vector) 24 | 25 | n_bits = range(0, sketch_size) 26 | 27 | """ 28 | Process Phase 1 Data 29 | """ 30 | 31 | feature_set = 'PL_60_' + str(binWidth) + '_' + str(topk) 32 | data_folder = 'FeatureSets/' + feature_set + '/' 33 | 34 | #Regular Traffic 35 | print "Online_Sketch: Phase 1, Regular - " + feature_set + "/Online_regularTraffic_phase1_" + str(sketch_size) + "_dataset.csv" 36 | output = open(data_folder + "Online_regularTraffic_phase1_" + str(sketch_size) + "_dataset.csv", "w") 37 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r') 38 | reader = csv.reader(f, delimiter=',') 39 | 40 | #Process data row 41 | for n, row in enumerate(reader): 42 | if(n == 0): 43 | output.write(",".join(str(x) for x in n_bits) + "," + row[-1] + "\n") 44 | else: 45 | #Gather the packet vector array (v_f) 46 | packet_count_vector = [] 47 | for i in row[:-1]: 48 | packet_count_vector.append(int(i)) 49 | 50 | #Compute the integer array (c_f) 51 | integer_array = [] 52 | for i in range(0, sketch_size): 53 | c_f_i = 0 54 | for j in range(0, real_bucket_number): 55 | #print "Random_base_vector: " + str(random_base_vectors[i]) 56 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j] 57 | integer_array.append(c_f_i) 58 | 59 | #Compute the binary array (s_f) 60 | binary_array = [] 61 | for i in integer_array: 62 | if(i <= 0): 63 | binary_array.append(0) 64 | else: 65 | binary_array.append(1) 66 | 67 | #print "Binary array: " + str(binary_array) 68 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n") 69 | output.close() 70 | 71 | 72 | #DeltaShaper Traffic 73 | print "Online_Sketch: Phase 1, DeltaShaper - " + feature_set + "/Online_deltashaperTraffic_phase1_" + str(sketch_size) + "_dataset.csv" 74 | output = open(data_folder + "Online_deltashaperTraffic_phase1_" + str(sketch_size) + "_dataset.csv", "w") 75 | f = open(data_folder + "DeltaShaperTraffic_320_phase1_dataset.csv", 'r') 76 | reader = csv.reader(f, delimiter=',') 77 | 78 | #Process data row 79 | for n, row in enumerate(reader): 80 | if(n == 0): 81 | output.write(",".join(str(x) for x in n_bits) + "," + row[-1] + "\n") 82 | else: 83 | #Gather the packet vector array (v_f) 84 | packet_count_vector = [] 85 | for i in row[:-1]: 86 | packet_count_vector.append(int(i)) 87 | 88 | #Compute the integer array (c_f) 89 | integer_array = [] 90 | for i in range(0, sketch_size): 91 | c_f_i = 0 92 | for j in range(0, real_bucket_number): 93 | #print "Random_base_vector: " + str(random_base_vectors[i]) 94 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j] 95 | integer_array.append(c_f_i) 96 | 97 | #Compute the binary array (s_f) 98 | binary_array = [] 99 | for i in integer_array: 100 | if(i <= 0): 101 | binary_array.append(0) 102 | else: 103 | binary_array.append(1) 104 | 105 | #print "Binary array: " + str(binary_array) 106 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n") 107 | output.close() 108 | 109 | ######################################################################################## 110 | ######################################################################################## 111 | ######################################################################################## 112 | 113 | 114 | """ 115 | Process Phase 2 Data 116 | """ 117 | 118 | feature_set = 'PL_60_' + str(binWidth) + '_' + str(topk) 119 | data_folder = 'FeatureSets/' + feature_set + '/' 120 | 121 | 
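# As in phase 1, each flow's bucket-count vector is reduced to sketch_size sign bits
# using the base vectors generated at the top of this configuration loop; reusing the
# same vectors across phases keeps the phase 2 signatures comparable with those the
# model was built on.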
#Regular Traffic 122 | print "Online_Sketch: Phase 2, Regular - " + feature_set + "/Online_regularTraffic_phase2_" + str(sketch_size) + "_dataset.csv" 123 | output = open(data_folder + "Online_regularTraffic_phase2_" + str(sketch_size) + "_dataset.csv", "w") 124 | f = open(data_folder + "RegularTraffic_phase2_dataset.csv", 'r') 125 | reader = csv.reader(f, delimiter=',') 126 | 127 | #Process data row 128 | for n, row in enumerate(reader): 129 | if(n == 0): 130 | output.write(",".join(str(x) for x in n_bits) + "," + row[-1] + "\n") 131 | else: 132 | #Gather the packet vector array (v_f) 133 | packet_count_vector = [] 134 | for i in row[:-1]: 135 | packet_count_vector.append(int(i)) 136 | 137 | #Compute the integer array (c_f) 138 | integer_array = [] 139 | for i in range(0, sketch_size): 140 | c_f_i = 0 141 | for j in range(0, real_bucket_number): 142 | #print "Random_base_vector: " + str(random_base_vectors[i]) 143 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j] 144 | integer_array.append(c_f_i) 145 | 146 | #Compute the binary array (s_f) 147 | binary_array = [] 148 | for i in integer_array: 149 | if(i <= 0): 150 | binary_array.append(0) 151 | else: 152 | binary_array.append(1) 153 | 154 | #print "Binary array: " + str(binary_array) 155 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n") 156 | output.close() 157 | 158 | 159 | #DeltaShaper Traffic 160 | print "Online_Sketch: Phase 2, DeltaShaper - " + feature_set + "/Online_deltashaperTraffic_phase2_" + str(sketch_size) + "_dataset.csv" 161 | output = open(data_folder + "Online_deltashaperTraffic_phase2_" + str(sketch_size) + "_dataset.csv", "w") 162 | f = open(data_folder + "DeltaShaperTraffic_320_phase2_dataset.csv", 'r') 163 | reader = csv.reader(f, delimiter=',') 164 | 165 | #Process data row 166 | for n, row in enumerate(reader): 167 | if(n == 0): 168 | output.write(",".join(str(x) for x in n_bits) + "," + row[-1] + "\n") 169 | else: 170 | #Gather the packet vector array (v_f) 171 | packet_count_vector = [] 172 | for i in row[:-1]: 173 | packet_count_vector.append(int(i)) 174 | 175 | #Compute the integer array (c_f) 176 | integer_array = [] 177 | for i in range(0, sketch_size): 178 | c_f_i = 0 179 | for j in range(0, real_bucket_number): 180 | #print "Random_base_vector: " + str(random_base_vectors[i]) 181 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j] 182 | integer_array.append(c_f_i) 183 | 184 | #Compute the binary array (s_f) 185 | binary_array = [] 186 | for i in integer_array: 187 | if(i <= 0): 188 | binary_array.append(0) 189 | else: 190 | binary_array.append(1) 191 | 192 | #print "Binary array: " + str(binary_array) 193 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n") 194 | output.close() 195 | 196 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/SingleWebsiteAnalysis/generateFigures.py: -------------------------------------------------------------------------------- 1 | import os 2 | from decimal import Decimal 3 | import numpy as np 4 | import csv 5 | 6 | import matplotlib 7 | if os.environ.get('DISPLAY','') == '': 8 | print('no display found. 
Using non-interactive Agg backend') 9 | matplotlib.use('Agg') 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | colors = ["0.8", "0.6", "0.2", "0.0"] 14 | colors = ["salmon", "lightsteelblue", "darkseagreen", "thistle", "wheat", "khaki", "skyblue"] 15 | 16 | """ 17 | Attach a text label above each bar displaying its height 18 | """ 19 | def autolabel(rects, ax): 20 | for rect in rects: 21 | height = rect.get_height() 22 | ax.text(rect.get_x() + rect.get_width()/2., 1.005*height, # original height was 1.005*height 23 | "{0:.2f}".format(float(height)), fontsize=7, ha='center', va='bottom') 24 | 25 | 26 | def PlotSingleWebsiteStats(): 27 | 28 | for profile in os.listdir("classificationResults/"): 29 | if(".DS_Store" in profile): 30 | continue 31 | 32 | profile_data = open("classificationResults/" + profile, 'rb') 33 | csv_reader = csv.reader(profile_data, delimiter=',') 34 | 35 | binWidth = [] 36 | acc = [] 37 | fpr = [] 38 | fnr = [] 39 | 40 | for n, row in enumerate(csv_reader): 41 | if(n == 0): 42 | continue 43 | binWidth.append(row[0]) 44 | acc.append(float(row[1])) 45 | fpr.append(float(row[2])) 46 | fnr.append(float(row[3])) 47 | 48 | 49 | fig = plt.figure() 50 | ax1 = fig.add_subplot(111) 51 | 52 | print "Current feature set: "+ str(binWidth) 53 | 54 | 55 | ind = np.arange(len(binWidth)) # the x locations for the groups 56 | width = 0.20 57 | 58 | rects0 = ax1.bar(ind - width, acc, width, colors[0], label='Acc') 59 | rects1 = ax1.bar(ind, fpr, width, colors[1], label='FPR') 60 | rects2 = ax1.bar(ind + width, fnr, width, colors[2], label='FNR') 61 | 62 | 63 | ax1.yaxis.grid(color='black', linestyle='dotted') 64 | ax1.set_title('Scores for Quantization') 65 | ax1.set_yscale("log") 66 | ax1.set_xticks(ind) 67 | labels = binWidth 68 | ax1.set_xticklabels(labels) 69 | ax1.legend() 70 | 71 | 72 | plt.tight_layout() 73 | #plt.ylim(0, 1) 74 | 75 | fig.savefig('WF_%s.pdf'%(profile[:-4])) # save the figure to file 76 | fig.savefig('WF_%s.png'%(profile[:-4])) # save the figure to file 77 | plt.close(fig) 78 | profile_data.close() 79 | 80 | 81 | def PlotNormalFPRComparison(): 82 | websites = set() 83 | 84 | #Compute the set of websites to compare 85 | for profile in os.listdir("classificationResults/"): 86 | if(".DS_Store" in profile): 87 | continue 88 | website = profile.split("_")[2] 89 | website = website[:-4] 90 | websites.add(website) 91 | 92 | 93 | for website in websites: 94 | if not os.path.exists("Figures/%s"%(website)): 95 | os.makedirs("Figures/%s"%(website)) 96 | 97 | #Gather results for full distribution 98 | profile_data_full = open("classificationResults/SingleWebsite_full_" + website + ".csv", 'rb') 99 | csv_reader_full = csv.reader(profile_data_full, delimiter=',') 100 | 101 | binWidth_full = [] 102 | acc_full = [] 103 | fpr_full = [] 104 | fnr_full = [] 105 | 106 | for n, row in enumerate(csv_reader_full): 107 | if(n == 0): 108 | continue 109 | binWidth_full.append(row[0]) 110 | acc_full.append(round(Decimal(float(row[1])), 4)) 111 | fpr_full.append(round(Decimal(float(row[2])), 9)) 112 | fnr_full.append(round(Decimal(float(row[3])), 4)) 113 | 114 | 115 | #Gather results for truncated distribution 116 | profile_data_truncated = open("classificationResults/SingleWebsite_truncated_" + website + ".csv", 'rb') 117 | csv_reader_truncated = csv.reader(profile_data_truncated, delimiter=',') 118 | 119 | binWidth_truncated = [] 120 | acc_truncated = [] 121 | fpr_truncated = [] 122 | fnr_truncated = [] 123 | 124 | for n, row in enumerate(csv_reader_truncated): 125 | if(n == 0): 126 | 
continue 127 | binWidth_truncated.append(row[0]) 128 | acc_truncated.append(round(Decimal(float(row[1])), 4)) 129 | fpr_truncated.append(round(Decimal(float(row[2])), 9)) 130 | fnr_truncated.append(round(Decimal(float(row[3])), 4)) 131 | 132 | #Gather number of bins used in the truncation 133 | truncated_info_file = open("truncationInfo/" + website + ".csv", 'r') 134 | truncation_info = csv.reader(truncated_info_file, delimiter=',') 135 | truncated_bins = [] 136 | 137 | for n, row in enumerate(truncation_info): 138 | if(n == 0): 139 | continue 140 | truncated_bins.append(row[1]) 141 | 142 | #Generate plot 143 | fig = plt.figure() 144 | ax1 = fig.add_subplot(111) 145 | 146 | print "Current feature set: "+ str(binWidth_full) 147 | print "FPR-Full: " + str(fpr_full) 148 | print "FPR-Truncated: " + str(fpr_truncated) 149 | 150 | ind = np.arange(len(binWidth_full)) # the x locations for the groups 151 | width = 0.40 152 | 153 | rects1 = ax1.bar(ind - width, fpr_full, width, color=colors[0], label='FPR-Full') 154 | #autolabel(rects1,ax1) 155 | rects2 = ax1.bar(ind, fpr_truncated, width, color=colors[1], label='FPR-Truncated') 156 | #autolabel(rects2,ax1) 157 | 158 | 159 | ax1.yaxis.grid(color='black', linestyle='dotted') 160 | ax1.set_title('Truncation effect on FPR - %s'%(website), fontsize = 10) 161 | 162 | ax1.set_xticks(ind) 163 | labels = ["K = " + x + "\nBins = " + str(truncated_bins[n]) for n, x in enumerate(binWidth_full)] 164 | ax1.set_xticklabels(labels) 165 | ax1.legend() 166 | 167 | plt.xticks(fontsize=7) 168 | plt.tight_layout() 169 | #plt.ylim(0, 1) 170 | fig.savefig('Figures/%s/WF_FPR_normal_%s.pdf'%(website, website)) # save the figure to file 171 | fig.savefig('Figures/%s/WF_FPR_normal_%s.png'%(website, website)) # save the figure to file 172 | plt.close(fig) 173 | profile_data_full.close() 174 | profile_data_truncated.close() 175 | 176 | 177 | def PlotNormalFNRComparison(): 178 | websites = set() 179 | 180 | #Compute the set of websites to compare 181 | for profile in os.listdir("classificationResults/"): 182 | if(".DS_Store" in profile): 183 | continue 184 | website = profile.split("_")[2] 185 | website = website[:-4] 186 | websites.add(website) 187 | 188 | 189 | for website in websites: 190 | if not os.path.exists("Figures/%s"%(website)): 191 | os.makedirs("Figures/%s"%(website)) 192 | 193 | #Gather results for full distribution 194 | profile_data_full = open("classificationResults/SingleWebsite_full_" + website + ".csv", 'rb') 195 | csv_reader_full = csv.reader(profile_data_full, delimiter=',') 196 | 197 | binWidth_full = [] 198 | acc_full = [] 199 | fpr_full = [] 200 | fnr_full = [] 201 | 202 | for n, row in enumerate(csv_reader_full): 203 | if(n == 0): 204 | continue 205 | binWidth_full.append(row[0]) 206 | acc_full.append(round(Decimal(float(row[1])), 4)) 207 | fpr_full.append(round(Decimal(float(row[2])), 4)) 208 | fnr_full.append(round(Decimal(float(row[3])), 4)) 209 | 210 | 211 | #Gather results for truncated distribution 212 | profile_data_truncated = open("classificationResults/SingleWebsite_truncated_" + website + ".csv", 'rb') 213 | csv_reader_truncated = csv.reader(profile_data_truncated, delimiter=',') 214 | 215 | binWidth_truncated = [] 216 | acc_truncated = [] 217 | fpr_truncated = [] 218 | fnr_truncated = [] 219 | 220 | for n, row in enumerate(csv_reader_truncated): 221 | if(n == 0): 222 | continue 223 | binWidth_truncated.append(row[0]) 224 | acc_truncated.append(round(Decimal(float(row[1])), 4)) 225 | fpr_truncated.append(round(Decimal(float(row[2])), 4)) 
226 | fnr_truncated.append(round(Decimal(float(row[3])), 4)) 227 | 228 | 229 | #Gather number of bins used in the truncation 230 | truncated_info_file = open("truncationInfo/" + website + ".csv", 'r') 231 | truncation_info = csv.reader(truncated_info_file, delimiter=',') 232 | truncated_bins = [] 233 | 234 | for n, row in enumerate(truncation_info): 235 | if(n == 0): 236 | continue 237 | truncated_bins.append(row[1]) 238 | 239 | 240 | #Generate plot 241 | fig = plt.figure() 242 | ax1 = fig.add_subplot(111) 243 | 244 | print "Current feature set: "+ str(binWidth_full) 245 | print "FNR-Full: " + str(fnr_full) 246 | print "FNR-Truncated: " + str(fnr_truncated) 247 | 248 | ind = np.arange(len(binWidth_full)) # the x locations for the groups 249 | width = 0.40 250 | 251 | rects1 = ax1.bar(ind - width, fnr_full, width, color=colors[0], label='FNR-Full') 252 | autolabel(rects1,ax1) 253 | rects2 = ax1.bar(ind, fnr_truncated, width, color=colors[1], label='FNR-Truncated') 254 | autolabel(rects2,ax1) 255 | 256 | 257 | ax1.yaxis.grid(color='black', linestyle='dotted') 258 | ax1.set_title('Truncation effect on FNR - %s'%(website), fontsize = 10) 259 | 260 | ax1.set_xticks(ind) 261 | labels = ["K = " + x + "\nBins = " + str(truncated_bins[n]) for n, x in enumerate(binWidth_full)] 262 | ax1.set_xticklabels(labels) 263 | ax1.legend() 264 | 265 | plt.xticks(fontsize=7) 266 | plt.tight_layout() 267 | plt.ylim(0, 1) 268 | fig.savefig('Figures/%s/WF_FNR_normal_%s.pdf'%(website, website)) # save the figure to file 269 | fig.savefig('Figures/%s/WF_FNR_normal_%s.png'%(website, website)) # save the figure to file 270 | plt.close(fig) 271 | profile_data_full.close() 272 | profile_data_truncated.close() 273 | 274 | 275 | 276 | def GenerateFigures(): 277 | if not os.path.exists("Figures"): 278 | os.makedirs("Figures") 279 | 280 | PlotNormalFNRComparison() 281 | PlotNormalFPRComparison() -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/FacetAnalysis/generateFigures.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import math 4 | 5 | import matplotlib 6 | if os.environ.get('DISPLAY','') == '': 7 | print('no display found. 
Using non-interactive Agg backend') 8 | matplotlib.use('Agg') 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | colors = ["0.8", "0.6", "0.2", "0.0"] 13 | colors = ["salmon", "lightsteelblue", "darkseagreen", "thistle", "wheat", "khaki", "skyblue"] 14 | 15 | """ 16 | Attach a text label above each bar displaying its height 17 | """ 18 | def autolabel(rects, ax): 19 | for rect in rects: 20 | height = rect.get_height() 21 | ax.text(rect.get_x() + rect.get_width()/2., 1.005*height, # original height was 1.005*height 22 | "{0:.2f}".format(float(height)), fontsize=7, ha='center', va='bottom') 23 | 24 | 25 | def PlotQuantization(binWidths, n_flows): 26 | print "PlotQuantization" 27 | feature_sets = [] 28 | set_acc = [] 29 | set_fpr =[] 30 | set_fnr = [] 31 | 32 | for binWidth in binWidths: 33 | 34 | feature_folder = 'PL_60_' + str(binWidth) + '_1500' 35 | #print feature_folder 36 | 37 | #Load configuration results 38 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy" 39 | results = np.load(data_folder) 40 | set_acc.append(results[0]) 41 | set_fpr.append(results[1]) 42 | set_fnr.append(results[2]) 43 | feature_sets.append(feature_folder) 44 | 45 | 46 | max_acc = 0 47 | max_fset = "" 48 | for i, f_set in enumerate(feature_sets): 49 | if set_acc[i] > max_acc: 50 | max_acc = set_acc[i] 51 | max_fset = f_set 52 | print "Max acc: %s, Best quantization set: %s"%(max_acc, max_fset) 53 | 54 | fig = plt.figure(figsize=(10,4)) 55 | ax1 = fig.add_subplot(111) 56 | 57 | curr_fset = feature_sets 58 | curr_acc = set_acc 59 | curr_fpr = set_fpr 60 | curr_fnr = set_fnr 61 | #print "Current feature set: "+ str(curr_fset) 62 | 63 | ind = np.arange(len(curr_fset)) # the x locations for the groups 64 | width = 0.20 65 | 66 | rects0 = ax1.bar(ind - width - width/2, curr_acc, width, color=colors[0], label='Acc') 67 | autolabel(rects0,ax1) 68 | rects1 = ax1.bar(ind - width/2 , curr_fpr, width, color=colors[1], label='FPR') 69 | autolabel(rects1,ax1) 70 | rects2 = ax1.bar(ind + width - width/2, curr_fnr, width, color=colors[2], label='FNR') 71 | autolabel(rects2,ax1) 72 | 73 | 74 | ax1.yaxis.grid(color='black', linestyle='dotted') 75 | ax1.set_title('Scores for Quantization') 76 | ax1.set_xticks(ind) 77 | labels = ["K = " + str(int(x.split('_')[2])) + " -> " + str(1500/int(x.split('_')[2])) + " bins" + "\n(PerFlow = " + str(int(1500/int(x.split('_')[2]))*4) + " B)" + "\n(CGMem = " + str((n_flows * int(1500/int(x.split('_')[2]))*4)/1024) + " KB)" for x in feature_sets] 78 | ax1.set_xticklabels(labels) 79 | plt.xticks(fontsize=7) 80 | ax1.legend() 81 | 82 | plt.ylim(top=1) 83 | plt.legend(loc='upper right', fontsize=8) 84 | plt.tight_layout() 85 | fig.savefig('Figures/Facet_bin_NoSketch.pdf') # save the figure to file 86 | fig.savefig('Figures/Facet_bin_NoSketch.png') # save the figure to file 87 | plt.close(fig) 88 | 89 | 90 | def PlotQuantizationLines(binWidths, n_flows): 91 | print "PlotQuantizationLines" 92 | feature_sets = [] 93 | set_acc = [] 94 | 95 | for binWidth in binWidths: 96 | 97 | feature_folder = 'PL_60_' + str(binWidth) + '_1500' 98 | #print feature_folder 99 | 100 | #Load configuration results 101 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy" 102 | results = np.load(data_folder) 103 | set_acc.append(results[3]) 104 | feature_sets.append(feature_folder) 105 | 106 | 107 | 108 | fig = plt.figure(figsize=(10,4)) 109 | ax1 = fig.add_subplot(111) 110 | 111 | curr_fset = feature_sets 112 | 
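# Note: despite the variable name, set_acc/curr_acc hold results[3], the AUC score
# plotted as a line below; indices 0-2 (accuracy, FPR, FNR) are the bar values used
# by PlotQuantization above.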
curr_acc = set_acc 113 | 114 | ind = np.arange(len(curr_fset)) # the x locations for the groups 115 | print curr_acc 116 | ax1.plot(curr_acc, color=colors[0], marker=".", markersize=12, lw=3, label='AUC') 117 | ax1.hlines(0.99, 0, len(ind)-1, lw=2, label='Baseline, AUC = 0.99') 118 | 119 | for i,j in zip(ind,curr_acc): 120 | ax1.annotate("{0:.2f}".format(j),xy=(i-0.15,j-0.08)) 121 | 122 | 123 | ax1.yaxis.grid(color='black', linestyle='dotted') 124 | plt.yticks(fontsize=14) 125 | plt.ylim(bottom=0,top=1) 126 | plt.ylabel("AUC Score", fontsize=14) 127 | 128 | 129 | plt.xlim(-0.3, len(ind)-1+0.3) 130 | ax1.set_xticks(ind) 131 | labels = [str(int(x.split('_')[2])) for x in feature_sets] 132 | #labels = ["K = " + str(int(x.split('_')[2])) + " -> " + str(1500/int(x.split('_')[2])) + " bins" + "\n(PerFlow = " + str(int(1500/int(x.split('_')[2]))*4) + " B)" + "\n(CGMem = " + str((n_flows * int(1500/int(x.split('_')[2]))*4)/1024) + " KB)" for x in feature_sets] 133 | #labels = ["K = " + str(int(x.split('_')[2])) + "\nPF = " + str(int(1500/int(x.split('_')[2]))*4) + " B" + "\nTM = " + str((n_flows * int(1500/int(x.split('_')[2]))*4)/1024) + " KB" for x in feature_sets] 134 | ax1.set_xticklabels(labels) 135 | plt.xticks(fontsize=11) 136 | plt.xlabel("Quantization Factor", fontsize=14) 137 | ax1.legend() 138 | 139 | 140 | plt.legend(loc='lower right', fontsize=12) 141 | plt.tight_layout() 142 | fig.savefig('Figures/Facet_bin_NoSketch_Lines.pdf') # save the figure to file 143 | fig.savefig('Figures/Facet_bin_NoSketch_Lines.png') # save the figure to file 144 | plt.close(fig) 145 | 146 | 147 | def PlotKQuantizationAndTruncation(binWidths, topk_features, n_flows): 148 | print "PlotKQuantizationAndTruncation" 149 | if not os.path.exists('Figures/Truncation_comparison'): 150 | os.makedirs('Figures/Truncation_comparison') 151 | 152 | for binWidth in binWidths: 153 | feature_sets = [] 154 | set_acc = [] 155 | set_fpr =[] 156 | set_fnr = [] 157 | 158 | for topk in topk_features: 159 | 160 | feature_folder = 'PL_60_' + str(binWidth) + '_' + str(topk) 161 | #print feature_folder 162 | 163 | if(topk != 1500 and topk > 1500/binWidth): 164 | #print "Skipping sample, invalid configuration. 
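# ---------------------------------------------------------------------------
# Note (ours): a truncated feature set is only meaningful when the requested
# top-k does not exceed the number of bins that survive quantization, i.e.
# topk <= 1500 / binWidth (the special value 1500 means "no truncation").
# For example, binWidth = 64 leaves only 1500 / 64 = 23 bins, so the
# top-30/40/50 configurations are recorded as zeros and skipped, exactly as
# the check above does.
# ---------------------------------------------------------------------------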
TopK = " + str(topk) + " Total Features = " + str(1500/binWidth) 165 | set_acc.append(0) 166 | set_fpr.append(0) 167 | set_fnr.append(0) 168 | feature_sets.append(feature_folder) 169 | continue 170 | 171 | #Load configuration results 172 | #if(topk == 1500): 173 | # data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase1_NoSketch.npy" 174 | #else: 175 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy" 176 | results = np.load(data_folder) 177 | set_acc.append(results[0]) 178 | set_fpr.append(results[1]) 179 | set_fnr.append(results[2]) 180 | feature_sets.append(feature_folder) 181 | 182 | 183 | #Check best truncation value 184 | max_acc = 0 185 | max_fset = "" 186 | for i, f_set in enumerate(feature_sets[:-1]): 187 | if set_acc[i] > max_acc: 188 | max_acc = set_acc[i] 189 | max_fset = f_set 190 | print "K = " + str(binWidth) + ", Max acc: %s, Best Truncation: %s"%(max_acc, max_fset) 191 | 192 | 193 | #Plot figures 194 | fig = plt.figure(figsize=(10,4)) 195 | ax1 = fig.add_subplot(111) 196 | 197 | curr_fset = feature_sets 198 | curr_acc = set_acc 199 | curr_fpr = set_fpr 200 | curr_fnr = set_fnr 201 | #print "Current feature set: "+ str(curr_fset) 202 | 203 | ind = np.arange(len(curr_fset)) # the x locations for the groups 204 | width = 0.20 205 | 206 | rects0 = ax1.bar(ind - width - width/2, curr_acc, width, color=colors[0], label='Acc') 207 | autolabel(rects0,ax1) 208 | rects1 = ax1.bar(ind - width/2 , curr_fpr, width, color=colors[1], label='FPR') 209 | autolabel(rects1,ax1) 210 | rects2 = ax1.bar(ind + width - width/2, curr_fnr, width, color=colors[2], label='FNR') 211 | autolabel(rects2,ax1) 212 | 213 | ax1.yaxis.grid(color='black', linestyle='dotted') 214 | ax1.set_title('Truncation Scores for K ='+str(binWidth)) 215 | ax1.set_xticks(ind) 216 | print feature_sets 217 | labels = ["Top-k= " + str(int(x.split('_')[3])) + "\n(PerFlow = " + str(int(x.split('_')[3])*4) + " B)" + "\n(CGMem = " + str((n_flows * int(x.split('_')[3]) * 4)/1024) + " KB)" for x in feature_sets] 218 | labels[len(topk_features)-1] = str(int(1500/binWidth)) + " features\n(PerFlow = " + str(int(1500/binWidth)*4) + " B)" + "\n(CGMem = " + str(int((n_flows * int(1500/binWidth) * 4)/1024)) + " KB)" 219 | ax1.set_xticklabels(labels) 220 | plt.xticks(fontsize=9) 221 | ax1.legend() 222 | 223 | plt.ylim(top=1) 224 | plt.legend(loc='upper right', fontsize=10) 225 | plt.tight_layout() 226 | fig.savefig('Figures/Truncation_comparison/Facet_bin' + str(binWidth) + '_topk_NoSketch.pdf') # save the figure to file 227 | fig.savefig('Figures/Truncation_comparison/Facet_bin' + str(binWidth) + '_topk_NoSketch.png') # save the figure to file 228 | plt.close(fig) 229 | 230 | def PlotKQuantizationAndTruncationLines(binWidths, topk_features, n_flows): 231 | print "PlotKQuantizationAndTruncation" 232 | if not os.path.exists('Figures/Truncation_comparison'): 233 | os.makedirs('Figures/Truncation_comparison') 234 | 235 | for binWidth in binWidths: 236 | feature_sets = [] 237 | set_acc = [] 238 | 239 | for topk in topk_features: 240 | 241 | feature_folder = 'PL_60_' + str(binWidth) + '_' + str(topk) 242 | #print feature_folder 243 | 244 | if(topk != 1500 and topk > 1500/binWidth): 245 | #print "Skipping sample, invalid configuration. 
TopK = " + str(topk) + " Total Features = " + str(1500/binWidth) 246 | set_acc.append(0) 247 | feature_sets.append(feature_folder) 248 | continue 249 | 250 | #Load configuration results 251 | #if(topk == 1500): 252 | # data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase1_NoSketch.npy" 253 | #else: 254 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy" 255 | results = np.load(data_folder) 256 | set_acc.append(results[3]) 257 | feature_sets.append(feature_folder) 258 | 259 | 260 | #Plot figures 261 | fig = plt.figure(figsize=(10,4)) 262 | ax1 = fig.add_subplot(111) 263 | 264 | curr_fset = feature_sets 265 | curr_acc = set_acc 266 | 267 | #print "Current feature set: "+ str(curr_fset) 268 | 269 | ind = np.arange(len(curr_fset)) # the x locations for the groups 270 | 271 | 272 | ax1.plot(curr_acc, color=colors[0], marker=".", markersize=12, lw=3, label='AUC') 273 | ax1.hlines(0.99, 0, len(ind)-1, lw=3, label='Baseline, AUC = 0.99') 274 | 275 | for i,j in zip(ind,curr_acc): 276 | ax1.annotate("{0:.2f}".format(j),xy=(i-0.1,j-0.08)) 277 | 278 | plt.xlim(-0.3, len(ind)-1+0.3) 279 | ax1.yaxis.grid(color='black', linestyle='dotted') 280 | 281 | ax1.set_xticks(ind) 282 | print feature_sets 283 | labels = [str(int(x.split('_')[3])) for x in feature_sets] 284 | #labels = ["Top-n= " + str(int(x.split('_')[3])) + "\nPF = " + str(int(x.split('_')[3])*4) + " B" + "\nTM = " + str((n_flows * int(x.split('_')[3]) * 4)/1024) + " KB" for x in feature_sets] 285 | #labels[len(topk_features)-1] = str(int(1500/binWidth)) + " features\n(PF = " + str(int(1500/binWidth)*4) + " B)" + "\n(TMem = " + str(int((n_flows * int(1500/binWidth) * 4)/1024)) + " KB)" 286 | ax1.set_xticklabels(labels) 287 | plt.xticks(fontsize=9) 288 | plt.xlabel("Truncation Factor", fontsize=12) 289 | ax1.legend() 290 | 291 | 292 | plt.yticks(fontsize=12) 293 | plt.ylim(bottom=0,top=1) 294 | plt.ylabel("AUC Score", fontsize=12) 295 | 296 | plt.legend(loc='lower right', fontsize=12) 297 | plt.tight_layout() 298 | fig.savefig('Figures/Truncation_comparison/Facet_bin' + str(binWidth) + '_topk_NoSketch_Lines.pdf') # save the figure to file 299 | fig.savefig('Figures/Truncation_comparison/Facet_bin' + str(binWidth) + '_topk_NoSketch_Lines.png') # save the figure to file 300 | plt.close(fig) 301 | 302 | 303 | def GenerateFigures(binWidths, topk_features, nFlows): 304 | if not os.path.exists('Figures'): 305 | os.makedirs('Figures') 306 | 307 | PlotQuantization(binWidths, nFlows) 308 | PlotKQuantizationAndTruncation(binWidths, topk_features, nFlows) 309 | 310 | 311 | 312 | def GenerateFiguresLine(binWidths, topk_features, nFlows): 313 | if not os.path.exists('Figures'): 314 | os.makedirs('Figures') 315 | 316 | TOPK = [10, 20, 30, 40, 50] 317 | PlotQuantizationLines(binWidths, nFlows) 318 | PlotKQuantizationAndTruncationLines(binWidths, TOPK, nFlows) 319 | 320 | 321 | 322 | if __name__ == "__main__": 323 | 324 | #Quantization 325 | BIN_WIDTH = [1, 4, 8, 16, 32, 64, 128, 256] 326 | 327 | #Truncation Top-K features 328 | TOPK = [5, 10, 20, 30, 40, 50, 1500] 329 | TOPK = [10, 20, 30, 40, 50] 330 | 331 | #Total amount of flows per dataset 332 | N_FLOWS = 1000 333 | 334 | PlotQuantizationLines(BIN_WIDTH, N_FLOWS) 335 | PlotKQuantizationAndTruncationLines(BIN_WIDTH, TOPK, N_FLOWS) -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/DeltaShaperAnalysis/generateFigures.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import math 4 | 5 | import matplotlib 6 | if os.environ.get('DISPLAY','') == '': 7 | print('no display found. Using non-interactive Agg backend') 8 | matplotlib.use('Agg') 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | colors = ["0.8", "0.6", "0.2", "0.0"] 13 | colors = ["salmon", "lightsteelblue", "darkseagreen", "thistle", "wheat", "khaki", "skyblue"] 14 | 15 | """ 16 | Attach a text label above each bar displaying its height 17 | """ 18 | def autolabel(rects, ax): 19 | for rect in rects: 20 | height = rect.get_height() 21 | ax.text(rect.get_x() + rect.get_width()/2., 1.005*height, # original height was 1.005*height 22 | "{0:.2f}".format(float(height)), fontsize=7, ha='center', va='bottom') 23 | 24 | 25 | def PlotQuantization(binWidths, n_flows): 26 | print "PlotQuantization" 27 | feature_sets = [] 28 | set_acc = [] 29 | set_fpr =[] 30 | set_fnr = [] 31 | 32 | for binWidth in binWidths: 33 | 34 | feature_folder = 'PL_60_' + str(binWidth) + '_1500' 35 | #print feature_folder 36 | 37 | #Load configuration results 38 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy" 39 | results = np.load(data_folder) 40 | set_acc.append(results[0]) 41 | set_fpr.append(results[1]) 42 | set_fnr.append(results[2]) 43 | feature_sets.append(feature_folder) 44 | 45 | 46 | max_acc = 0 47 | max_fset = "" 48 | for i, f_set in enumerate(feature_sets): 49 | if set_acc[i] > max_acc: 50 | max_acc = set_acc[i] 51 | max_fset = f_set 52 | print "Max acc: %s, Best quantization set: %s"%(max_acc, max_fset) 53 | 54 | fig = plt.figure(figsize=(10,4)) 55 | ax1 = fig.add_subplot(111) 56 | 57 | curr_fset = feature_sets 58 | curr_acc = set_acc 59 | curr_fpr = set_fpr 60 | curr_fnr = set_fnr 61 | #print "Current feature set: "+ str(curr_fset) 62 | 63 | ind = np.arange(len(curr_fset)) # the x locations for the groups 64 | width = 0.20 65 | 66 | rects0 = ax1.bar(ind - width - width/2, curr_acc, width, color=colors[0], label='Acc') 67 | autolabel(rects0,ax1) 68 | rects1 = ax1.bar(ind - width/2 , curr_fpr, width, color=colors[1], label='FPR') 69 | autolabel(rects1,ax1) 70 | rects2 = ax1.bar(ind + width - width/2, curr_fnr, width, color=colors[2], label='FNR') 71 | autolabel(rects2,ax1) 72 | 73 | 74 | ax1.yaxis.grid(color='black', linestyle='dotted') 75 | ax1.set_title('Scores for Quantization') 76 | ax1.set_xticks(ind) 77 | labels = ["K = " + str(int(x.split('_')[2])) + " -> " + str(1500/int(x.split('_')[2])) + " bins" + "\n(PerFlow = " + str(int(1500/int(x.split('_')[2]))*4) + " B)" + "\n(CGMem = " + str((n_flows * int(1500/int(x.split('_')[2]))*4)/1024) + " KB)" for x in feature_sets] 78 | ax1.set_xticklabels(labels) 79 | plt.xticks(fontsize=7) 80 | ax1.legend() 81 | 82 | plt.ylim(top=1) 83 | plt.legend(loc='upper right', fontsize=8) 84 | plt.tight_layout() 85 | fig.savefig('Figures/DeltaShaper_bin_NoSketch.pdf') # save the figure to file 86 | fig.savefig('Figures/DeltaShaper_bin_NoSketch.png') # save the figure to file 87 | plt.close(fig) 88 | 89 | 90 | def PlotQuantizationLines(binWidths, n_flows): 91 | print "PlotQuantizationLines" 92 | feature_sets = [] 93 | set_acc = [] 94 | 95 | for binWidth in binWidths: 96 | 97 | feature_folder = 'PL_60_' + str(binWidth) + '_1500' 98 | #print feature_folder 99 | 100 | #Load configuration results 101 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy" 
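# Note (ours, inferred from how the array is indexed throughout these
# scripts): each classificationResults_*.npy file holds a small result
# vector where index 0 is the accuracy, 1 the false-positive rate, 2 the
# false-negative rate, and 3 the AUC; the line plots below use only the AUC
# entry.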
102 | results = np.load(data_folder) 103 | set_acc.append(results[3]) 104 | feature_sets.append(feature_folder) 105 | 106 | 107 | 108 | fig = plt.figure(figsize=(10,4)) 109 | ax1 = fig.add_subplot(111) 110 | 111 | curr_fset = feature_sets 112 | curr_acc = set_acc 113 | 114 | ind = np.arange(len(curr_fset)) # the x locations for the groups 115 | print curr_acc 116 | ax1.plot(curr_acc, color=colors[0], marker=".", markersize=12, lw=3, label='AUC') 117 | ax1.hlines(0.95, 0, len(ind)-1, lw=3, label='Baseline, AUC = 0.95') 118 | 119 | for i,j in zip(ind,curr_acc): 120 | ax1.annotate("{0:.2f}".format(j),xy=(i-0.15,j+0.03)) 121 | 122 | ax1.yaxis.grid(color='black', linestyle='dotted') 123 | plt.yticks(fontsize=14) 124 | plt.ylim(bottom=0,top=1) 125 | plt.ylabel("AUC Score", fontsize=14) 126 | 127 | 128 | plt.xlim(-0.3, len(ind)-1+0.3) 129 | ax1.set_xticks(ind) 130 | #labels = ["K = " + str(int(x.split('_')[2])) + " -> " + str(1500/int(x.split('_')[2])) + " bins" + "\n(PerFlow = " + str(int(1500/int(x.split('_')[2]))*4) + " B)" + "\n(CGMem = " + str((n_flows * int(1500/int(x.split('_')[2]))*4)/1024) + " KB)" for x in feature_sets] 131 | #labels = ["K = " + str(int(x.split('_')[2])) + "\nPF = " + str(int(1500/int(x.split('_')[2]))*4) + " B" + "\nTM = " + str((n_flows * int(1500/int(x.split('_')[2]))*4)/1024) + " KB" for x in feature_sets] 132 | labels = [str(int(x.split('_')[2])) for x in feature_sets] 133 | 134 | ax1.set_xticklabels(labels) 135 | plt.xticks(fontsize=9) 136 | plt.xlabel("DeltaShaper Quantization Factor (K)", fontsize=12) 137 | ax1.legend() 138 | 139 | 140 | plt.legend(loc='lower right', fontsize=12) 141 | plt.tight_layout() 142 | fig.savefig('Figures/DeltaShaper_bin_NoSketch_Lines.pdf') # save the figure to file 143 | fig.savefig('Figures/DeltaShaper_bin_NoSketch_Lines.png') # save the figure to file 144 | plt.close(fig) 145 | 146 | 147 | def PlotKQuantizationAndTruncation(binWidths, topk_features, n_flows): 148 | print "PlotKQuantizationAndTruncation" 149 | if not os.path.exists('Figures/Truncation_comparison'): 150 | os.makedirs('Figures/Truncation_comparison') 151 | 152 | for binWidth in binWidths: 153 | feature_sets = [] 154 | set_acc = [] 155 | set_fpr =[] 156 | set_fnr = [] 157 | 158 | for topk in topk_features: 159 | 160 | feature_folder = 'PL_60_' + str(binWidth) + '_' + str(topk) 161 | #print feature_folder 162 | 163 | if(topk != 1500 and topk > 1500/binWidth): 164 | #print "Skipping sample, invalid configuration. 
TopK = " + str(topk) + " Total Features = " + str(1500/binWidth) 165 | set_acc.append(0) 166 | set_fpr.append(0) 167 | set_fnr.append(0) 168 | feature_sets.append(feature_folder) 169 | continue 170 | 171 | #Load configuration results 172 | #if(topk == 1500): 173 | # data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase1_NoSketch.npy" 174 | #else: 175 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy" 176 | results = np.load(data_folder) 177 | set_acc.append(results[0]) 178 | set_fpr.append(results[1]) 179 | set_fnr.append(results[2]) 180 | feature_sets.append(feature_folder) 181 | 182 | 183 | #Check best truncation value 184 | max_acc = 0 185 | max_fset = "" 186 | for i, f_set in enumerate(feature_sets[:-1]): 187 | if set_acc[i] > max_acc: 188 | max_acc = set_acc[i] 189 | max_fset = f_set 190 | print "K = " + str(binWidth) + ", Max acc: %s, Best Truncation: %s"%(max_acc, max_fset) 191 | 192 | 193 | #Plot figures 194 | fig = plt.figure(figsize=(10,4)) 195 | ax1 = fig.add_subplot(111) 196 | 197 | curr_fset = feature_sets 198 | curr_acc = set_acc 199 | curr_fpr = set_fpr 200 | curr_fnr = set_fnr 201 | #print "Current feature set: "+ str(curr_fset) 202 | 203 | ind = np.arange(len(curr_fset)) # the x locations for the groups 204 | width = 0.20 205 | 206 | rects0 = ax1.bar(ind - width - width/2, curr_acc, width, color=colors[0], label='Acc') 207 | autolabel(rects0,ax1) 208 | rects1 = ax1.bar(ind - width/2 , curr_fpr, width, color=colors[1], label='FPR') 209 | autolabel(rects1,ax1) 210 | rects2 = ax1.bar(ind + width - width/2, curr_fnr, width, color=colors[2], label='FNR') 211 | autolabel(rects2,ax1) 212 | 213 | ax1.yaxis.grid(color='black', linestyle='dotted') 214 | ax1.set_title('Truncation Scores for K ='+str(binWidth)) 215 | ax1.set_xticks(ind) 216 | print feature_sets 217 | labels = ["Top-k= " + str(int(x.split('_')[3])) + "\n(PerFlow = " + str(int(x.split('_')[3])*4) + " B)" + "\n(CGMem = " + str((n_flows * int(x.split('_')[3]) * 4)/1024) + " KB)" for x in feature_sets] 218 | labels[len(topk_features)-1] = str(int(1500/binWidth)) + " features\n(PerFlow = " + str(int(1500/binWidth)*4) + " B)" + "\n(CGMem = " + str(int((n_flows * int(1500/binWidth) * 4)/1024)) + " KB)" 219 | ax1.set_xticklabels(labels) 220 | plt.xticks(fontsize=9) 221 | ax1.legend() 222 | 223 | plt.ylim(top=1) 224 | plt.legend(loc='upper right', fontsize=10) 225 | plt.tight_layout() 226 | fig.savefig('Figures/Truncation_comparison/DeltaShaper_bin' + str(binWidth) + '_topk_NoSketch.pdf') # save the figure to file 227 | fig.savefig('Figures/Truncation_comparison/DeltaShaper_bin' + str(binWidth) + '_topk_NoSketch.png') # save the figure to file 228 | plt.close(fig) 229 | 230 | def PlotKQuantizationAndTruncationLines(binWidths, topk_features, n_flows): 231 | print "PlotKQuantizationAndTruncation" 232 | if not os.path.exists('Figures/Truncation_comparison'): 233 | os.makedirs('Figures/Truncation_comparison') 234 | 235 | for binWidth in binWidths: 236 | feature_sets = [] 237 | set_acc = [] 238 | 239 | for topk in topk_features: 240 | 241 | feature_folder = 'PL_60_' + str(binWidth) + '_' + str(topk) 242 | #print feature_folder 243 | 244 | if(topk != 1500 and topk > 1500/binWidth): 245 | #print "Skipping sample, invalid configuration. 
TopK = " + str(topk) + " Total Features = " + str(1500/binWidth) 246 | set_acc.append(0) 247 | feature_sets.append(feature_folder) 248 | continue 249 | 250 | #Load configuration results 251 | #if(topk == 1500): 252 | # data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase1_NoSketch.npy" 253 | #else: 254 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy" 255 | results = np.load(data_folder) 256 | set_acc.append(results[3]) 257 | feature_sets.append(feature_folder) 258 | 259 | 260 | #Plot figures 261 | fig = plt.figure(figsize=(10,4)) 262 | ax1 = fig.add_subplot(111) 263 | 264 | curr_fset = feature_sets 265 | curr_acc = set_acc 266 | 267 | #print "Current feature set: "+ str(curr_fset) 268 | 269 | ind = np.arange(len(curr_fset)) # the x locations for the groups 270 | 271 | 272 | ax1.plot(curr_acc, color=colors[0], marker=".", markersize=12, lw=3, label='AUC') 273 | ax1.hlines(0.95, 0, len(ind)-1, lw=3, label='Baseline, AUC = 0.95') 274 | 275 | for i,j in zip(ind,curr_acc): 276 | ax1.annotate("{0:.2f}".format(j),xy=(i-0.1,j+0.03)) 277 | 278 | plt.xlim(-0.3, len(ind)-1+0.3) 279 | ax1.yaxis.grid(color='black', linestyle='dotted') 280 | 281 | ax1.set_xticks(ind) 282 | print feature_sets 283 | labels = [str(int(x.split('_')[3])) for x in feature_sets] 284 | #labels = ["Top-n= " + str(int(x.split('_')[3])) + "\nPF = " + str(int(x.split('_')[3])*4) + " B" + "\nTM = " + str((n_flows * int(x.split('_')[3]) * 4)/1024) + " KB" for x in feature_sets] 285 | #labels[len(topk_features)-1] = str(int(1500/binWidth)) + " features\n(PF = " + str(int(1500/binWidth)*4) + " B)" + "\n(TMem = " + str(int((n_flows * int(1500/binWidth) * 4)/1024)) + " KB)" 286 | ax1.set_xticklabels(labels) 287 | plt.xticks(fontsize=9) 288 | plt.xlabel("Truncation Factor", fontsize=12) 289 | ax1.legend() 290 | 291 | 292 | plt.yticks(fontsize=12) 293 | plt.ylim(bottom=0,top=1) 294 | plt.ylabel("AUC Score", fontsize=12) 295 | 296 | plt.legend(loc='lower right', fontsize=12) 297 | plt.tight_layout() 298 | fig.savefig('Figures/Truncation_comparison/DeltaShaper_bin' + str(binWidth) + '_topk_NoSketch_Lines.pdf') # save the figure to file 299 | fig.savefig('Figures/Truncation_comparison/DeltaShaper_bin' + str(binWidth) + '_topk_NoSketch_Lines.png') # save the figure to file 300 | plt.close(fig) 301 | 302 | 303 | 304 | def GenerateFigures(binWidths, topk_features, nFlows): 305 | if not os.path.exists('Figures'): 306 | os.makedirs('Figures') 307 | 308 | PlotQuantization(binWidths, nFlows) 309 | PlotKQuantizationAndTruncation(binWidths, topk_features, nFlows) 310 | 311 | 312 | def GenerateFiguresLines(binWidths, topk_features, nFlows): 313 | if not os.path.exists('Figures'): 314 | os.makedirs('Figures') 315 | 316 | TOPK = [10, 20, 30, 40, 50] 317 | PlotQuantizationLines(binWidths, nFlows) 318 | PlotKQuantizationAndTruncationLines(binWidths, TOPK, nFlows) 319 | 320 | 321 | 322 | if __name__ == "__main__": 323 | 324 | #Quantization 325 | BIN_WIDTH = [1, 4, 8, 16, 32, 64, 128, 256] 326 | 327 | #Truncation Top-K features 328 | TOPK = [5, 10, 20, 30, 40, 50, 1500] 329 | TOPK = [10, 20, 30, 40, 50] 330 | 331 | 332 | #Total amount of flows per dataset 333 | N_FLOWS = 300 334 | 335 | PlotQuantizationLines(BIN_WIDTH, N_FLOWS) 336 | PlotKQuantizationAndTruncationLines(BIN_WIDTH, TOPK, N_FLOWS) -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/FacetAnalysis/compressive_ta.py: 
-------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | import os 4 | import math 5 | 6 | 7 | def CreateCompressiveRepresentation(MODE, BIN_WIDTH, TOPK, SIGMA_PARAM, COMPRESSIVE_RATIO): 8 | 9 | for compressive_ratio in COMPRESSIVE_RATIO: 10 | for binWidth in BIN_WIDTH: 11 | for topk in TOPK: 12 | feature_set = 'PL_60_' + str(binWidth) + '_' + str(topk) 13 | data_folder = 'FeatureSets/' + feature_set + '/' 14 | 15 | #Sensing matrix parameters 16 | N = 0 17 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r') 18 | reader = csv.reader(f, delimiter=',') 19 | for n, row in enumerate(reader): 20 | if(n == 0): 21 | N = len(row) -1 #Read number of bins from file 22 | f.close() 23 | 24 | M = N/compressive_ratio 25 | 26 | if(M < 1): 27 | print "Cannot compress further(features = %d, ratio = %d), only 1 feature left"%(N, compressive_ratio) 28 | continue 29 | 30 | np.random.seed(1) 31 | 32 | print "Compressive Ratio: %d"%(compressive_ratio) 33 | print "M: %d"%(M) 34 | print "N: %d"%(N) 35 | 36 | ###################################### 37 | # GAUSSIAN SENSING MATRIX 38 | ###################################### 39 | if MODE == "compressive_gaussian": 40 | print "Start Compressive Gaussian Representation" 41 | for sigma_param in SIGMA_PARAM: 42 | 43 | """ 44 | Generate sensing matrix 45 | """ 46 | 47 | sensing_matrix = np.random.normal(0,1,(M,N)) 48 | 49 | """ 50 | Process Phase 1 Data 51 | """ 52 | 53 | #Regular Traffic 54 | print "Compressive Gaussian: Phase 1, Regular - " + feature_set + "/CompressiveGaussian_regularTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv" 55 | output = open(data_folder + "CompressiveGaussian_regularTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w") 56 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r') 57 | reader = csv.reader(f, delimiter=',') 58 | 59 | #Process data row 60 | for n, row in enumerate(reader): 61 | if(n == 0): 62 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 63 | else: 64 | #Gather the first n packets array 65 | first_n_packets_vector = [] 66 | for i in row[:-1]: 67 | first_n_packets_vector.append(int(i)) 68 | 69 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 70 | 71 | #print "Compressed vector: " + str(compressed_vector) 72 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 73 | output.close() 74 | 75 | 76 | #Facet Traffic 77 | print "Compressive Gaussian: Phase 1, Facet - " + feature_set + "/CompressiveGaussian_facetTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv" 78 | output = open(data_folder + "CompressiveGaussian_facetTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w") 79 | f = open(data_folder + "FacetTraffic_50_phase1_dataset.csv", 'r') 80 | reader = csv.reader(f, delimiter=',') 81 | 82 | #Process data row 83 | for n, row in enumerate(reader): 84 | if(n == 0): 85 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 86 | else: 87 | #Gather the first n packets array 88 | first_n_packets_vector = [] 89 | for i in row[:-1]: 90 | first_n_packets_vector.append(int(i)) 91 | 92 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 93 | 94 | #print "Compressed vector: " + str(compressed_vector) 95 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 96 | output.close() 
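# ---------------------------------------------------------------------------
# Illustrative sketch (ours, not part of the original pipeline): the loops
# above compress each N-bin packet-length histogram x into M = N /
# compressive_ratio measurements y through a random Gaussian sensing matrix,
# y = Phi @ x.  Note that Phi is drawn from N(0, 1) regardless of
# sigma_param, which only ends up in the output file name.  A minimal,
# self-contained version of the projection:

def _gaussian_projection_sketch(histogram, compressive_ratio, seed=1):
    import numpy as _np
    x = _np.asarray(histogram, dtype=float)
    n = len(x)
    m = n // compressive_ratio             # number of compressed features
    _np.random.seed(seed)
    phi = _np.random.normal(0, 1, (m, n))  # Gaussian sensing matrix
    return _np.matmul(phi, x)              # compressed representation, length m

# Example: for a QL=16 feature set (93 bins) and ratio 4, the sketch returns
# a 23-dimensional vector per flow.
# ---------------------------------------------------------------------------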
97 | 98 | ######################################################################################## 99 | ######################################################################################## 100 | ######################################################################################## 101 | 102 | 103 | """ 104 | Process Phase 2 Data 105 | """ 106 | 107 | #Regular Traffic 108 | print "Compressive Gaussian: Phase 2, Regular - " + feature_set + "/CompressiveGaussian_regularTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv" 109 | output = open(data_folder + "CompressiveGaussian_regularTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w") 110 | f = open(data_folder + "RegularTraffic_phase2_dataset.csv", 'r') 111 | reader = csv.reader(f, delimiter=',') 112 | 113 | #Process data row 114 | for n, row in enumerate(reader): 115 | if(n == 0): 116 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 117 | else: 118 | #Gather the first n packets array 119 | first_n_packets_vector = [] 120 | for i in row[:-1]: 121 | first_n_packets_vector.append(int(i)) 122 | 123 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 124 | 125 | #print "Compressed vector: " + str(compressed_vector) 126 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 127 | output.close() 128 | 129 | 130 | #Facet Traffic 131 | print "Compressive Gaussian Phase 2, Facet - " + feature_set + "/CompressiveGaussian_facetTraffic_phase2_" + str(sigma_param) + "_" + "_" + str(compressive_ratio) + "_dataset.csv" 132 | output = open(data_folder + "CompressiveGaussian_facetTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w") 133 | f = open(data_folder + "FacetTraffic_50_phase2_dataset.csv", 'r') 134 | reader = csv.reader(f, delimiter=',') 135 | 136 | #Process data row 137 | for n, row in enumerate(reader): 138 | if(n == 0): 139 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 140 | else: 141 | #Gather the first n packets array 142 | first_n_packets_vector = [] 143 | for i in row[:-1]: 144 | first_n_packets_vector.append(int(i)) 145 | 146 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 147 | 148 | #print "Compressed vector: " + str(compressed_vector) 149 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 150 | output.close() 151 | 152 | ###################################### 153 | # BERNOULLI SENSING MATRIX 154 | ###################################### 155 | elif MODE == "compressive_bernoulli": 156 | print "Start Compressive Bernoulli Representation" 157 | 158 | """ 159 | Generate sensing matrix 160 | """ 161 | values = [-1/float(math.sqrt(N)), 1/float(math.sqrt(N))] 162 | sensing_matrix = np.random.choice(values,(M,N), p=[0.5, 0.5]) 163 | 164 | 165 | """ 166 | Process Phase 1 Data 167 | """ 168 | 169 | #Regular Traffic 170 | print "Compressive Bernoulli: Phase 1, Regular - " + feature_set + "/CompressiveBernoulli_regularTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv" 171 | output = open(data_folder + "CompressiveBernoulli_regularTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv", "w") 172 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r') 173 | reader = csv.reader(f, delimiter=',') 174 | 175 | #Process data row 176 | for n, row in enumerate(reader): 177 | if(n == 0): 178 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 179 | 
else: 180 | #Gather the first n packets array 181 | first_n_packets_vector = [] 182 | for i in row[:-1]: 183 | first_n_packets_vector.append(int(i)) 184 | 185 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 186 | 187 | #print "Compressed vector: " + str(compressed_vector) 188 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 189 | output.close() 190 | 191 | 192 | #Facet Traffic 193 | print "Compressive Bernoulli: Phase 1, Facet - " + feature_set + "/CompressiveBernoulli_facetTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv" 194 | output = open(data_folder + "CompressiveBernoulli_facetTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv", "w") 195 | f = open(data_folder + "FacetTraffic_50_phase1_dataset.csv", 'r') 196 | reader = csv.reader(f, delimiter=',') 197 | 198 | #Process data row 199 | for n, row in enumerate(reader): 200 | if(n == 0): 201 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 202 | else: 203 | #Gather the first n packets array 204 | first_n_packets_vector = [] 205 | for i in row[:-1]: 206 | first_n_packets_vector.append(int(i)) 207 | 208 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 209 | 210 | #print "Compressed vector: " + str(compressed_vector) 211 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 212 | output.close() 213 | 214 | ######################################################################################## 215 | ######################################################################################## 216 | ######################################################################################## 217 | 218 | 219 | """ 220 | Process Phase 2 Data 221 | """ 222 | 223 | #Regular Traffic 224 | print "Compressive Bernoulli: Phase 2, Regular - " + feature_set + "/CompressiveBernoulli_regularTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv" 225 | output = open(data_folder + "CompressiveBernoulli_regularTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv", "w") 226 | f = open(data_folder + "RegularTraffic_phase2_dataset.csv", 'r') 227 | reader = csv.reader(f, delimiter=',') 228 | 229 | #Process data row 230 | for n, row in enumerate(reader): 231 | if(n == 0): 232 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 233 | else: 234 | #Gather the first n packets array 235 | first_n_packets_vector = [] 236 | for i in row[:-1]: 237 | first_n_packets_vector.append(int(i)) 238 | 239 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 240 | 241 | #print "Compressed vector: " + str(compressed_vector) 242 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 243 | output.close() 244 | 245 | 246 | #Facet Traffic 247 | print "Compressive Bernoulli Phase 2, Facet - " + feature_set + "/CompressiveBernoulli_facetTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv" 248 | output = open(data_folder + "CompressiveBernoulli_facetTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv", "w") 249 | f = open(data_folder + "FacetTraffic_50_phase2_dataset.csv", 'r') 250 | reader = csv.reader(f, delimiter=',') 251 | 252 | #Process data row 253 | for n, row in enumerate(reader): 254 | if(n == 0): 255 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 256 | else: 257 | #Gather the first n packets array 258 | first_n_packets_vector = [] 259 | for i in row[:-1]: 260 | first_n_packets_vector.append(int(i)) 261 | 262 | 
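# ---------------------------------------------------------------------------
# Sketch (ours): the Bernoulli sensing matrix built for this branch draws
# every entry uniformly from {-1/sqrt(N), +1/sqrt(N)}; unlike the Gaussian
# variant it is generated once per configuration, since there is no
# sigma_param to sweep.  A minimal reproduction of that construction:
#
#   import math
#   import numpy as np
#   N, M = 93, 23                               # e.g. QL=16 bins, ratio 4
#   values = [-1 / math.sqrt(N), 1 / math.sqrt(N)]
#   phi = np.random.choice(values, (M, N), p=[0.5, 0.5])
# ---------------------------------------------------------------------------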
compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 263 | 264 | #print "Compressed vector: " + str(compressed_vector) 265 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 266 | output.close() -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/DeltaShaperAnalysis/compressive_ta.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | import os 4 | import math 5 | 6 | 7 | def CreateCompressiveRepresentation(MODE, BIN_WIDTH, TOPK, SIGMA_PARAM, COMPRESSIVE_RATIO): 8 | 9 | for compressive_ratio in COMPRESSIVE_RATIO: 10 | for binWidth in BIN_WIDTH: 11 | for topk in TOPK: 12 | feature_set = 'PL_60_' + str(binWidth) + '_' + str(topk) 13 | data_folder = 'FeatureSets/' + feature_set + '/' 14 | 15 | #Sensing matrix parameters 16 | N = 0 17 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r') 18 | reader = csv.reader(f, delimiter=',') 19 | for n, row in enumerate(reader): 20 | if(n == 0): 21 | N = len(row) -1 #Read number of bins from file 22 | f.close() 23 | 24 | M = N/compressive_ratio 25 | 26 | if(M < 1): 27 | print "Cannot compress further(features = %d, ratio = %d), only 1 feature left"%(N, compressive_ratio) 28 | continue 29 | 30 | np.random.seed(1) 31 | 32 | print "Compressive Ratio: %d"%(compressive_ratio) 33 | print "M: %d"%(M) 34 | print "N: %d"%(N) 35 | 36 | ###################################### 37 | # GAUSSIAN SENSING MATRIX 38 | ###################################### 39 | if MODE == "compressive_gaussian": 40 | print "Start Compressive Gaussian Representation" 41 | for sigma_param in SIGMA_PARAM: 42 | 43 | """ 44 | Generate sensing matrix 45 | """ 46 | 47 | sensing_matrix = np.random.normal(0,1,(M,N)) 48 | 49 | """ 50 | Process Phase 1 Data 51 | """ 52 | 53 | #Regular Traffic 54 | print "Compressive Gaussian: Phase 1, Regular - " + feature_set + "/CompressiveGaussian_regularTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv" 55 | output = open(data_folder + "CompressiveGaussian_regularTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w") 56 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r') 57 | reader = csv.reader(f, delimiter=',') 58 | 59 | #Process data row 60 | for n, row in enumerate(reader): 61 | if(n == 0): 62 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 63 | else: 64 | #Gather the first n packets array 65 | first_n_packets_vector = [] 66 | for i in row[:-1]: 67 | first_n_packets_vector.append(int(i)) 68 | 69 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 70 | 71 | #print "Compressed vector: " + str(compressed_vector) 72 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 73 | output.close() 74 | 75 | 76 | #DeltaShaper Traffic 77 | print "Compressive Gaussian: Phase 1, DeltaShaper - " + feature_set + "/CompressiveGaussian_deltashaperTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv" 78 | output = open(data_folder + "CompressiveGaussian_deltashaperTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w") 79 | f = open(data_folder + "DeltaShaperTraffic_320_phase1_dataset.csv", 'r') 80 | reader = csv.reader(f, delimiter=',') 81 | 82 | #Process data row 83 | for n, row in enumerate(reader): 84 | if(n == 0): 85 | output.write(",".join(str(x) for x in 
range(0,M)) + "," + row[-1] + "\n") 86 | else: 87 | #Gather the first n packets array 88 | first_n_packets_vector = [] 89 | for i in row[:-1]: 90 | first_n_packets_vector.append(int(i)) 91 | 92 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 93 | 94 | #print "Compressed vector: " + str(compressed_vector) 95 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 96 | output.close() 97 | 98 | ######################################################################################## 99 | ######################################################################################## 100 | ######################################################################################## 101 | 102 | 103 | """ 104 | Process Phase 2 Data 105 | """ 106 | 107 | #Regular Traffic 108 | print "Compressive Gaussian: Phase 2, Regular - " + feature_set + "/CompressiveGaussian_regularTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv" 109 | output = open(data_folder + "CompressiveGaussian_regularTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w") 110 | f = open(data_folder + "RegularTraffic_phase2_dataset.csv", 'r') 111 | reader = csv.reader(f, delimiter=',') 112 | 113 | #Process data row 114 | for n, row in enumerate(reader): 115 | if(n == 0): 116 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 117 | else: 118 | #Gather the first n packets array 119 | first_n_packets_vector = [] 120 | for i in row[:-1]: 121 | first_n_packets_vector.append(int(i)) 122 | 123 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 124 | 125 | #print "Compressed vector: " + str(compressed_vector) 126 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 127 | output.close() 128 | 129 | 130 | #DeltaShaper Traffic 131 | print "Compressive Gaussian Phase 2, DeltaShaper - " + feature_set + "/CompressiveGaussian_deltashaperTraffic_phase2_" + str(sigma_param) + "_" + "_" + str(compressive_ratio) + "_dataset.csv" 132 | output = open(data_folder + "CompressiveGaussian_deltashaperTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w") 133 | f = open(data_folder + "DeltaShaperTraffic_320_phase2_dataset.csv", 'r') 134 | reader = csv.reader(f, delimiter=',') 135 | 136 | #Process data row 137 | for n, row in enumerate(reader): 138 | if(n == 0): 139 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 140 | else: 141 | #Gather the first n packets array 142 | first_n_packets_vector = [] 143 | for i in row[:-1]: 144 | first_n_packets_vector.append(int(i)) 145 | 146 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 147 | 148 | #print "Compressed vector: " + str(compressed_vector) 149 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 150 | output.close() 151 | 152 | ###################################### 153 | # BERNOULLI SENSING MATRIX 154 | ###################################### 155 | elif MODE == "compressive_bernoulli": 156 | print "Start Compressive Bernoulli Representation" 157 | 158 | """ 159 | Generate sensing matrix 160 | """ 161 | values = [-1/float(math.sqrt(N)), 1/float(math.sqrt(N))] 162 | sensing_matrix = np.random.choice(values,(M,N), p=[0.5, 0.5]) 163 | 164 | 165 | """ 166 | Process Phase 1 Data 167 | """ 168 | 169 | #Regular Traffic 170 | print "Compressive Bernoulli: Phase 1, Regular - " + feature_set + 
"/CompressiveBernoulli_regularTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv" 171 | output = open(data_folder + "CompressiveBernoulli_regularTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv", "w") 172 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r') 173 | reader = csv.reader(f, delimiter=',') 174 | 175 | #Process data row 176 | for n, row in enumerate(reader): 177 | if(n == 0): 178 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 179 | else: 180 | #Gather the first n packets array 181 | first_n_packets_vector = [] 182 | for i in row[:-1]: 183 | first_n_packets_vector.append(int(i)) 184 | 185 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 186 | 187 | #print "Compressed vector: " + str(compressed_vector) 188 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 189 | output.close() 190 | 191 | 192 | #DeltaShaper Traffic 193 | print "Compressive Bernoulli: Phase 1, DeltaShaper - " + feature_set + "/CompressiveBernoulli_deltashaperTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv" 194 | output = open(data_folder + "CompressiveBernoulli_deltashaperTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv", "w") 195 | f = open(data_folder + "DeltaShaperTraffic_320_phase1_dataset.csv", 'r') 196 | reader = csv.reader(f, delimiter=',') 197 | 198 | #Process data row 199 | for n, row in enumerate(reader): 200 | if(n == 0): 201 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 202 | else: 203 | #Gather the first n packets array 204 | first_n_packets_vector = [] 205 | for i in row[:-1]: 206 | first_n_packets_vector.append(int(i)) 207 | 208 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 209 | 210 | #print "Compressed vector: " + str(compressed_vector) 211 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 212 | output.close() 213 | 214 | ######################################################################################## 215 | ######################################################################################## 216 | ######################################################################################## 217 | 218 | 219 | """ 220 | Process Phase 2 Data 221 | """ 222 | 223 | #Regular Traffic 224 | print "Compressive Bernoulli: Phase 2, Regular - " + feature_set + "/CompressiveBernoulli_regularTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv" 225 | output = open(data_folder + "CompressiveBernoulli_regularTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv", "w") 226 | f = open(data_folder + "RegularTraffic_phase2_dataset.csv", 'r') 227 | reader = csv.reader(f, delimiter=',') 228 | 229 | #Process data row 230 | for n, row in enumerate(reader): 231 | if(n == 0): 232 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 233 | else: 234 | #Gather the first n packets array 235 | first_n_packets_vector = [] 236 | for i in row[:-1]: 237 | first_n_packets_vector.append(int(i)) 238 | 239 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 240 | 241 | #print "Compressed vector: " + str(compressed_vector) 242 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 243 | output.close() 244 | 245 | 246 | #DeltaShaper Traffic 247 | print "Compressive Bernoulli Phase 2, DeltaShaper - " + feature_set + "/CompressiveBernoulli_deltashaperTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv" 248 | output = open(data_folder 
+ "CompressiveBernoulli_deltashaperTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv", "w") 249 | f = open(data_folder + "DeltaShaperTraffic_320_phase2_dataset.csv", 'r') 250 | reader = csv.reader(f, delimiter=',') 251 | 252 | #Process data row 253 | for n, row in enumerate(reader): 254 | if(n == 0): 255 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 256 | else: 257 | #Gather the first n packets array 258 | first_n_packets_vector = [] 259 | for i in row[:-1]: 260 | first_n_packets_vector.append(int(i)) 261 | 262 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 263 | 264 | #print "Compressed vector: " + str(compressed_vector) 265 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 266 | output.close() -------------------------------------------------------------------------------- /Flow Marker Accumulator/flowlens-v1model.p4: -------------------------------------------------------------------------------- 1 | /* -*- P4_16 -*- */ 2 | #include 3 | #include 4 | 5 | /*Set number of shifts according to the quantization level 6 | QL=2, 1 7 | QL=4, 2 8 | QL=8, 3 9 | QL=16, 4 10 | QL=32, 5 11 | QL=64, 6 12 | QL=128, 7 13 | QL=256, 8 14 | */ 15 | 16 | /* In our running example, we will use QL=16 */ 17 | const bit<8> BIN_WIDTH_SHIFT = 4; 18 | 19 | /* Number of counters for each flow */ 20 | const bit<32> FLOW_BINS = 1500 >> BIN_WIDTH_SHIFT; //94 flow counters for QL=16 21 | 22 | /* Number of flows in each partition */ 23 | const bit<32> FLOWS_PER_PARTITION = 3000; 24 | 25 | const bit<32> PARTITION_SIZE = FLOWS_PER_PARTITION*FLOW_BINS; 26 | 27 | /* Number of packet sizes considered for truncation */ 28 | const bit<32> NUM_PKT_SIZES = 1500; 29 | 30 | /* To flag bins not to be counted */ 31 | const bit<1> NOBIN_FLAG = 0; 32 | 33 | 34 | typedef bit<9> egressSpec_t; 35 | typedef bit<48> macAddr_t; 36 | typedef bit<32> ip4Addr_t; 37 | const bit<16> TYPE_IPV4 = 0x800; 38 | typedef bit<8> ip_protocol_t; 39 | const ip_protocol_t IP_PROTOCOLS_TCP = 6; 40 | const ip_protocol_t IP_PROTOCOLS_UDP = 17; 41 | 42 | /************************************************************************* 43 | *********************** H E A D E R S *********************************** 44 | *************************************************************************/ 45 | 46 | header ethernet_t { 47 | macAddr_t dstAddr; 48 | macAddr_t srcAddr; 49 | bit<16> etherType; 50 | } 51 | 52 | header ipv4_t { 53 | bit<4> version; 54 | bit<4> ihl; 55 | bit<8> diffserv; 56 | bit<16> totalLen; 57 | bit<16> identification; 58 | bit<3> flags; 59 | bit<13> fragOffset; 60 | bit<8> ttl; 61 | bit<8> protocol; 62 | bit<16> hdrChecksum; 63 | ip4Addr_t srcAddr; 64 | ip4Addr_t dstAddr; 65 | } 66 | 67 | header tcp_t { 68 | bit<16> srcPort; 69 | bit<16> dstPort; 70 | bit<32> seqNo; 71 | bit<32> ackNo; 72 | bit<4> dataOffset; 73 | bit<3> res; 74 | bit<3> ecn; 75 | bit<6> ctrl; 76 | bit<16> window; 77 | bit<16> checksum; 78 | bit<16> urgentPtr; 79 | } 80 | 81 | header udp_t { 82 | bit<16> srcPort; 83 | bit<16> dstPort; 84 | bit<16> length_; 85 | bit<16> checksum; 86 | } 87 | 88 | //User-defined metadata 89 | struct metadata { 90 | bit truncation_flag; // marks whether or not the current pkt has to be counted 91 | bit<32> rg_bin_offset; // this is computed by adding the binIndex_posTruncation to the flow_offset 92 | bit<32> binIndex_preTruncation; 93 | bit<32> binIndex_posTruncation; 94 | } 95 | 96 | struct headers { 97 | ethernet_t ethernet; 98 | ipv4_t ipv4; 99 | tcp_t tcp; 
100 | udp_t udp; 101 | } 102 | 103 | /************************************************************************* 104 | *********************** P A R S E R *********************************** 105 | *************************************************************************/ 106 | 107 | parser MyParser(packet_in packet, 108 | out headers hdr, 109 | inout metadata meta, 110 | inout standard_metadata_t standard_metadata) { 111 | 112 | // Initial state of the parser 113 | state start { 114 | transition parse_ethernet; 115 | } 116 | 117 | state parse_ethernet { 118 | packet.extract(hdr.ethernet); 119 | transition select(hdr.ethernet.etherType) { 120 | TYPE_IPV4: parse_ipv4; 121 | default: accept; 122 | } 123 | } 124 | 125 | state parse_ipv4 { 126 | packet.extract(hdr.ipv4); 127 | transition select(hdr.ipv4.protocol) { 128 | 6: parse_tcp; 129 | 17: parse_udp; 130 | default: accept; 131 | } 132 | } 133 | 134 | state parse_tcp { 135 | packet.extract(hdr.tcp); 136 | transition accept; 137 | } 138 | 139 | state parse_udp { 140 | packet.extract(hdr.udp); 141 | transition accept; 142 | } 143 | } 144 | 145 | 146 | /************************************************************************* 147 | ************ C H E C K S U M V E R I F I C A T I O N ************* 148 | *************************************************************************/ 149 | 150 | control MyVerifyChecksum(inout headers hdr, inout metadata meta) { 151 | apply { } 152 | } 153 | 154 | /************************************************************************* 155 | ************** I N G R E S S P R O C E S S I N G ******************* 156 | *************************************************************************/ 157 | 158 | control MyIngress(inout headers hdr, 159 | inout metadata meta, 160 | inout standard_metadata_t standard_metadata) { 161 | 162 | action drop() { 163 | mark_to_drop(standard_metadata); 164 | } 165 | 166 | 167 | /////////////////////////////////////////////////////// 168 | //Set ipv4 forwarding for packets traversing the switch 169 | /////////////////////////////////////////////////////// 170 | action ipv4_forward(macAddr_t dstAddr, egressSpec_t port) { 171 | standard_metadata.egress_spec = port; //Sets the egress port for the next hop. 172 | hdr.ethernet.srcAddr = hdr.ethernet.dstAddr; //Updates the ethernet destination address with the address of the next hop. 173 | hdr.ethernet.dstAddr = dstAddr; //Updates the ethernet source address with the address of the switch. 
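        // Note (added for clarity): taken together, the two assignments above
        // perform the usual L2 rewrite for the next hop -- the outgoing
        // source MAC is set to the address the frame previously carried as
        // its destination (i.e. this switch's interface), and the destination
        // MAC becomes the next-hop address supplied by the control plane.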
174 | hdr.ipv4.ttl = hdr.ipv4.ttl - 1; //Decrements time to live 175 | } 176 | 177 | 178 | table ipv4_lpm { 179 | key = { 180 | hdr.ipv4.dstAddr: exact; 181 | hdr.ipv4.srcAddr: exact; 182 | } 183 | actions = { 184 | ipv4_forward; 185 | drop; 186 | } 187 | size = 1024; 188 | default_action = drop(); 189 | } 190 | 191 | apply { 192 | 193 | if (hdr.ipv4.isValid()) { 194 | 195 | ipv4_lpm.apply(); 196 | 197 | } 198 | else { 199 | drop(); 200 | } 201 | } 202 | } 203 | 204 | /************************************************************************* 205 | **************** E G R E S S P R O C E S S I N G ******************* 206 | *************************************************************************/ 207 | 208 | control MyEgress(inout headers hdr, 209 | inout metadata meta, 210 | inout standard_metadata_t standard_metadata) { 211 | 212 | register>(PARTITION_SIZE) reg_grid0; 213 | register>(PARTITION_SIZE) reg_grid1; 214 | register>(PARTITION_SIZE) reg_grid2; 215 | register>(PARTITION_SIZE) reg_grid3; 216 | register>(PARTITION_SIZE) reg_grid4; 217 | register>(PARTITION_SIZE) reg_grid5; 218 | register>(PARTITION_SIZE) reg_grid6; 219 | register>(PARTITION_SIZE) reg_grid7; 220 | register>(PARTITION_SIZE) reg_grid8; 221 | 222 | 223 | //****************** Register Actions Definition************************ 224 | action reg_grid0_action() { 225 | bit<16> value; 226 | reg_grid0.read(value, meta.rg_bin_offset); 227 | value = value+1; 228 | reg_grid0.write(meta.rg_bin_offset, value); 229 | } 230 | 231 | action reg_grid1_action() { 232 | bit<16> value; 233 | reg_grid1.read(value, meta.rg_bin_offset); 234 | value = value+1; 235 | reg_grid1.write(meta.rg_bin_offset, value); 236 | } 237 | 238 | action reg_grid2_action() { 239 | bit<16> value; 240 | reg_grid2.read(value, meta.rg_bin_offset); 241 | value = value+1; 242 | reg_grid2.write(meta.rg_bin_offset, value); 243 | } 244 | 245 | action reg_grid3_action() { 246 | bit<16> value; 247 | reg_grid3.read(value, meta.rg_bin_offset); 248 | value = value+1; 249 | reg_grid3.write(meta.rg_bin_offset, value); 250 | } 251 | 252 | action reg_grid4_action() { 253 | bit<16> value; 254 | reg_grid4.read(value, meta.rg_bin_offset); 255 | value = value+1; 256 | reg_grid4.write(meta.rg_bin_offset, value); 257 | } 258 | 259 | action reg_grid5_action() { 260 | bit<16> value; 261 | reg_grid5.read(value, meta.rg_bin_offset); 262 | value = value+1; 263 | reg_grid5.write(meta.rg_bin_offset, value); 264 | } 265 | 266 | action reg_grid6_action() { 267 | bit<16> value; 268 | reg_grid6.read(value, meta.rg_bin_offset); 269 | value = value+1; 270 | reg_grid6.write(meta.rg_bin_offset, value); 271 | } 272 | 273 | action reg_grid7_action() { 274 | bit<16> value; 275 | reg_grid7.read(value, meta.rg_bin_offset); 276 | value = value+1; 277 | reg_grid7.write(meta.rg_bin_offset, value); 278 | } 279 | 280 | action reg_grid8_action() { 281 | bit<16> value; 282 | reg_grid8.read(value, meta.rg_bin_offset); 283 | value = value+1; 284 | reg_grid8.write(meta.rg_bin_offset, value); 285 | } 286 | 287 | //******************End Register Actions Definition********************* 288 | 289 | //****************** Other Actions Definition************************ 290 | 291 | // flow_offset: is used for indexing the flow within a bin of the reg_grid 292 | action set_flow_data(bit<32> flow_offset) { 293 | meta.rg_bin_offset = flow_offset + meta.binIndex_posTruncation; 294 | } 295 | 296 | action quantization_act(){ 297 | meta.binIndex_preTruncation = (bit<32>) (standard_metadata.packet_length >> BIN_WIDTH_SHIFT); 298 
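    // Worked example (illustrative): with QL = 16 (BIN_WIDTH_SHIFT = 4), a
    // 1280-byte packet is quantized to bin 1280 >> 4 = 80, and FLOW_BINS =
    // 1500 >> 4 = 93 counters are kept per flow.  After truncation_tbl maps
    // the bin to its post-truncation index, the register slot that gets
    // incremented is rg_bin_offset = flow_offset + binIndex_posTruncation,
    // where flow_offset is the control-plane-assigned base of the flow
    // inside its partition (typically a multiple of the per-flow bin count
    // so that flows do not overlap).  One partition of 3000 flows therefore
    // holds 3000 * 93 = 279,000 16-bit counters, roughly 545 KB of register
    // memory.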
| } 299 | 300 | action truncate_binIndex(bit<32> new_index, bit flag) { 301 | meta.binIndex_posTruncation = new_index; 302 | meta.truncation_flag = flag; 303 | } 304 | 305 | 306 | //******************End Other Actions Definition********************* 307 | 308 | //******************Tables Definition************************** 309 | 310 | table flow_tbl0 { 311 | key = { 312 | hdr.ipv4.dstAddr: exact; 313 | hdr.ipv4.srcAddr: exact; 314 | meta.truncation_flag : exact; 315 | } 316 | actions = { 317 | set_flow_data; 318 | NoAction(); 319 | } 320 | default_action = NoAction(); 321 | size = FLOWS_PER_PARTITION; 322 | } 323 | 324 | table flow_tbl1 { 325 | key = { 326 | hdr.ipv4.dstAddr: exact; 327 | hdr.ipv4.srcAddr: exact; 328 | meta.truncation_flag : exact; 329 | } 330 | actions = { 331 | set_flow_data; 332 | NoAction(); 333 | } 334 | default_action = NoAction(); 335 | size = FLOWS_PER_PARTITION; 336 | } 337 | 338 | table flow_tbl2 { 339 | key = { 340 | hdr.ipv4.dstAddr: exact; 341 | hdr.ipv4.srcAddr: exact; 342 | meta.truncation_flag : exact; 343 | } 344 | actions = { 345 | set_flow_data; 346 | NoAction(); 347 | } 348 | default_action = NoAction(); 349 | size = FLOWS_PER_PARTITION; 350 | } 351 | 352 | table flow_tbl3 { 353 | key = { 354 | hdr.ipv4.dstAddr: exact; 355 | hdr.ipv4.srcAddr: exact; 356 | meta.truncation_flag : exact; 357 | } 358 | actions = { 359 | set_flow_data; 360 | NoAction(); 361 | } 362 | default_action = NoAction(); 363 | size = FLOWS_PER_PARTITION; 364 | } 365 | 366 | table flow_tbl4 { 367 | key = { 368 | hdr.ipv4.dstAddr: exact; 369 | hdr.ipv4.srcAddr: exact; 370 | meta.truncation_flag : exact; 371 | } 372 | actions = { 373 | set_flow_data; 374 | NoAction(); 375 | } 376 | default_action = NoAction(); 377 | size = FLOWS_PER_PARTITION; 378 | } 379 | 380 | table flow_tbl5 { 381 | key = { 382 | hdr.ipv4.dstAddr: exact; 383 | hdr.ipv4.srcAddr: exact; 384 | meta.truncation_flag : exact; 385 | } 386 | actions = { 387 | set_flow_data; 388 | NoAction(); 389 | } 390 | default_action = NoAction(); 391 | size = FLOWS_PER_PARTITION; 392 | } 393 | 394 | table flow_tbl6 { 395 | key = { 396 | hdr.ipv4.dstAddr: exact; 397 | hdr.ipv4.srcAddr: exact; 398 | meta.truncation_flag : exact; 399 | } 400 | actions = { 401 | set_flow_data; 402 | NoAction(); 403 | } 404 | default_action = NoAction(); 405 | size = FLOWS_PER_PARTITION; 406 | } 407 | 408 | table flow_tbl7 { 409 | key = { 410 | hdr.ipv4.dstAddr: exact; 411 | hdr.ipv4.srcAddr: exact; 412 | meta.truncation_flag : exact; 413 | } 414 | actions = { 415 | set_flow_data; 416 | NoAction(); 417 | } 418 | default_action = NoAction(); 419 | size = FLOWS_PER_PARTITION; 420 | } 421 | 422 | table flow_tbl8 { 423 | key = { 424 | hdr.ipv4.dstAddr: exact; 425 | hdr.ipv4.srcAddr: exact; 426 | meta.truncation_flag : exact; 427 | } 428 | actions = { 429 | set_flow_data; 430 | NoAction(); 431 | } 432 | default_action = NoAction(); 433 | size = FLOWS_PER_PARTITION; 434 | } 435 | 436 | table truncation_tbl { 437 | key = { 438 | meta.binIndex_preTruncation: exact; 439 | } 440 | actions = { 441 | truncate_binIndex(); 442 | NoAction(); 443 | } 444 | default_action = truncate_binIndex(0, NOBIN_FLAG); 445 | size = NUM_PKT_SIZES; 446 | } 447 | 448 | 449 | //******************End Tables Definition*********************** 450 | 451 | 452 | apply { 453 | 454 | quantization_act(); 455 | 456 | truncation_tbl.apply(); 457 | 458 | if(flow_tbl0.apply().hit) { 459 | reg_grid0_action(); 460 | } 461 | else { 462 | if(flow_tbl1.apply().hit) { 463 | reg_grid1_action(); 464 | } 465 | 
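                // Control-plane sketch (hypothetical, assuming the BMv2
                // simple_switch_CLI and that counted bins are installed with
                // truncation_flag = 1; addresses and offsets are made up):
                //
                //   # keep quantized bin 80 and remap it to compact index 3
                //   table_add truncation_tbl truncate_binIndex 80 => 3 1
                //   # pin flow (10.0.0.1 -> 10.0.0.2) to slot 50 of partition 0
                //   table_add flow_tbl0 set_flow_data 10.0.0.2 10.0.0.1 1 => 4650
                //
                // where 4650 = 50 * 93 would be that flow's base offset into
                // reg_grid0 for QL = 16.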
else { 466 | if(flow_tbl2.apply().hit) { 467 | reg_grid2_action(); 468 | } 469 | else { 470 | if(flow_tbl3.apply().hit) { 471 | reg_grid3_action(); 472 | } 473 | else { 474 | if(flow_tbl4.apply().hit) { 475 | reg_grid4_action(); 476 | } 477 | else { 478 | if(flow_tbl5.apply().hit) { 479 | reg_grid5_action(); 480 | } 481 | else { 482 | if(flow_tbl6.apply().hit) { 483 | reg_grid6_action(); 484 | } 485 | else { 486 | if(flow_tbl7.apply().hit) { 487 | reg_grid7_action(); 488 | } 489 | else { 490 | if(flow_tbl8.apply().hit) { 491 | reg_grid8_action(); 492 | } 493 | } 494 | } 495 | } 496 | } 497 | } 498 | } 499 | } 500 | } 501 | 502 | } // end of the apply block 503 | 504 | } 505 | 506 | /************************************************************************* 507 | ************* C H E C K S U M C O M P U T A T I O N ************** 508 | *************************************************************************/ 509 | 510 | control MyComputeChecksum(inout headers hdr, inout metadata meta) { 511 | apply { 512 | update_checksum( 513 | hdr.ipv4.isValid(), 514 | { hdr.ipv4.version, 515 | hdr.ipv4.ihl, 516 | hdr.ipv4.diffserv, 517 | hdr.ipv4.totalLen, 518 | hdr.ipv4.identification, 519 | hdr.ipv4.flags, 520 | hdr.ipv4.fragOffset, 521 | hdr.ipv4.ttl, 522 | hdr.ipv4.protocol, 523 | hdr.ipv4.srcAddr, 524 | hdr.ipv4.dstAddr }, 525 | hdr.ipv4.hdrChecksum, 526 | HashAlgorithm.csum16); 527 | } 528 | } 529 | 530 | 531 | /************************************************************************* 532 | *********************** D E P A R S E R ******************************* 533 | *************************************************************************/ 534 | 535 | control MyDeparser(packet_out packet, in headers hdr) { 536 | 537 | //deparser that selects the order in which fields inserted into the outgoing packet. 538 | apply { 539 | packet.emit(hdr.ethernet); 540 | packet.emit(hdr.ipv4); 541 | packet.emit(hdr.tcp); 542 | packet.emit(hdr.udp); 543 | } 544 | } 545 | 546 | /************************************************************************* 547 | *********************** S W I T C H ******************************* 548 | *************************************************************************/ 549 | 550 | V1Switch( 551 | MyParser(), 552 | MyVerifyChecksum(), 553 | MyIngress(), 554 | MyEgress(), 555 | MyComputeChecksum(), 556 | MyDeparser() 557 | ) main; 558 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/FacetAnalysis/generateFeatures.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import collections 3 | import dpkt 4 | import subprocess 5 | import socket 6 | import os 7 | import sys 8 | import math 9 | import csv 10 | import numpy as np 11 | from itertools import product 12 | from scipy.stats import kurtosis, skew 13 | import time 14 | import glob 15 | 16 | 17 | DEST_IP = '172.31.0.2' 18 | SOURCE_IP = '172.31.0.19' 19 | 20 | def MergeDatasets(data_folder): 21 | if(os.path.exists(data_folder + '/full_dataset.csv')): 22 | os.remove(data_folder + '/full_dataset.csv') 23 | 24 | features_files = [data_folder + "facet_dataset.csv", data_folder + "RegularTraffic_dataset.csv"] 25 | 26 | print "Merging full dataset..." 
27 | header_saved = False 28 | with open(data_folder + '/full_dataset.csv','wb') as fout: 29 | for filename in features_files: 30 | print "merging " + filename 31 | with open(filename) as fin: 32 | header = next(fin) 33 | if not header_saved: 34 | fout.write(header) 35 | header_saved = True 36 | for line in fin: 37 | fout.write(line) 38 | print "Dataset merged!" 39 | 40 | 41 | def CombinedMerging(data_folder): 42 | if(os.path.exists(data_folder + '/regular_50_dataset.csv')): 43 | os.remove(data_folder + '/regular_50_dataset.csv') 44 | 45 | features_files = [data_folder + "FacetTraffic_50_dataset.csv", data_folder + "RegularTraffic_dataset.csv"] 46 | 47 | print "Merging dataset..." 48 | header_saved = False 49 | with open(data_folder + '/regular_50_dataset.csv','wb') as fout: 50 | for filename in features_files: 51 | print "merging " + filename 52 | with open(filename) as fin: 53 | header = next(fin) 54 | if not header_saved: 55 | fout.write(header) 56 | header_saved = True 57 | for line in fin: 58 | fout.write(line) 59 | print "Dataset merged!" 60 | 61 | 62 | def MergeSamples(data_folder): 63 | #Generate training dataset 64 | facet_files = glob.glob(data_folder + "/FacetTraffic_*.csv") 65 | 66 | header_saved = False 67 | with open(data_folder + '/facet_dataset.csv','wb') as fout: 68 | for filename in facet_files: 69 | with open(filename) as fin: 70 | header = next(fin) 71 | if not header_saved: 72 | fout.write(header) 73 | header_saved = True 74 | for line in fin: 75 | fout.write(line) 76 | 77 | 78 | def GenerateDatasets(data_folder): 79 | MergeSamples(data_folder) 80 | CombinedMerging(data_folder) 81 | #MergeDatasets(data_folder) 82 | 83 | 84 | def RoundToNearest(n, m): 85 | r = n % m 86 | return n + m - r if r + r >= m else n - r 87 | 88 | 89 | def FeatureExtractionPLBenchmark(sampleFolder, binWidth, topk): 90 | #Bucket importance in decreasing order 91 | BUCKETS_TO_MEASURE = [] 92 | 93 | #Measure interesting buckets 94 | if(topk != 1500): 95 | #Buckets in decreasing importance order 96 | f_imp = np.load('classificationResults/PL_60_' + str(binWidth) + '_1500/FeatureImportance_XGBoost_FacetTraffic_50.npy') 97 | #Print top k 98 | #for f in f_imp: 99 | # print str(f[1]) + " " + str(f[2]) 100 | 101 | if(topk > len(f_imp)): 102 | print "Skipping, not enough features to accomodate for. 
TopK = " + str(topk) + " Features = " + str(len(f_imp)) 103 | return 104 | for i in range(0,topk): 105 | b = int(f_imp[i][2].split("_")[1]) 106 | print "Top-" + str(i) + " = " + str(b) 107 | BUCKETS_TO_MEASURE.append(b) 108 | 109 | #Measure all buckets 110 | elif(topk == 1500): 111 | for i in range(0,1500,binWidth): 112 | BUCKETS_TO_MEASURE.append(i/binWidth) 113 | 114 | 115 | quantized_buckets_to_measure = sorted(BUCKETS_TO_MEASURE) 116 | print "Quantized buckets to measure = " + str(quantized_buckets_to_measure) 117 | print "Number of buckets to measure = " + str(len(quantized_buckets_to_measure)) 118 | 119 | 120 | traceInterval = 60 #Amount of time in packet trace to consider for feature extraction 121 | feature_set_folder = 'FeatureSets/PL_' + str(traceInterval) + "_" + str(binWidth) + "_" + str(topk) 122 | print feature_set_folder 123 | 124 | if not os.path.exists(feature_set_folder): 125 | os.makedirs(feature_set_folder) 126 | arff_path = feature_set_folder + '/' + os.path.basename(sampleFolder) + '_dataset.csv' 127 | arff = open(arff_path, 'wb') 128 | written_header = False 129 | 130 | 131 | for sample in os.listdir(sampleFolder): 132 | f = open(sampleFolder + "/" + sample + "/" + sample + ".pcap") 133 | pcap = dpkt.pcap.Reader(f) 134 | 135 | #Analyse packets transmited 136 | bin_dict = {} 137 | 138 | 139 | for i in quantized_buckets_to_measure: 140 | bin_dict[i] = 0 141 | 142 | 143 | firstTime = 0.0 144 | setFirst = False 145 | for ts, buf in pcap: 146 | if(not(setFirst)): 147 | firstTime = ts 148 | setFirst = True 149 | 150 | if(ts < (firstTime + traceInterval)): 151 | 152 | eth = dpkt.ethernet.Ethernet(buf) 153 | ip_hdr = eth.data 154 | try: 155 | src_ip_addr_str = socket.inet_ntoa(ip_hdr.src) 156 | #Target UDP communication between both cluster machines 157 | if (ip_hdr.p == 17 and src_ip_addr_str == SOURCE_IP): 158 | binned = RoundToNearest(len(buf),binWidth) 159 | if(binned/binWidth in quantized_buckets_to_measure): 160 | bin_dict[binned/binWidth]+=1 161 | except: 162 | pass 163 | f.close() 164 | 165 | od_dict = collections.OrderedDict(sorted(bin_dict.items(), key=lambda t: float(t[0]))) 166 | bin_list = [] 167 | for i in od_dict: 168 | bin_list.append(od_dict[i]) 169 | 170 | 171 | label = os.path.basename(sampleFolder) 172 | if('Regular' in sampleFolder): 173 | label = 'Regular' 174 | 175 | #Write sample features to the csv file 176 | f_names = [] 177 | f_values = [] 178 | 179 | for i, b in enumerate(bin_list): 180 | f_names.append('packetLengthBin_' + str(quantized_buckets_to_measure[i])) 181 | f_values.append(b) 182 | 183 | 184 | #print len(f_names) 185 | f_names.append('Class') 186 | f_values.append(label) 187 | 188 | if(not written_header): 189 | arff.write(', '.join(f_names)) 190 | arff.write('\n') 191 | print "Writing header" 192 | written_header = True 193 | 194 | l = [] 195 | for v in f_values: 196 | l.append(str(v)) 197 | arff.write(', '.join(l)) 198 | arff.write('\n') 199 | arff.close() 200 | return feature_set_folder 201 | 202 | 203 | def CompressFeatures(BIN_WIDTH, TOPK): 204 | sampleFolders = [ 205 | "TrafficCaptures/240Resolution/FacetTraffic_50", 206 | "TrafficCaptures/240Resolution/RegularTraffic", 207 | ] 208 | 209 | 210 | if not os.path.exists('FeatureSets'): 211 | os.makedirs('FeatureSets') 212 | 213 | for topk in TOPK: 214 | for binWidth in BIN_WIDTH: 215 | start = time.time() 216 | print "\n#####################################" 217 | print "Generating Dataset based on Binned Packet Length Features" 218 | for sampleFolder in sampleFolders: 219 | print 
"\n#############################" 220 | print "Parsing " + sampleFolder 221 | print "#############################" 222 | feature_set_folder = FeatureExtractionPLBenchmark(sampleFolder, binWidth, topk) 223 | if(feature_set_folder is not None): 224 | GenerateDatasets(feature_set_folder + '/') 225 | end = time.time() 226 | print "Optimize_compress_bin_%s_topk_%s_time_%s"%(binWidth, topk, end-start) 227 | 228 | 229 | def SplitDataset(DATASET_SPLIT, N_FLOWS, COVERT_FLOWS_PERC): 230 | 231 | print "Splitting datasets with DATASET_SPLIT= %s, N_FLOWS = %s, REG_FLOWS_PROP = %s"%(DATASET_SPLIT, N_FLOWS, COVERT_FLOWS_PERC) 232 | split_value = DATASET_SPLIT * N_FLOWS #samples 233 | covert_split_value = COVERT_FLOWS_PERC * split_value 234 | 235 | print "SPLIT_VALUE = %s"%(split_value) 236 | print "COVERT_SAMPLES_VALUE = %s"%(covert_split_value) 237 | 238 | for feature_folder in os.listdir("FeatureSets"): 239 | if(".DS_Store" not in feature_folder): 240 | start = time.time() 241 | print "Splitting %s"%("FeatureSets/" + feature_folder + "/RegularTraffic_dataset.csv") 242 | #Split RegularFlows 243 | RegularFile = open("FeatureSets/" + feature_folder + "/RegularTraffic_dataset.csv", 'rb') 244 | csv_reader = csv.reader(RegularFile, delimiter=',') 245 | 246 | PhaseOneRegularFile = open("FeatureSets/" + feature_folder + "/RegularTraffic_phase1_dataset.csv", 'w') 247 | PhaseTwoRegularFile = open("FeatureSets/" + feature_folder + "/RegularTraffic_phase2_dataset.csv", 'w') 248 | 249 | for n, row in enumerate(csv_reader): 250 | if(n == 0): 251 | row_string = ",".join(row) + "\n" 252 | PhaseOneRegularFile.write(row_string) 253 | PhaseTwoRegularFile.write(row_string) 254 | elif(n < split_value): 255 | row_string = ",".join(row) + "\n" 256 | PhaseOneRegularFile.write(row_string) 257 | else: 258 | row_string = ",".join(row) + "\n" 259 | PhaseTwoRegularFile.write(row_string) 260 | 261 | RegularFile.close() 262 | PhaseOneRegularFile.close() 263 | PhaseTwoRegularFile.close() 264 | 265 | 266 | #Split CovertFlows 267 | print "Splitting %s"%("FeatureSets/" + feature_folder + "/FacetTraffic_50_dataset.csv") 268 | CovertFile = open("FeatureSets/" + feature_folder + "/FacetTraffic_50_dataset.csv", "rb") 269 | csv_reader = csv.reader(CovertFile, delimiter=',') 270 | 271 | PhaseOneCovertFile = open("FeatureSets/" + feature_folder + "/FacetTraffic_50_phase1_dataset.csv", "w") 272 | PhaseTwoCovertFile = open("FeatureSets/" + feature_folder + "/FacetTraffic_50_phase2_dataset.csv", "w") 273 | 274 | for n, row in enumerate(csv_reader): 275 | if(n == 0): 276 | row_string = ",".join(row) + "\n" 277 | PhaseOneCovertFile.write(row_string) 278 | PhaseTwoCovertFile.write(row_string) 279 | elif(n < split_value): 280 | row_string = ",".join(row) + "\n" 281 | PhaseOneCovertFile.write(row_string) 282 | else: 283 | row_string = ",".join(row) + "\n" 284 | PhaseTwoCovertFile.write(row_string) 285 | 286 | CovertFile.close() 287 | PhaseOneCovertFile.close() 288 | PhaseTwoCovertFile.close() 289 | end = time.time() 290 | binWidth = feature_folder.split("_")[2] 291 | topk = feature_folder.split("_")[3] 292 | print "Optimize_split_bin_%s_topk_%s_time_%s"%(binWidth, topk, end-start) 293 | 294 | 295 | def MergeTestData(): 296 | for feature_folder in os.listdir("FeatureSets"): 297 | if(".DS_Store" not in feature_folder): 298 | print "Merging %s"%("FeatureSets/" + feature_folder + "/RegularTraffic_phase2_dataset.csv") 299 | print "Merging %s"%("FeatureSets/" + feature_folder + "/FacetTraffic_50_phase2_dataset.csv") 300 | 301 | #Merging Phase2 302 | 
PhaseTwoFile = open("FeatureSets/" + feature_folder + "/Phase2_dataset.csv", 'w') 303 | 304 | PhaseTwoRegularFile = open("FeatureSets/" + feature_folder + "/RegularTraffic_phase2_dataset.csv", 'rb') 305 | PhaseTwoCovertFile = open("FeatureSets/" + feature_folder + "/FacetTraffic_50_phase2_dataset.csv", "rb") 306 | 307 | 308 | #Write data from the regular file 309 | csv_reader = csv.reader(PhaseTwoRegularFile, delimiter=',') 310 | for n, row in enumerate(csv_reader): 311 | row_string = ",".join(row) + "\n" 312 | PhaseTwoFile.write(row_string) 313 | 314 | #Write data from the covert file 315 | csv_reader = csv.reader(PhaseTwoCovertFile, delimiter=',') 316 | for n, row in enumerate(csv_reader): 317 | if(n == 0): 318 | continue 319 | row_string = ",".join(row) + "\n" 320 | PhaseTwoFile.write(row_string) 321 | 322 | PhaseTwoFile.close() 323 | PhaseTwoRegularFile.close() 324 | PhaseTwoCovertFile.close() 325 | 326 | 327 | 328 | def FeatureExtractionPLBenchmarkBasedOnTrainData(sampleFolder, binWidth, topk): 329 | #Bucket importance in decreasing order 330 | BUCKETS_TO_MEASURE = [] 331 | 332 | #Measure interesting buckets 333 | if(topk != 1500): 334 | #Buckets in decreasing importance order 335 | f_imp = np.load('classificationResults/PL_60_' + str(binWidth) + '_1500/FeatureImportance_XGBoost_FacetTraffic_50_phase1.npy') 336 | #Print top k 337 | #for f in f_imp: 338 | # print str(f[1]) + " " + str(f[2]) 339 | 340 | if(topk > len(f_imp)): 341 | print "Skipping, not enough features to accomodate for. TopK = " + str(topk) + " Features = " + str(len(f_imp)) 342 | return 343 | for i in range(0,topk): 344 | b = int(f_imp[i][2].split("_")[1]) 345 | print "Top-" + str(i) + " = " + str(b) 346 | BUCKETS_TO_MEASURE.append(b) 347 | 348 | #Measure all buckets 349 | elif(topk == 1500): 350 | print "Measuring all buckets according to quantization" 351 | for i in range(0,1500,binWidth): 352 | BUCKETS_TO_MEASURE.append(i/binWidth) 353 | 354 | 355 | quantized_buckets_to_measure = sorted(BUCKETS_TO_MEASURE) 356 | print "Quantized buckets to measure = " + str(quantized_buckets_to_measure) 357 | print "Number of buckets to measure = " + str(len(quantized_buckets_to_measure)) 358 | 359 | 360 | traceInterval = 60 #Amount of time in packet trace to consider for feature extraction 361 | feature_set_folder = 'FeatureSets/PL_' + str(traceInterval) + "_" + str(binWidth) + "_" + str(topk) 362 | print feature_set_folder 363 | 364 | if not os.path.exists(feature_set_folder): 365 | os.makedirs(feature_set_folder) 366 | arff_path = feature_set_folder + '/' + os.path.basename(sampleFolder) + '_dataset.csv' 367 | arff = open(arff_path, 'wb') 368 | written_header = False 369 | 370 | 371 | for sample in os.listdir(sampleFolder): 372 | if(".DS_Store" in sample): 373 | continue 374 | f = open(sampleFolder + "/" + sample + "/" + sample + ".pcap") 375 | pcap = dpkt.pcap.Reader(f) 376 | 377 | #Analyse packets transmited 378 | packetSizesIn = [] 379 | packetSizesOut = [] 380 | bin_dict = {} 381 | 382 | 383 | for i in quantized_buckets_to_measure: 384 | bin_dict[i] = 0 385 | 386 | 387 | firstTime = 0.0 388 | setFirst = False 389 | for ts, buf in pcap: 390 | if(not(setFirst)): 391 | firstTime = ts 392 | setFirst = True 393 | 394 | if(ts < (firstTime + traceInterval)): 395 | 396 | eth = dpkt.ethernet.Ethernet(buf) 397 | ip_hdr = eth.data 398 | try: 399 | src_ip_addr_str = socket.inet_ntoa(ip_hdr.src) 400 | dst_ip_addr_str = socket.inet_ntoa(ip_hdr.dst) 401 | #Target UDP communication between both cluster machines 402 | if (ip_hdr.p == 17 and 
src_ip_addr_str == SOURCE_IP): 403 | binned = RoundToNearest(len(buf),binWidth) 404 | if(binned/binWidth in quantized_buckets_to_measure): 405 | bin_dict[binned/binWidth]+=1 406 | except: 407 | pass 408 | f.close() 409 | 410 | od_dict = collections.OrderedDict(sorted(bin_dict.items(), key=lambda t: float(t[0]))) 411 | bin_list = [] 412 | for i in od_dict: 413 | bin_list.append(od_dict[i]) 414 | 415 | 416 | label = os.path.basename(sampleFolder) 417 | if('Regular' in sampleFolder): 418 | label = 'Regular' 419 | 420 | #Write sample features to the csv file 421 | f_names = [] 422 | f_values = [] 423 | 424 | for i, b in enumerate(bin_list): 425 | f_names.append('packetLengthBin_' + str(quantized_buckets_to_measure[i])) 426 | f_values.append(b) 427 | 428 | 429 | #print len(f_names) 430 | f_names.append('Class') 431 | f_values.append(label) 432 | 433 | if(not written_header): 434 | arff.write(', '.join(f_names)) 435 | arff.write('\n') 436 | print "Writing header" 437 | written_header = True 438 | 439 | l = [] 440 | for v in f_values: 441 | l.append(str(v)) 442 | arff.write(', '.join(l)) 443 | arff.write('\n') 444 | arff.close() 445 | return feature_set_folder 446 | 447 | 448 | 449 | def CompressFeaturesBasedOnTrainData(BIN_WIDTH, TOPK): 450 | sampleFolders = [ 451 | "TrafficCaptures/240Resolution/FacetTraffic_50", 452 | "TrafficCaptures/240Resolution/RegularTraffic", 453 | ] 454 | 455 | 456 | if not os.path.exists('FeatureSets'): 457 | os.makedirs('FeatureSets') 458 | 459 | for topk in TOPK: 460 | for binWidth in BIN_WIDTH: 461 | start = time.time() 462 | print "\n#####################################" 463 | print "Generating Dataset based on Binned Packet Length Features" 464 | for sampleFolder in sampleFolders: 465 | print "\n#############################" 466 | print "Parsing " + sampleFolder 467 | print "#############################" 468 | feature_set_folder = FeatureExtractionPLBenchmarkBasedOnTrainData(sampleFolder, binWidth, topk) 469 | if(feature_set_folder is not None): 470 | GenerateDatasets(feature_set_folder + '/') 471 | end = time.time() 472 | print "Optimize_compress_bin_%s_topk_%s_time_%s"%(binWidth, topk, end-start) 473 | 474 | 475 | 476 | 477 | def ExtractFirstNPackets(sampleFolder, number_of_packets): 478 | 479 | traceInterval = 60 #Amount of time in packet trace to consider for feature extraction 480 | feature_set_folder = 'FeatureSets/First_%d_packets'%(number_of_packets) 481 | print feature_set_folder 482 | 483 | if not os.path.exists(feature_set_folder): 484 | os.makedirs(feature_set_folder) 485 | arff_path = feature_set_folder + '/' + os.path.basename(sampleFolder) + '_dataset.csv' 486 | arff = open(arff_path, 'wb') 487 | written_header = False 488 | 489 | 490 | for sample in os.listdir(sampleFolder): 491 | if(".DS_Store" in sample): 492 | continue 493 | f = open(sampleFolder + "/" + sample + "/" + sample + ".pcap") 494 | pcap = dpkt.pcap.Reader(f) 495 | 496 | 497 | packet_array1 = [] 498 | packet_array2 = [] 499 | firstTime = 0.0 500 | setFirst = False 501 | for ts, buf in pcap: 502 | if(len(packet_array1) >= number_of_packets and len(packet_array2) >= number_of_packets): 503 | break 504 | 505 | if(not(setFirst)): 506 | firstTime = ts 507 | setFirst = True 508 | 509 | if(ts < (firstTime + traceInterval)): 510 | 511 | eth = dpkt.ethernet.Ethernet(buf) 512 | ip_hdr = eth.data 513 | try: 514 | src_ip_addr_str = socket.inet_ntoa(ip_hdr.src) 515 | dst_ip_addr_str = socket.inet_ntoa(ip_hdr.dst) 516 | #Target UDP communication between both cluster machines 517 | if (ip_hdr.p == 
17 and src_ip_addr_str == SOURCE_IP): 518 | if(len(packet_array1) < number_of_packets): 519 | packet_array1.append(len(buf)) 520 | elif(ip_hdr.p == 17 and src_ip_addr_str != SOURCE_IP): 521 | if(len(packet_array2) < number_of_packets): 522 | packet_array2.append(len(buf)) 523 | except: 524 | pass 525 | f.close() 526 | 527 | label = os.path.basename(sampleFolder) 528 | if('Regular' in sampleFolder): 529 | label = 'Regular' 530 | 531 | if(len(packet_array1) >= number_of_packets and len(packet_array2) >= number_of_packets): 532 | #Write sample features to the csv file 533 | f_names = [] 534 | f_values = [] 535 | 536 | for i, b in enumerate(packet_array1): 537 | f_names.append('packetNumberOut_' + str(i)) 538 | f_values.append(b) 539 | 540 | for i, b in enumerate(packet_array2): 541 | f_names.append('packetNumberIn_' + str(i)) 542 | f_values.append(b) 543 | 544 | 545 | f_names.append('Class') 546 | f_values.append(label) 547 | 548 | if(not written_header): 549 | arff.write(', '.join(f_names)) 550 | arff.write('\n') 551 | print "Writing header" 552 | written_header = True 553 | 554 | l = [] 555 | for v in f_values: 556 | l.append(str(v)) 557 | arff.write(', '.join(l)) 558 | arff.write('\n') 559 | else: 560 | print "Sample %s has not enough packets"%(sampleFolder + "/" + sample + "/" + sample + ".pcap") 561 | arff.close() 562 | return feature_set_folder 563 | 564 | 565 | def ExtractPacketSample(NUMBER_OF_PACKETS): 566 | sampleFolders = [ 567 | "TrafficCaptures/240Resolution/FacetTraffic_50", 568 | "TrafficCaptures/240Resolution/RegularTraffic", 569 | ] 570 | 571 | if not os.path.exists('FeatureSets'): 572 | os.makedirs('FeatureSets') 573 | 574 | for number_of_packets in NUMBER_OF_PACKETS: 575 | print "\n#####################################" 576 | print "Extracting first %d packet sizes"%(number_of_packets) 577 | 578 | for sampleFolder in sampleFolders: 579 | print "\n#############################" 580 | print "Parsing " + sampleFolder 581 | print "#############################" 582 | feature_set_folder = ExtractFirstNPackets(sampleFolder, number_of_packets) 583 | if(feature_set_folder is not None): 584 | GenerateDatasets(feature_set_folder + '/') --------------------------------------------------------------------------------
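For reference, the following is a minimal sketch of the binned packet-length feature idea implemented in generateFeatures.py above: packet sizes are rounded to the nearest multiple of a bin width (as in RoundToNearest) and the per-bin counts become the packetLengthBin_<index> columns of the generated CSVs. It assumes an in-memory list of packet sizes rather than the dpkt/pcap parsing used in the script; the helper names and example sizes are illustrative only, not part of the repository.

def round_to_nearest(n, m):
    # Round n to the nearest multiple of m (ties round up), mirroring RoundToNearest().
    r = n % m
    return n + m - r if r + r >= m else n - r

def bin_packet_sizes(packet_sizes, bin_width):
    # Count how many packets fall into each quantized length bin.
    counts = {}
    for size in packet_sizes:
        bucket = round_to_nearest(size, bin_width) // bin_width
        counts[bucket] = counts.get(bucket, 0) + 1
    return counts

if __name__ == '__main__':
    sizes = [60, 70, 1400, 1416, 1500]   # hypothetical packet lengths in bytes
    print(bin_packet_sizes(sizes, 64))   # {1: 2, 22: 2, 23: 1}

In the script itself, when topk is not 1500 only the bins ranked most important by a previously computed XGBoost feature-importance file are counted; with topk equal to 1500 every bin up to 1500 bytes is measured.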