├── .gitignore
├── Security Tasks Evaluation
├── BotnetAnalysis
│ ├── peershark
│ │ ├── __init__.py
│ │ ├── PcapInputFiles.txt
│ │ ├── TsharkOptions.txt
│ │ ├── P2P_CONSTANTS.py
│ │ ├── Packet.py
│ │ ├── createTrainingData.py
│ │ ├── generateSuperFlows.py
│ │ ├── FilterPackets.py
│ │ ├── SuperFlow.py
│ │ ├── GenerateFlows.py
│ │ ├── FilterPacketsHelper.py
│ │ ├── README.md
│ │ └── Flow.py
│ ├── Data
│ │ ├── Storm
│ │ │ └── placeholder.csv
│ │ ├── P2PTraffic
│ │ │ └── placeholder.csv
│ │ └── Waledac
│ │ │ └── placeholder.csv
│ ├── TrafficCaptures
│ │ ├── Storm
│ │ │ └── placeholder.pcap
│ │ ├── Waledac
│ │ │ └── placeholder.pcap
│ │ └── P2PTraffic
│ │ │ └── placeholder.pcap
│ ├── fullRun.sh
│ ├── README.md
│ ├── quantize.py
│ └── runExperiment.py
├── WFAnalysis
│ ├── AllWebsiteAnalysis
│ │ ├── Data
│ │ │ └── placeholder.data
│ │ ├── ParsingUtilities
│ │ │ ├── CSVParseToWeka.py
│ │ │ ├── CSVParseToSimulateHerrman.py
│ │ │ └── CSVParseWebsiteUnbalanced.py
│ │ ├── generateFigures.py
│ │ └── runExperiment.py
│ ├── SingleWebsiteAnalysis
│ │ ├── Data
│ │ │ └── placeholder.data
│ │ ├── ParsingUtilities
│ │ │ ├── CSVParseToWeka.py
│ │ │ └── CSVParseWebsiteUnbalanced.py
│ │ ├── runExperiment.py
│ │ └── generateFigures.py
│ └── README.md
└── MPTAnalysis
│ ├── README.md
│ ├── FacetAnalysis
│ ├── runExperiment.py
│ ├── online_sketching.py
│ ├── generateFigures.py
│ ├── compressive_ta.py
│ └── generateFeatures.py
│ └── DeltaShaperAnalysis
│ ├── runExperiment.py
│ ├── online_sketching.py
│ ├── generateFigures.py
│ └── compressive_ta.py
├── README.md
└── Flow Marker Accumulator
    └── flowlens-v1model.p4
/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/Data/Storm/placeholder.csv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/Data/P2PTraffic/placeholder.csv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/Data/Waledac/placeholder.csv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/TrafficCaptures/Storm/placeholder.pcap: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/TrafficCaptures/Waledac/placeholder.pcap: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/TrafficCaptures/P2PTraffic/placeholder.pcap: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Security Tasks 
Evaluation/WFAnalysis/AllWebsiteAnalysis/Data/placeholder.data: -------------------------------------------------------------------------------- 1 | Place openssh.data here -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/SingleWebsiteAnalysis/Data/placeholder.data: -------------------------------------------------------------------------------- 1 | Place openssh.data here -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/PcapInputFiles.txt: -------------------------------------------------------------------------------- 1 | /Users/dmbb/Desktop/flowscope/BotnetAnalysis/Data/P2PTraffic 2 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/TsharkOptions.txt: -------------------------------------------------------------------------------- 1 | -t e 2 | -T fields 3 | -E separator=, 4 | -e ip.src -e ip.dst -e ip.proto -e frame.time_epoch -e tcp.len -e udp.length 5 | -Y "(ip.proto==6)||(ip.proto==17)" 6 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/fullRun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for BIN_WIDTH in 1 16 32 64 128 256 4 | do 5 | for IPT_BIN_WIDTH in 0 1 10 60 300 900 6 | do 7 | python runExperiment.py $BIN_WIDTH $IPT_BIN_WIDTH 8 | done 9 | done -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/P2P_CONSTANTS.py: -------------------------------------------------------------------------------- 1 | PCAPDATADIR = './pcapdata/' 2 | PCAPFILES = 'PcapInputFiles.txt' 3 | TSHARKOPTIONSFILE = 'TsharkOptions.txt' 4 | TCP_PROTO = '6' 5 | UDP_PROTO = '17' 6 | UDP_HEADERLENGTH = 8 7 | 8 | #utility functions 9 | import os 10 | def getCSVFiles(dirname): 11 | csvfiles = [] 12 | for eachfile in os.listdir(dirname): 13 | if eachfile.endswith('.csv'): 14 | csvfiles.append(dirname + eachfile) 15 | return csvfiles -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/README.md: -------------------------------------------------------------------------------- 1 | ##Dependencies and Data 2 | 3 | 4 | ### Website Fingerprinting 5 | 6 | - Download the OpenSSH dataset (parsed and obtained from the original Herrman MySQL database) available [here](https://turbina.gsd.inesc-id.pt/resources/openSSH_herrman/openssh_data.tar.gz) 7 | - Place it inside `WFAnalysis/AllWebsiteAnalysis/Data` and `WFAnalysis/SingleWebsiteAnalysis/Data`. 8 | 9 | 10 | 11 | ### Running the code 12 | 13 | - Execute the `RunExperiment.py` script in each of the considered website fingerprinting settings -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/README.md: -------------------------------------------------------------------------------- 1 | ##Dependencies and Data 2 | 3 | 4 | ### Multimedia Protocol Tunneling 5 | 6 | - Download the traffic captures of covert channel tools available [here](https://turbina.gsd.inesc-id.pt/resources/mpt_detection/) 7 | - Place them in `MPTAnalysis/DeltaShaperAnalysis/TrafficCaptures` and `MPTAnalysis/FacetAnalysis/TrafficCaptures`, respectively. 
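A minimal sanity-check sketch (not part of the original scripts; assumes it is run from the `MPTAnalysis` folder) to confirm the downloaded captures were extracted into the folders listed above before launching the experiments:

```python
import os

# Capture folders expected by the analysis scripts, as listed above.
capture_dirs = ["DeltaShaperAnalysis/TrafficCaptures", "FacetAnalysis/TrafficCaptures"]

for folder in capture_dirs:
    if os.path.isdir(folder):
        n_files = len([f for f in os.listdir(folder) if not f.startswith(".")])
        print("%s: %d file(s) found" % (folder, n_files))
    else:
        print("%s: missing -- extract the downloaded captures here" % folder)
```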
8 | 9 | 10 | ### Running the code 11 | 12 | - Execute the `RunExperiment.py` script in each of the particular covert channel generating tools folder. Then execute `generateFigures.py` to generate figures from the obtained results. -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/Packet.py: -------------------------------------------------------------------------------- 1 | import socket 2 | #defines properties of a packet 3 | class Packet: 4 | def __init__(self,fields): 5 | if fields == None: 6 | self.source = None 7 | self.dest = None 8 | self.timestamp = None 9 | self.size = 0 10 | self.key = None 11 | else: 12 | self.source = socket.inet_aton(fields[0]) 13 | self.dest = socket.inet_aton(fields[1]) 14 | self.timestamp = float(fields[2]) 15 | self.size = int(fields[3]) 16 | if self.source < self.dest: 17 | self.key = self.source + self.dest 18 | else: 19 | self.key = self.dest + self.source 20 | 21 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/README.md: -------------------------------------------------------------------------------- 1 | ##Dependencies and Data 2 | 3 | 4 | ### Botnets 5 | 6 | - Download the P2P and botnet datasets gathered for PeerRush, available [here](http://peerrush.cs.uga.edu/peerrush/) 7 | - Place them inside `BotnetAnalysis/Data` 8 | - Botnet detection code by Pratik Narang is available [here](https://github.com/pratiknarang/peershark) 9 | 10 | ### Parse Original Captures Used in PeerShark 11 | 12 | - For each dataset (Waledac, Storm, P2P) 13 | - Run `peershark/FilterPackets.py` 14 | - Retrieve original parse of the .pcap at `pcapdata` folder 15 | 16 | *Note: Storm data samples must be appended with ".pcap"* 17 | `for f in *; do mv "$f" "$f.pcap"; done` 18 | 19 | ### Run the FlowLens botnet detection experiment 20 | 21 | Run `fullRun.sh`, which is responsible for applying different quantization parameter combinations on the PL and IPT of P2P packet flows -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/createTrainingData.py: -------------------------------------------------------------------------------- 1 | from P2P_CONSTANTS import * 2 | 3 | 4 | def runTrainingDataGenerator(super_flow_data_dir, training_data_dir, bin_width, ipt_bin_width): 5 | #takes takes 50,000 examples and puts it in necessary format for training 6 | csvfiles = [] 7 | if os.path.isdir(super_flow_data_dir): 8 | csvfiles += getCSVFiles(super_flow_data_dir) 9 | 10 | #print ".csv files to generate training data: %s"%(csvfiles) 11 | 12 | outfile = open(training_data_dir + 'trainingdata_' + str(bin_width) + "_" + str(ipt_bin_width) + '.csv','w') 13 | for filename in csvfiles: 14 | label = filename.split('/')[-2] 15 | inputfile = open(filename) 16 | line = inputfile.readline().strip() 17 | while line!='': 18 | fields = line.split(',') 19 | if float(fields[4])!=0 and float(fields[3])!=0 and float(fields[7])!=0: 20 | outfile.write( 21 | fields[2] + ',' + 22 | fields[3] + ',' + 23 | fields[4] + ',' + 24 | fields[7] + ',' + 25 | label + '\n') 26 | line = inputfile.readline().strip() 27 | inputfile.close() 28 | outfile.close() -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/generateSuperFlows.py: -------------------------------------------------------------------------------- 1 | 
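## Merge the per-capture flow files produced by GenerateFlows.py into superflows:
## flows between the same IP pair whose inter-flow gap is at most 'flowgap' seconds.
## One output CSV (named '<flowgap>.csv') is written per flowgap value.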
from P2P_CONSTANTS import * 2 | import socket 3 | import Flow 4 | import SuperFlow 5 | import sys 6 | 7 | 8 | def runGenerateSuperFlows(flow_data_dir, super_flow_data_dir, flowgap): 9 | #TIMEGAP IN SECONDS 10 | csvfiles = getCSVFiles(flow_data_dir) 11 | #print csvfiles 12 | 13 | flowdata = [] 14 | for filename in csvfiles: 15 | inputfile = open(filename) 16 | data = [line.strip() for line in inputfile] 17 | inputfile.close() 18 | 19 | for eachline in data: 20 | fields = eachline.split(',') 21 | flowdata.append(SuperFlow.SuperFlow(fields)) 22 | print '\tNo. of flows to be processed: ' + str(len(flowdata)) 23 | 24 | 25 | flowdata = Flow.combineFlows(flowdata, flowgap) 26 | print '\tSuperflows (Flows with flowgap = ' + str(flowgap) + ' sec) : ' + str(len(flowdata)) 27 | 28 | outfile = open(super_flow_data_dir + str(flowgap) + '.csv', 'w') 29 | 30 | to_write = [] 31 | for flow in flowdata: 32 | to_write.append( 33 | socket.inet_ntoa(flow.ip1) + ',' + 34 | socket.inet_ntoa(flow.ip2) + ',' + 35 | str(flow.getNoOfPackets()) + ',' + 36 | str(flow.getNoOfBytes()) + ',' + 37 | '%.6f'%flow.getInterArrivaltime() + ',' + 38 | '%.6f'%flow.getStart() + ',' + 39 | '%.6f'%flow.getEnd() + ',' + 40 | '%.6f'%flow.getDurationInSeconds()) 41 | outfile.write("\n".join(to_write)) 42 | outfile.close() -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/FilterPackets.py: -------------------------------------------------------------------------------- 1 | ## Module to obtain packet data from a pcap/dump file 2 | ## and save it in csv format using tshark. 3 | ## Filenames of input pcap files are taken from InputFiles.txt 4 | ## Tshark options are present in TsharkOptions.txt 5 | ## TsharkOptions.txt should not contain the -r option. 
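## With the options in TsharkOptions.txt, each row of the resulting csv holds:
## ip.src, ip.dst, ip.proto, frame.time_epoch, tcp.len, udp.length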
6 | 7 | ## usage: python FilterPackets.py 8 | 9 | #import global constants 10 | from P2P_CONSTANTS import * 11 | from FilterPacketsHelper import * 12 | import multiprocessing as MP 13 | import subprocess 14 | 15 | #execute a shell command as a child process 16 | def executeCommand(command,outfilename): 17 | sem.acquire() 18 | 19 | subprocess.call(command, shell = True) 20 | 21 | infile = open(outfilename, 'r') 22 | data = [eachline.strip() for eachline in infile] 23 | infile.close() 24 | 25 | data = preprocess(data) 26 | 27 | outfile = open(outfilename,'w') 28 | for eachcomponent in data: 29 | outfile.write(eachcomponent) 30 | outfile.close() 31 | 32 | print 'done processing : ' + outfilename 33 | sem.release() 34 | 35 | #obtain input parameters and pcapfilenames 36 | inputfiles = getPCapFileNames() 37 | print "Input Files: " + str(inputfiles) 38 | tsharkOptions = getTsharkOptions() 39 | 40 | #create a semaphore so as not to exceed threadlimit 41 | sem = MP.Semaphore(THREADLIMIT) 42 | 43 | #get tshark commands to be executed 44 | for filename in inputfiles: 45 | print filename 46 | (command,outfilename) = contructTsharkCommand(filename,tsharkOptions) 47 | task = MP.Process(target = executeCommand, args = (command, outfilename,)) 48 | task.start() -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/AllWebsiteAnalysis/ParsingUtilities/CSVParseToWeka.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import csv 4 | 5 | def main(argv): 6 | OutputFile = open(argv[1], 'w') 7 | InputFile = open(argv[0]) 8 | 9 | bin_dict = {} 10 | 11 | OutputFile.write("@relation\'WF\'\n\n") 12 | OutputFile.write("@attribute Text string\n") 13 | OutputFile.write("@attribute class {") 14 | 15 | csv_reader = csv.reader(InputFile, delimiter=',') 16 | 17 | csv_header = "" 18 | website_list = set() 19 | text = [] 20 | 21 | for n, row in enumerate(csv_reader): 22 | if(n == 0): 23 | #Init bin dict 24 | csv_header = row 25 | prefix = "packetLengthBin_" 26 | for i in range(len(csv_header)-1): 27 | parsedBucketSize = csv_header[i][(len(prefix) + 1):] 28 | bin_dict[i] = parsedBucketSize 29 | continue 30 | 31 | currWebsite = row[-1] 32 | website_list.add(currWebsite) 33 | bin_list = row[:-1] 34 | 35 | text.append("\'") 36 | if("Online" in argv[1]): #Fix for online Sketching (Coskun et al.) 
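#Online sketch vectors already store one value per position, so each value is written once (the else branch instead repeats each bin label according to its packet count)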
37 | for i in range(len(bin_list)): 38 | text.append(str(bin_list[i]) + " ") 39 | else: #For the others 40 | for i in range(len(bin_list)): 41 | for _ in range(int(bin_list[i])): 42 | text.append(str(bin_dict[i]) + " ") 43 | 44 | text.append("\'," + currWebsite + "\n") 45 | 46 | #Write classes on header 47 | OutputFile.write(",".join(website_list)) 48 | OutputFile.write("}\n\n") 49 | #Write data 50 | OutputFile.write("@data\n\n") 51 | OutputFile.write("".join(text)) 52 | 53 | 54 | OutputFile.close() 55 | 56 | 57 | if __name__ == "__main__": 58 | main(sys.argv[1:]) -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/SingleWebsiteAnalysis/ParsingUtilities/CSVParseToWeka.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import csv 4 | 5 | def RoundToNearest(n, m): 6 | if (m == 1): 7 | return n 8 | if (n > 0): 9 | r = n % m 10 | return n + m - r if r + r >= m else n - r 11 | else: 12 | if (n < 0): 13 | return RoundToNearest(abs(n), m) * -1 14 | return 0 15 | 16 | def main(argv): 17 | OutputFile = open(argv[1], 'w') 18 | InputFile = open(argv[0], 'rb') 19 | website = argv[2] 20 | 21 | bin_dict = {} 22 | 23 | OutputFile.write("@relation\'WF\'\n\n") 24 | OutputFile.write("@attribute Text string\n") 25 | OutputFile.write("@attribute class {Nope,%s}\n\n"%(website)) 26 | OutputFile.write("@data\n\n") 27 | 28 | 29 | csv_reader = csv.reader(InputFile, delimiter=',') 30 | 31 | csv_header = "" 32 | text = [] 33 | 34 | for n, row in enumerate(csv_reader): 35 | if(n == 0): 36 | #Init bin dict 37 | csv_header = row 38 | prefix = "packetLengthBin_" 39 | for i in range(len(csv_header)-1): 40 | parsedBucketSize = csv_header[i][(len(prefix) + 1):] 41 | bin_dict[i] = parsedBucketSize 42 | continue 43 | 44 | currWebsite = row[-1] 45 | bin_list = row[:-1] 46 | 47 | text.append("\'") 48 | for i in range(len(bin_list)): 49 | for _ in range(int(bin_list[i])): 50 | text.append(str(bin_dict[i]) + " ") 51 | 52 | if (website not in currWebsite): 53 | text.append("\'," + "Nope" + "\n") 54 | else: 55 | text.append("\'," + website + "\n") 56 | 57 | 58 | OutputFile.write("".join(text)) 59 | OutputFile.close() 60 | 61 | if __name__ == "__main__": 62 | main(sys.argv[1:]) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FlowLens 2 | 3 | This repository holds the code for the paper "FlowLens: Enabling Efficient Flow Classification for ML-based Network Security Applications". 4 | If you end up using our code for your experiments, please cite our work as follows: 5 | 6 | ``` 7 | @inproceedings{protozoa, 8 | title={FlowLens: Enabling Efficient Flow Classification for ML-based Network Security Applications}, 9 | author={Barradas, Diogo and Santos, Nuno and Rodrigues, Lu{\'i}s and Signorello, Salvatore and Ramos, Fernando M. V. and Madeira, Andr{\'e}}, 10 | booktitle={Proceedings of the 28th Network and Distributed System Security Symposium}, 11 | year={2021}, 12 | address={San Diego, CA, USA}, 13 | } 14 | ``` 15 | 16 | ##Dependencies and Data 17 | 18 | 19 | ### General Dependencies 20 | 21 | - Install WEKA 22 | - Run `pip install -r requirements.txt` 23 | 24 | ### Datasets 25 | 26 | - Please check the `README.md` in each specific security task folder 27 | 28 | 29 | ## How may I use your code? 
30 | 31 | - The `Security Tasks Evaluation` folder includes the code we used for evaluating different ML-based security tasks when using FlowLens. The code applies different combinations of our quantization and truncation approaches and allows for checking FlowLens flow markers trade-offs between accuracy and memory footprint 32 | 33 | - The `Flow Marker Accumulator` folder includes an adaptation of the P416 code we used for implementing FlowLens' flow marker accumulator in a Barefoot Tofino switch. Due to NDA concerns, we make public this adapted version of our code that can be run on the P4's BMV2 behavioral model. 34 | 35 | 36 | *Todo: Provide a full end-to-end dummy example of FlowLens running in BMV2 - e.g. on P4's tutorial VM.* -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/SuperFlow.py: -------------------------------------------------------------------------------- 1 | from Packet import * 2 | import socket 3 | import Flow 4 | 5 | #get median of interarrival time 6 | def getMedian(vallist): 7 | vallist.sort(key = lambda val:val[0]) 8 | tot = 0 9 | cfreq = [] 10 | for val in vallist: 11 | tot += val[1] 12 | cfreq.append(tot) 13 | medianindex = tot / 2 14 | i = 0 15 | while medianindex > cfreq[i]: 16 | i += 1 17 | return vallist[i][0] 18 | 19 | #defines a superflow 20 | class SuperFlow(Flow.Flow): 21 | 22 | def __init__(self, fields): 23 | if fields == None: 24 | self.ip1 = None 25 | self.ip2 = None 26 | self.key = None 27 | self.n_packet1 = 0 28 | self.n_byte1 = 0 29 | self.t_start1 = 0 30 | self.t_end1 = 0 31 | self.t_interarrival1 = [] 32 | self.n_packet2 = 0 33 | self.n_byte2 = 0 34 | self.t_start2 = 0 35 | self.t_end2 = 0 36 | self.t_interarrival2 = [] 37 | else: 38 | self.ip1 = socket.inet_aton(fields[0]) 39 | self.ip2 = socket.inet_aton(fields[1]) 40 | self.key = self.ip1 + self.ip2 41 | self.n_packet1 = int(fields[2]) 42 | self.n_byte1 = int(fields[3]) 43 | self.t_start1 = float(fields[4]) 44 | self.t_end1 = float(fields[5]) 45 | self.t_interarrival1 = [(float(fields[6]),self.n_packet1)] 46 | self.n_packet2 = int(fields[7]) 47 | self.n_byte2 = int(fields[8]) 48 | self.t_start2 = float(fields[9]) 49 | self.t_end2 = float(fields[10]) 50 | self.t_interarrival2 = [(float(fields[11]),self.n_packet2)] 51 | 52 | #get median of interarrival time irrespective of direction 53 | def getInterArrivaltime(self): 54 | combined = self.t_interarrival1 + self.t_interarrival2 55 | if len(combined) > 0: 56 | return getMedian(combined) 57 | return 0 58 | 59 | #interarrival time for direction1(arbitrary) 60 | def getInterArrivaltime1(self): 61 | if len(self.t_interarrival1) > 0: 62 | return getMedian(self.t_interarrival1) 63 | return 0 64 | 65 | #interarrival time for direction2(arbitrary) 66 | def getInterArrivaltime2(self): 67 | if len(self.t_interarrival2) > 0: 68 | return getMedian(self.t_interarrival2) 69 | return 0 70 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/GenerateFlows.py: -------------------------------------------------------------------------------- 1 | from P2P_CONSTANTS import * 2 | from Packet import * 3 | from Flow import * 4 | import multiprocessing as MP 5 | import socket 6 | import gc 7 | import time 8 | 9 | ## module to read all the files in the data folder of the 10 | ## project, build flow data and store it in a file 11 | 12 | 13 | def generateFlow(filename, flow_data_dir, timegap, sem): 14 | 
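# Parse one quantized packet csv into Packet objects, group packets of the same
# IP pair that are at most 'timegap' seconds apart into flows, and write the flows
# to a same-named file under flow_data_dir ('sem' limits concurrent processes).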
sem.acquire() 15 | 16 | inputfile = open(filename) 17 | data = [line.strip() for line in inputfile] 18 | inputfile.close() 19 | 20 | packetlist = [] 21 | for eachline in data: 22 | fields = eachline.split(',') 23 | fields.pop(2) 24 | packetlist.append(Packet(fields)) 25 | 26 | outflowlist = packetsToFlows(packetlist, timegap) 27 | #print 'flows in ' + filename + ' : ' + str(len(outflowlist)) 28 | 29 | outfilename = flow_data_dir + (filename.split('/')[-1]) 30 | writeFlowsToFile(outflowlist, outfilename) 31 | 32 | #print 'done writing to : ' + outfilename 33 | #start_collect = time.time() 34 | #collected = gc.collect() 35 | #end_collect = time.time() 36 | #print "Time wasted on GC - GenerateFlows: %ss, collected %s objects"%(end_collect-start_collect, collected) 37 | sem.release() 38 | 39 | def runGenerateFlows(quantized_pcap_data_dir, flow_data_dir, n_processes, timegap): 40 | #create a semaphore so as not to exceed n_processes process limit 41 | sem = MP.Semaphore(n_processes) 42 | 43 | csvfiles = getCSVFiles(quantized_pcap_data_dir) 44 | 45 | tasklist = [] 46 | #generate flowdata from each input packet file(not pcap) in parallel and store it in a file 47 | #so we get as many output files as number of input files 48 | for filename in csvfiles: 49 | task = MP.Process(target = generateFlow, args = (filename, flow_data_dir, timegap, sem)) 50 | tasklist.append(task) 51 | 52 | print "Tasklist size = %s"%(len(tasklist)) 53 | 54 | # #execute commands in parallel 55 | for i in range(0, len(tasklist), n_processes): 56 | for k,task in enumerate(tasklist[i:i+n_processes]): 57 | tasklist[i+k].start() 58 | for k, task in enumerate(tasklist[i:i+n_processes]): 59 | tasklist[i+k].join() 60 | #print "Joined task number %s"%(i+k) -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/AllWebsiteAnalysis/generateFigures.py: -------------------------------------------------------------------------------- 1 | import os 2 | from decimal import Decimal 3 | import numpy as np 4 | import csv 5 | 6 | import matplotlib 7 | if os.environ.get('DISPLAY','') == '': 8 | print('no display found. 
Using non-interactive Agg backend') 9 | matplotlib.use('Agg') 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | colors = ["0.8", "0.6", "0.2", "0.0"] 14 | colors = ["salmon", "lightsteelblue", "darkseagreen", "thistle", "wheat", "khaki", "skyblue"] 15 | 16 | """ 17 | Attach a text label above each bar displaying its height 18 | """ 19 | def autolabel(rects, ax): 20 | for rect in rects: 21 | height = rect.get_height() 22 | ax.text(rect.get_x() + rect.get_width()/2., 1.005*height, # original height was 1.005*height 23 | "{0:.2f}".format(float(height)), fontsize=7, ha='center', va='bottom') 24 | 25 | 26 | def PlotNormalAccuracy(): 27 | print "Plotting accuracy for no-sketch" 28 | #Gather results for full distribution 29 | profile_data_full = open("classificationResults/AllVsAll.csv", 'rb') 30 | csv_reader_full = csv.reader(profile_data_full, delimiter=',') 31 | 32 | binWidth_full = [] 33 | acc_full = [] 34 | 35 | for n, row in enumerate(csv_reader_full): 36 | if(n == 0): 37 | continue 38 | binWidth_full.append(row[0]) 39 | acc_full.append(round(Decimal(float(row[1])), 4)) 40 | 41 | 42 | #Generate plot 43 | fig = plt.figure() 44 | ax1 = fig.add_subplot(111) 45 | 46 | print "Current feature set: "+ str(binWidth_full) 47 | print "ACC-Full: " + str(acc_full) 48 | 49 | ind = np.arange(len(binWidth_full)) # the x locations for the groups 50 | width = 0.4 51 | 52 | rects1 = ax1.bar(ind - width/2, acc_full, width, color=colors[0], label='Accuracy') 53 | autolabel(rects1,ax1) 54 | 55 | ax1.yaxis.grid(color='black', linestyle='dotted') 56 | ax1.set_title('Quantization effect on accuracy - WF Multiclass', fontsize = 10) 57 | 58 | ax1.set_xticks(ind) 59 | labels = ["K = " + x + "\nBins = " + str(3000/int(x)) for n, x in enumerate(binWidth_full)] 60 | ax1.set_xticklabels(labels) 61 | ax1.legend() 62 | 63 | ax1.set_ylabel('Accuracy') 64 | ax1.set_xlabel('Quantization') 65 | 66 | plt.xticks(fontsize=7) 67 | plt.tight_layout() 68 | plt.ylim(0, 1) 69 | fig.savefig('Figures/AllVsAll.pdf') # save the figure to file 70 | fig.savefig('Figures/AllVsAll.png') # save the figure to file 71 | plt.close(fig) 72 | profile_data_full.close() 73 | 74 | 75 | 76 | def GenerateFigures(): 77 | if not os.path.exists("Figures"): 78 | os.makedirs("Figures") 79 | 80 | PlotNormalAccuracy() -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/FilterPacketsHelper.py: -------------------------------------------------------------------------------- 1 | from P2P_CONSTANTS import * 2 | import os 3 | #return a list of filenames of pcapfiles taken from InputFiles.txt 4 | #if a directory is found then all *.pcap files in the directory are 5 | #included(non-recursive) 6 | 7 | def getPCapFileNames(): 8 | pcapInputFile = open(PCAPFILES) 9 | lines = [eachline.strip() for eachline in pcapInputFile] 10 | pcapInputFile.close() 11 | 12 | pcapfilenames = [] 13 | for eachline in lines: 14 | if eachline.endswith('.pcap'): 15 | if os.path.exists(eachline): 16 | pcapfilenames.append(eachline) 17 | else: 18 | print eachline + ' does not exist' 19 | exit() 20 | else: 21 | if os.path.isdir(eachline): 22 | for eachfile in os.listdir(eachline): 23 | if eachfile.endswith('.pcap'): 24 | pcapfilenames.append(eachline.rstrip('/') + '/' + eachfile) 25 | else: 26 | print eachline + ' is not a directory' 27 | exit() 28 | return pcapfilenames 29 | 30 | #return a list of options to be used with tshark 31 | def getTsharkOptions(): 32 | optionsFile = open(TSHARKOPTIONSFILE) 33 | options = 
[line.strip() for line in optionsFile] 34 | optionsFile.close() 35 | return options 36 | 37 | #return a tuple (x,y) where 38 | #x = complete tshark command 39 | #y = output csv filename 40 | def contructTsharkCommand(filename,tsharkOptions): 41 | command = 'tshark -r ' + filename + ' ' 42 | for eachstring in tsharkOptions: 43 | command = command + eachstring + ' ' 44 | 45 | #construct output filename 46 | outfilename = filename.split('/') 47 | outfilename = PCAPDATADIR + outfilename[len(outfilename)-1] + '.csv' 48 | 49 | command += '>'+outfilename 50 | return (command,outfilename) 51 | 52 | #remove missing tcp and udp payload lengths and subtract 53 | #8 bytes from udp payload to account for udp header 54 | #returns a list of strings to be printed 55 | def preprocess(data): 56 | outputdata = [] 57 | for eachline in data: 58 | fields = eachline.split(',') 59 | 60 | #sanity check for 6 fields. Has to be changed if tshark options are changed 61 | if len(fields) != 6: 62 | continue 63 | 64 | tcppayload = fields[4].strip() 65 | udppayload = fields[5].strip() 66 | 67 | #subtract udp header length 68 | if udppayload != '': 69 | fields[5] = str(int(udppayload) - UDP_HEADERLENGTH) 70 | if fields[5] == '0': 71 | continue 72 | #ignore packet if both tcp and udp payload lengths are null 73 | elif tcppayload == '' or tcppayload == '0': 74 | continue 75 | 76 | #add all valid fields to output list 77 | for eachfield in fields: 78 | if eachfield.strip() != '': 79 | outputdata.append(eachfield) 80 | outputdata.append(',') 81 | outputdata.pop() 82 | outputdata.append('\n') 83 | return outputdata -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/README.md: -------------------------------------------------------------------------------- 1 | PeerShark 2 | ============================ 3 | Peer-to-Peer botnet detection by tracking conversations 4 | 5 | ### Contributors 6 | * Pratik Narang 7 | * Subhajit Ray 8 | * Chittaranjan Hota 9 | 10 | ###Research papers: 11 | * Narang, P., Ray, S., Hota, C., & Venkatakrishnan, V. (2014, May). Peershark: detecting peer-to-peer botnets by tracking conversations. In Security and Privacy Workshops (SPW), 2014 IEEE (pp. 108-115). IEEE. 12 | * Narang, P., Hota, C., & Venkatakrishnan, V. N. (2014). PeerShark: flow-clustering and conversation-generation for malicious peer-to-peer traffic identification. EURASIP Journal on Information Security, 2014(1), 1-12. 13 | 14 | PeerShark requires Python v2.7.* and Tshark installed, and has been tested only for Linux environment. 15 | 16 | Modules to be used in the following order: 17 | 18 | 1. FilterPackets.py : Take inputdir or input files from PCAPFILES. 19 | The module runs tshark on each file in inputdir and extracts the 20 | fields mention in TsharkOptions.txt such as src-ip,dst-ip, 21 | protocol, payload length. One new file is created per pcap file 22 | which contains only the fields we want for future analysis. The 23 | new files are stored in PCAPDATADIR. 24 | 25 | usage : python FilterPackets.py 26 | 27 | 2. GenerateFlows.py : Take each file from PCAPDATADIR -> generate 28 | flow information -> store processed data for each file in 29 | FLOWDATADIR. 30 | 31 | usage : python GenerateFlows.py 32 | 33 | 3. generateSuperFlows.py : Take each file from FLOWDATADIR -> merge 34 | flows into superflows based on input parameters -> store in 35 | SUPERFLOWDATADIR. 
36 | 37 | usage: python generateSuperFlows.py start(in hrs) increment(in hrs) end(in hrs) 38 | 39 | Number of files generated = (end - start)/increment 40 | 41 | One file is generated for each value of timegap ranging from start to end. 42 | 43 | ####OPTIONAL: 44 | 45 | 46 | 4. createTrainingData.py: use this file to create labelled training data set. 47 | It reads *folders* (not files) residing in SUPERFLOWDATADIR, and creates *one* 48 | labelled file (weka style minus the header) per folder (with required attributes only- 49 | no. of pkts, no. of bytes, iat, duration, label) with the folder name appended as last column. 50 | 51 | After generating a labelled 'training dataset', supervised machine learning algorithms 52 | can be used to generate models for P2P botnet detection. 53 | 54 | 55 | ####Flow structure 56 | 57 | `IP1, IP2, #Packets1, #Bytes1, tFlowStart1, tFlowEnd1, MedianIPT1, #Packets2, #Bytes2, tFlowStart2, tFlowEnd2, MedianIPT2,` 58 | 59 | **Example** 60 | `4.79.17.248,192.168.58.137,3,126,1234920043.252418,1234920049.917001,4.326552,450,18900,1234920045.127448,1234920069.383826,0.000068` 61 | 62 | 63 | ####Superflow structure 64 | 65 | `IP1, IP2, #Packets, #Bytes, MedianIPT, tFlowStart, tFlowEnd, tDuration` 66 | 67 | **Example** 68 | `4.68.25.2, 192.168.58.150, 2, 86, 0.000000, 1234978436.632683, 1234978436.632683, 0.000000` 69 | 70 | ####Training data structure 71 | 72 | `#Packets, #Bytes, MedianIPT, tDuration, label` 73 | 74 | **Example** 75 | `16,672,0.000051,2.108578,Waledac` 76 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/quantize.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import multiprocessing as MP 4 | import socket 5 | import gc 6 | import time 7 | 8 | def RoundToNearest(n, m): 9 | r = n % m 10 | return n + m - r if r + r >= m else n - r 11 | 12 | STORM_IPS = [ 13 | "66.154.80.101", 14 | "66.154.80.105", 15 | "66.154.80.111", 16 | "66.154.80.125", 17 | "66.154.83.107", 18 | "66.154.83.113", 19 | "66.154.83.138", 20 | "66.154.83.80", 21 | "66.154.87.39", 22 | "66.154.87.41", 23 | "66.154.87.57", 24 | "66.154.87.58", 25 | "66.154.87.61" 26 | ] 27 | 28 | WALEDAC_IPS = [ 29 | "192.168.58.136", 30 | "192.168.58.137", 31 | "192.168.58.150" 32 | ] 33 | 34 | 35 | def runQuantization(dataset, traffic_capture, binWidth, ipt_bin_width, sem): 36 | sem.acquire() 37 | 38 | cap_file = open(dataset + "/" + traffic_capture, 'rb') 39 | csv_reader = csv.reader(cap_file, delimiter=',') 40 | 41 | quantized_csv = open('FeatureSets/' + os.path.basename(dataset) + "/" + traffic_capture[:-4] + "_" + str(binWidth) + "_" + str(ipt_bin_width) + ".csv", "w") 42 | 43 | malicious_ips = [] 44 | if(os.path.basename(dataset) == "Storm"): 45 | malicious_ips = STORM_IPS 46 | elif(os.path.basename(dataset) == "Waledac"): 47 | malicious_ips = WALEDAC_IPS 48 | 49 | #print "Malicious IPs = %s"%(malicious_ips) 50 | #print os.path.basename(dataset) 51 | 52 | to_write = [] 53 | #Write modified packets 54 | for row in csv_reader: 55 | #Filter out non-malicious flows from Storm and Waledac datasets 56 | if(("Storm" in os.path.basename(dataset) or "Waledac" in os.path.basename(dataset)) and (row[0] not in malicious_ips and row[1] not in malicious_ips)): 57 | #print "Row not in malicious: %s - %s"%(row[0], row[1]) 58 | continue 59 | else: 60 | new_row = row 61 | 62 | #Quantize packet size 63 | new_row[4] = str(RoundToNearest(int(new_row[4]), binWidth)) 64 | 65 
| #Quantize Timestamp 66 | if(ipt_bin_width > 0): 67 | new_row[3] = str(RoundToNearest(int(float(new_row[3])), ipt_bin_width)) 68 | to_write.append(",".join(new_row)) 69 | 70 | quantized_csv.write("\n".join(to_write)) 71 | 72 | cap_file.close() 73 | quantized_csv.close() 74 | 75 | #start_collect = time.time() 76 | #collected = gc.collect() 77 | #end_collect = time.time() 78 | #print "Time wasted on GC - Quantize: %ss, collected %s objects"%(end_collect-start_collect, collected) 79 | sem.release() 80 | 81 | 82 | def QuantizeDataset(dataset, binWidth, ipt_bin_width, n_processes): 83 | sem = MP.Semaphore(n_processes) 84 | traffic_captures = os.listdir(dataset) 85 | 86 | tasklist = [] 87 | 88 | for traffic_capture in traffic_captures: 89 | task = MP.Process(target = runQuantization, args = (dataset, traffic_capture, binWidth, ipt_bin_width, sem)) 90 | tasklist.append(task) 91 | 92 | print "Tasklist size = %s"%(len(tasklist)) 93 | 94 | # #execute commands in parallel 95 | for i in range(0, len(tasklist), n_processes): 96 | for k,task in enumerate(tasklist[i:i+n_processes]): 97 | tasklist[i+k].start() 98 | for k, task in enumerate(tasklist[i:i+n_processes]): 99 | tasklist[i+k].join() 100 | #print "Joined task number %s"%(i+k) 101 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/AllWebsiteAnalysis/ParsingUtilities/CSVParseToSimulateHerrman.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from datetime import datetime, timedelta 4 | from collections import defaultdict, OrderedDict 5 | 6 | def RoundToNearest(n, m): 7 | if (m == 1): 8 | return n 9 | if (n > 0): 10 | r = n % m 11 | return n + m - r if r + r >= m else n - r 12 | else: 13 | if (n < 0): 14 | return RoundToNearest(abs(n), m) * -1 15 | return 0 16 | 17 | def main(argv): 18 | name = str(argv[0]) 19 | BASE_DIR = os.path.dirname(name) 20 | file = open(name,'r') 21 | binWidth = int(argv[1]) 22 | 23 | trainSet = open(BASE_DIR+"/TrainSet_" + str(binWidth) + ".csv", 'w') 24 | testSet = open(BASE_DIR+"/TestSet_" + str(binWidth) + ".csv", 'w') 25 | 26 | minBucket = RoundToNearest(-1500, binWidth) 27 | maxBucket = RoundToNearest(1500, binWidth) + 1 28 | for size in range(minBucket, maxBucket, binWidth): 29 | trainSet.write("packetLengthBin_" + str(size) + ", ") 30 | testSet.write("packetLengthBin_" + str(size) + ", ") 31 | trainSet.write("class\n") 32 | testSet.write("class\n") 33 | 34 | i = 0 35 | 36 | TFlineToWrite = [] 37 | CNlineToWrite = [] 38 | 39 | 40 | lineToWrite = OrderedDict() 41 | 42 | l = file.readline() 43 | l = file.readline() 44 | l = file.readline() 45 | l.rstrip('\n') 46 | 47 | lineNumber = 0 48 | while l: 49 | lineSplit = l.split(" ") 50 | if (lineNumber % 2 == 0): 51 | timestamp = lineSplit[2] 52 | else: 53 | website = lineSplit[0][:-1] 54 | lineToWrite[website+"|"+timestamp] = {} 55 | lineToWrite[website+"|"+timestamp] = defaultdict(lambda:0, lineToWrite[website+"|"+timestamp]) 56 | t = lineToWrite[website+"|"+timestamp] 57 | for x in lineSplit[1:]: 58 | try: 59 | t[str(RoundToNearest(int(x), binWidth))] += 1 60 | except: 61 | continue 62 | lineToWrite[website+"|"+timestamp] = t 63 | lineNumber += 1 64 | l = file.readline() 65 | l.rstrip('\n') 66 | 67 | max = 4 68 | max2 = max + 4 69 | counter = 0 70 | currentWebSite = "" 71 | for j in lineToWrite: 72 | if (currentWebSite != j.split("|")[0]): 73 | counter = 0 74 | 75 | currentWebSite = j.split("|")[0] 76 | 77 | if (counter < max): 78 | for 
s in range(minBucket, maxBucket, binWidth): 79 | trainSet.write(str(lineToWrite[j][str(s)]) + ", ") 80 | trainSet.write(currentWebSite + "\n") 81 | if (counter == 0): 82 | firstTimeStamp = datetime.strptime(j.split("|")[1], "%Y-%m-%d#%H:%M:%S") 83 | secondTimeStamp = firstTimeStamp + timedelta(days=8) 84 | counter += 1 85 | else: 86 | if (datetime.strptime(j.split("|")[1], "%Y-%m-%d#%H:%M:%S") < secondTimeStamp): 87 | lineToWrite[j] = {} 88 | continue 89 | if (counter < max2): 90 | for s in range(minBucket, maxBucket, binWidth): 91 | testSet.write(str(lineToWrite[j][str(s)]) + ", ") 92 | testSet.write(currentWebSite + "\n") 93 | counter += 1 94 | 95 | lineToWrite[j] = {} 96 | 97 | if __name__ == "__main__": 98 | main(sys.argv[1:]) 99 | 100 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/AllWebsiteAnalysis/ParsingUtilities/CSVParseWebsiteUnbalanced.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import sys 4 | from datetime import datetime, timedelta 5 | import collections 6 | import math 7 | from collections import defaultdict, OrderedDict 8 | import numpy as np 9 | 10 | def RoundToNearest(n, m): 11 | if (m == 1): 12 | return n 13 | if (n > 0): 14 | r = n % m 15 | return n + m - r if r + r >= m else n - r 16 | else: 17 | if (n < 0): 18 | return RoundToNearest(abs(n), m) * -1 19 | return 0 20 | 21 | def extractDistributionWithoutTruncation(argv): 22 | BASE_DIR = os.path.dirname(argv[0]) 23 | file = open(argv[0],'r') 24 | 25 | binWidth = int(argv[1]) 26 | websiteToClassify = argv[2] 27 | 28 | if not os.path.exists(BASE_DIR + "/" + websiteToClassify): 29 | os.makedirs(BASE_DIR + "/" + websiteToClassify) 30 | 31 | trainSet = open(BASE_DIR + "/" + websiteToClassify + "/TrainSet_" + str(binWidth) + ".csv", 'w') 32 | testSet = open(BASE_DIR + "/" + websiteToClassify + "/TestSet_" + str(binWidth) + ".csv", 'w') 33 | 34 | 35 | #Set for all possible quantized buckets 36 | binsUsedByWebsite = set() 37 | minBucket = RoundToNearest(-1500, binWidth) 38 | maxBucket = RoundToNearest(1500, binWidth) + 1 39 | for size in range(minBucket, maxBucket, binWidth): 40 | binsUsedByWebsite.add(RoundToNearest(size, binWidth)) 41 | 42 | 43 | websiteTrainInstances = int(argv[3]) 44 | websiteTestInstances = int(argv[4]) 45 | 46 | ################################################ 47 | #Build csv with quantized bins 48 | ################################################ 49 | 50 | # Write CSV datasets header (with bins used by the target website) 51 | for size in range(minBucket, maxBucket, binWidth): 52 | if (size in binsUsedByWebsite): 53 | trainSet.write("packetLengthBin_" + str(size) + ", ") 54 | testSet.write("packetLengthBin_" + str(size) + ", ") 55 | trainSet.write("class\n") 56 | testSet.write("class\n") 57 | 58 | 59 | file = open(argv[0],'r') 60 | l = file.readline() #Take out dataset header 61 | l = file.readline() #Take out dataset header 62 | trainCounter = 0 63 | testCounter = 0 64 | currWebsite = "" 65 | trainData = [] 66 | testData =[] 67 | 68 | for lineNumber, l in enumerate(file.readlines()): 69 | lineSplit = l.rstrip('\n').split(" ") 70 | if (lineNumber % 2 == 1): #Gather website data 71 | website = lineSplit[0][:-1] 72 | if(website != currWebsite): 73 | currWebsite = website 74 | trainCounter = 0 75 | testCounter = 0 76 | 77 | #Build container for sample distribution 78 | website_bin_distribution = OrderedDict() 79 | for i in sorted(binsUsedByWebsite): 80 | 
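#Start every quantized packet-length bin at a zero count for this sample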
website_bin_distribution[i] = 0 81 | 82 | #Add useful bins to the sample distribution 83 | for packet_size in lineSplit[1:-1]: 84 | packet_size_binned = RoundToNearest(int(packet_size), binWidth) 85 | if(packet_size_binned in binsUsedByWebsite): 86 | website_bin_distribution[packet_size_binned] += 1 87 | 88 | 89 | if(trainCounter < websiteTrainInstances): 90 | bin_list = [] 91 | for i in website_bin_distribution: 92 | bin_list.append(str(website_bin_distribution[i])) 93 | trainData.append(",".join(bin_list) + ", " + currWebsite + "\n") 94 | trainCounter += 1 95 | elif(testCounter < websiteTestInstances): 96 | bin_list = [] 97 | for i in website_bin_distribution: 98 | bin_list.append(str(website_bin_distribution[i])) 99 | testData.append(",".join(bin_list) + ", " + currWebsite + "\n") 100 | #Account for processed sample 101 | testCounter += 1 102 | 103 | trainSet.write("".join(trainData)) 104 | testSet.write("".join(testData)) 105 | trainSet.close() 106 | testSet.close() 107 | 108 | if __name__ == "__main__": 109 | extractDistributionWithoutTruncation(sys.argv[1:]) 110 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/AllWebsiteAnalysis/runExperiment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import subprocess as sub 4 | import shutil 5 | import time 6 | import weka.core.jvm as jvm 7 | import weka.core.converters as converters 8 | from weka.core.converters import Loader 9 | from weka.classifiers import Classifier 10 | from weka.classifiers import Evaluation 11 | 12 | 13 | dataset_location = "Data/openssh.data" 14 | 15 | #export JAVA_HOME=/Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home/ 16 | def ClassifyParam(mode, binWidths): 17 | if not os.path.exists("classificationResults"): 18 | os.makedirs("classificationResults") 19 | 20 | if("normal" in mode): 21 | file = open("classificationResults/AllVsAll.csv","w") 22 | 23 | file.write("BinWidth, Accuracy\n") 24 | 25 | for binWidth in binWidths: 26 | 27 | train_set = "Data/arff/TrainSet_%s.arff"%(binWidth) 28 | test_set = "Data/arff/TestSet_%s.arff"%(binWidth) 29 | print "Loading Datasets..." 30 | 31 | train_data = converters.load_any_file(train_set) 32 | test_data = converters.load_any_file(test_set) 33 | #Set class attribute 34 | train_data.class_is_last() 35 | test_data.class_is_last() 36 | print "Dataset Loaded!" 37 | 38 | 39 | classifier_name = "weka.classifiers.meta.FilteredClassifier" 40 | 41 | classifier = Classifier(classname=classifier_name, options=[ 42 | "-F", "weka.filters.unsupervised.attribute.StringToWordVector -R first-last -W 1000 -C -T -N 1 -stemmer weka.core.stemmers.NullStemmer -M 1 -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r\\\\n\\\\t.,;:\\\\\\\'\\\\\\\"()?!\\\"\"", 43 | "-W", "weka.classifiers.bayes.NaiveBayesMultinomial"]) 44 | 45 | 46 | start_train = time.time() 47 | classifier.build_classifier(train_data) 48 | end_train = time.time() 49 | print "Train\t%s\t%s"%(binWidth, end_train-start_train) 50 | 51 | for index, inst in enumerate(test_data): 52 | if(index == 0): 53 | start_sample = time.time() 54 | classifier.classify_instance(inst) 55 | end_sample = time.time() 56 | print "Sample\t%s\t%s"%(binWidth, end_sample-start_sample) 57 | 58 | print "Evaluating w/ Multinomial Naive Bayes classifier. 
BinWidth = %s"%(binWidth) 59 | evaluation = Evaluation(test_data) 60 | start_batch = time.time() 61 | evaluation.test_model(classifier, test_data) 62 | end_batch = time.time() 63 | print "Batch\t%s\t%s"%(binWidth,end_batch-start_batch) 64 | 65 | 66 | print evaluation.summary() 67 | acc = evaluation.percent_correct/100.0 68 | print "Percent correct: " + str(acc) 69 | 70 | file.write("%s, %s\n"%(binWidth, acc)) 71 | file.close() 72 | 73 | 74 | 75 | def QuantizeAndCreateTrainTestDataset(binWidths): 76 | #2/3 train, 1/3 test (150 total, 100 -50) 77 | # Currently 50-50 78 | target_train_instances = 75 79 | target_test_instances = 75 80 | 81 | #Placeholder website for parsing script to work (compatibility issues) 82 | website = "www.flickr.com" 83 | 84 | for binWidth in binWidths: 85 | simArgs = "python ParsingUtilities/CSVParseWebsiteUnbalanced.py %s %s %s %s %s"%(dataset_location, binWidth, website, target_train_instances, target_test_instances) 86 | print "Quantizing dataset. binWidth = %s"%(binWidth) 87 | sub.call(simArgs, shell = True) 88 | 89 | print "Moving files to Data directory root" 90 | src_folder = "Data/www.flickr.com/" 91 | files = os.listdir(src_folder) 92 | for f in files: 93 | shutil.move(src_folder+f, "Data/") 94 | os.rmdir(src_folder) 95 | 96 | 97 | def BuildQuantizedArffDatasets(mode, binWidths): 98 | if not os.path.exists("Data/arff"): 99 | os.makedirs("Data/arff") 100 | 101 | if("normal" in mode): 102 | train_set = "TrainSet" 103 | test_set = "TestSet" 104 | 105 | for binWidth in binWidths: 106 | simArgs = "python ParsingUtilities/CSVParseToWeka.py Data/%s_%s.csv Data/arff/%s_%s.arff"%(train_set, binWidth, train_set, binWidth) 107 | print "Generating train dataset. binWidth = %s"%(binWidth) 108 | sub.call(simArgs, shell = True) 109 | 110 | simArgs = "python ParsingUtilities/CSVParseToWeka.py Data/%s_%s.csv Data/arff/%s_%s.arff"%(test_set, binWidth, test_set, binWidth) 111 | print "Generating test dataset. binWidth = %s"%(binWidth) 112 | sub.call(simArgs, shell = True) 113 | 114 | 115 | 116 | 117 | if __name__ == "__main__": 118 | 119 | #Quantization 120 | BIN_WIDTH = [1, 4, 8, 16, 32, 64, 128, 256] 121 | 122 | QuantizeAndCreateTrainTestDataset(BIN_WIDTH) 123 | 124 | 125 | BuildQuantizedArffDatasets("normal", BIN_WIDTH) 126 | 127 | #Classify 128 | #Start WEKA execution 129 | jvm.start(max_heap_size="4096m") 130 | 131 | #Classify 132 | ClassifyParam("normal", BIN_WIDTH) 133 | 134 | #stop weka execution 135 | jvm.stop() 136 | 137 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/FacetAnalysis/runExperiment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import subprocess as sub 4 | 5 | from generateFeatures import CompressFeatures, SplitDataset, CompressFeaturesBasedOnTrainData, MergeTestData, ExtractPacketSample 6 | from classifier import GenerateFeatureImportanceBasedOnTrainData, ClassifyTestDataBasedOnModel, BuildModelBasedOnTrainData 7 | from generateFigures import GenerateFigures 8 | from online_sketching import CreateBinaryVectorRepresentation 9 | from compressive_ta import CreateCompressiveRepresentation 10 | 11 | 12 | def Experiment(BIN_WIDTH, TOPK, DELTAS, MEMORY_FACTORS, DATASET_SPLIT, COVERT_FLOWS_PERC, N_FLOWS, ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO): 13 | """ 14 | Phase 1a) 15 | Use full information and generate the best buckets. 16 | Datasets are split into half. 
17 | 18 | We use the first half to train/test a classifier with a balanced dataset in HoldOut 90/10 19 | """ 20 | CompressFeatures(BIN_WIDTH, [TOPK[-1]]) 21 | SplitDataset(DATASET_SPLIT, N_FLOWS, 1) 22 | GenerateFeatureImportanceBasedOnTrainData("normal", BIN_WIDTH, [TOPK[-1]]) 23 | 24 | """ 25 | Phase 1b) 26 | Quantize, truncate and classify according to the best buckets found 27 | The first half of each dataset is again used for train/test the classifier with a balanced dataset in HoldOut 90/10 28 | However, only the top-K bins are used for performing classification 29 | 30 | The built model is saved to use in Phase 2. 31 | """ 32 | CompressFeaturesBasedOnTrainData(BIN_WIDTH, TOPK[:-1]) 33 | SplitDataset(DATASET_SPLIT, N_FLOWS, COVERT_FLOWS_PERC) 34 | BuildModelBasedOnTrainData("normal", BIN_WIDTH, TOPK) 35 | 36 | 37 | """ 38 | Phase 2 39 | Classify new flows using quantized/truncated distributions using the previously built model 40 | The second half of each dataset is used for train/test the classifier with an unbalanced dataset 41 | """ 42 | #Quantization + Truncation without sketches 43 | ClassifyTestDataBasedOnModel("normal", BIN_WIDTH, TOPK, N_FLOWS) 44 | 45 | #Generate figures 46 | GenerateFigures(BIN_WIDTH, TOPK, DELTAS, MEMORY_FACTORS, N_FLOWS) 47 | 48 | """ 49 | Online Sketching - Coskun et al. 50 | """ 51 | CreateBinaryVectorRepresentation(BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE) 52 | BuildModelBasedOnTrainData("online", BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE) 53 | ClassifyTestDataBasedOnModel("online", BIN_WIDTH, [TOPK[-1]], N_FLOWS, ONLINE_SKETCH_SIZE) 54 | 55 | """ 56 | Compressive TA adjusted to packet distribution 57 | """ 58 | CreateCompressiveRepresentation("compressive_gaussian", BIN_WIDTH, [TOPK[-1]], SIGMA_PARAM, COMPRESSIVE_RATIO) 59 | BuildModelBasedOnTrainData("compressive_gaussian", BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 60 | ClassifyTestDataBasedOnModel("compressive_gaussian", BIN_WIDTH, [TOPK[-1]], N_FLOWS, ONLINE_SKETCH_SIZE, DELTAS, MEMORY_FACTORS, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 61 | 62 | CreateCompressiveRepresentation("compressive_bernoulli", BIN_WIDTH, [TOPK[-1]], SIGMA_PARAM, COMPRESSIVE_RATIO) 63 | BuildModelBasedOnTrainData("compressive_bernoulli", BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 64 | ClassifyTestDataBasedOnModel("compressive_bernoulli", BIN_WIDTH, [TOPK[-1]], N_FLOWS, ONLINE_SKETCH_SIZE, DELTAS, MEMORY_FACTORS, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 65 | 66 | 67 | if __name__ == "__main__": 68 | 69 | #Quantization 70 | BIN_WIDTH = [1, 4, 8, 16, 32, 64, 128, 256] 71 | 72 | #Truncation Top-K features 73 | TOPK = [5, 10, 20, 30, 40, 50, 1500] 74 | 75 | #Online Sketch 76 | ONLINE_SKETCH_SIZE = [64, 128, 256, 512, 1024, 2048] 77 | 78 | #Proportion of regular flows to input in sketch 79 | COVERT_FLOWS_PERC = 1 80 | 81 | #Proportion to split training phase (1) and testing phase (2) 82 | DATASET_SPLIT = 0.5 83 | 84 | #Total amount of flows per dataset 85 | N_FLOWS = 1000 86 | 87 | #Standard deviation of Gaussian distribution (compressive TA) 88 | SIGMA_PARAM = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100] 89 | 90 | #Number of packets to compute compressive TA representation 91 | NUMBER_OF_PACKETS = [1000, 2000, 4000] 92 | 93 | #Compression Ratio for Compressive TA 94 | COMPRESSIVE_RATIO = [4, 8, 16, 32, 64, 128, 256] 95 | 96 | #Deprecated 97 | DELTAS = [0.95] 98 | MEMORY_FACTORS = [8, 4, 2, 1] 99 | 100 | #Run 
Experiment: 101 | Experiment(BIN_WIDTH, TOPK, DELTAS, MEMORY_FACTORS, DATASET_SPLIT, COVERT_FLOWS_PERC, N_FLOWS, ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 102 | 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/DeltaShaperAnalysis/runExperiment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import subprocess as sub 4 | 5 | from generateFeatures import CompressFeatures, SplitDataset, CompressFeaturesBasedOnTrainData, MergeTestData, ExtractPacketSample 6 | from classifier import GenerateFeatureImportanceBasedOnTrainData, ClassifyTestDataBasedOnModel, BuildModelBasedOnTrainData 7 | from generateFigures import GenerateFigures, GenerateFiguresLines 8 | from online_sketching import CreateBinaryVectorRepresentation 9 | from compressive_ta import CreateCompressiveRepresentation 10 | 11 | def Experiment(BIN_WIDTH, TOPK, DELTAS, MEMORY_FACTORS, DATASET_SPLIT, COVERT_FLOWS_PERC, N_FLOWS, ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO): 12 | """ 13 | Phase 1a) 14 | Use full information and generate the best buckets. 15 | Datasets are split into half. 16 | 17 | We use the first half to train/test a classifier with a balanced dataset in HoldOut 90/10 18 | """ 19 | CompressFeatures(BIN_WIDTH, [TOPK[-1]]) 20 | SplitDataset(DATASET_SPLIT, N_FLOWS, 1) 21 | GenerateFeatureImportanceBasedOnTrainData("normal", BIN_WIDTH, [TOPK[-1]]) 22 | 23 | """ 24 | Phase 1b) 25 | Quantize, truncate and classify according to the best buckets found 26 | The first half of each dataset is again used for train/test the classifier with a balanced dataset in HoldOut 90/10 27 | However, only the top-K bins are used for performing classification 28 | 29 | The built model is saved to use in Phase 2. 30 | """ 31 | CompressFeaturesBasedOnTrainData(BIN_WIDTH, TOPK[:-1]) 32 | SplitDataset(DATASET_SPLIT, N_FLOWS, COVERT_FLOWS_PERC) 33 | BuildModelBasedOnTrainData("normal", BIN_WIDTH, TOPK) 34 | 35 | """ 36 | Phase 2 37 | Classify new flows using quantized/truncated distributions using the previously built model 38 | The second half of each dataset is used for train/test the classifier with an unbalanced dataset 39 | """ 40 | #Quantization + Truncation 41 | ClassifyTestDataBasedOnModel("normal", BIN_WIDTH, TOPK, N_FLOWS, ONLINE_SKETCH_SIZE) 42 | 43 | #Generate figures 44 | GenerateFiguresLines(BIN_WIDTH, TOPK, DELTAS, MEMORY_FACTORS, N_FLOWS) 45 | 46 | 47 | """ 48 | Online Sketching - Coskun et al. 
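Re-encodes the quantized flow features as fixed-size binary vectors (CreateBinaryVectorRepresentation); vector sizes are taken from ONLINE_SKETCH_SIZE.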
49 | """ 50 | CreateBinaryVectorRepresentation(BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE) 51 | BuildModelBasedOnTrainData("online", BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE) 52 | ClassifyTestDataBasedOnModel("online", BIN_WIDTH, [TOPK[-1]], N_FLOWS, ONLINE_SKETCH_SIZE) 53 | 54 | 55 | """ 56 | Compressive TA adjusted to packet distribution 57 | """ 58 | CreateCompressiveRepresentation("compressive_gaussian", BIN_WIDTH, [TOPK[-1]], SIGMA_PARAM, COMPRESSIVE_RATIO) 59 | BuildModelBasedOnTrainData("compressive_gaussian", BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 60 | ClassifyTestDataBasedOnModel("compressive_gaussian", BIN_WIDTH, [TOPK[-1]], N_FLOWS, ONLINE_SKETCH_SIZE, DELTAS, MEMORY_FACTORS, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 61 | 62 | CreateCompressiveRepresentation("compressive_bernoulli", BIN_WIDTH, [TOPK[-1]], SIGMA_PARAM, COMPRESSIVE_RATIO) 63 | BuildModelBasedOnTrainData("compressive_bernoulli", BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 64 | ClassifyTestDataBasedOnModel("compressive_bernoulli", BIN_WIDTH, [TOPK[-1]], N_FLOWS, ONLINE_SKETCH_SIZE, DELTAS, MEMORY_FACTORS, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 65 | 66 | 67 | if __name__ == "__main__": 68 | 69 | #Quantization 70 | BIN_WIDTH = [1, 4, 8, 16, 32, 64, 128, 256] 71 | 72 | #Truncation Top-K features 73 | TOPK = [5, 10, 20, 30, 40, 50, 1500] 74 | 75 | #Online Sketch 76 | ONLINE_SKETCH_SIZE = [64, 128, 256, 512, 1024, 2048] 77 | 78 | #Proportion of regular flows to input in sketch 79 | COVERT_FLOWS_PERC = 1 80 | 81 | #Proportion to split training phase (1) and testing phase (2) 82 | DATASET_SPLIT = 0.5 83 | 84 | #Total amount of flows per dataset 85 | N_FLOWS = 300 86 | 87 | #Standard deviation of Gaussian distribution (compressive TA) 88 | SIGMA_PARAM = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100] 89 | 90 | #Number of packets to compute compressive TA representation 91 | NUMBER_OF_PACKETS = [1000, 2000, 4000] 92 | 93 | #Compression Ratio for Compressive TA 94 | COMPRESSIVE_RATIO = [4, 8, 16, 32, 64, 128, 256] 95 | 96 | #Deprecated 97 | DELTAS = [0.95] 98 | MEMORY_FACTORS = [8, 4, 2, 1] 99 | 100 | #Run Experiment: 101 | Experiment(BIN_WIDTH, TOPK, DELTAS, MEMORY_FACTORS, DATASET_SPLIT, COVERT_FLOWS_PERC, N_FLOWS, ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO) 102 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/SingleWebsiteAnalysis/runExperiment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import subprocess as sub 4 | import time 5 | import weka.core.jvm as jvm 6 | import weka.core.converters as converters 7 | from weka.core.converters import Loader 8 | from weka.classifiers import Classifier 9 | from weka.classifiers import Evaluation 10 | 11 | from generateFigures import GenerateFigures 12 | 13 | dataset_location = "Data/openssh.data" 14 | 15 | #export JAVA_HOME=/Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home/ 16 | 17 | def ClassifyParam(website, mode, binWidths, truncation_modes=["full", "truncated"]): 18 | if not os.path.exists("classificationResults"): 19 | os.makedirs("classificationResults") 20 | 21 | 22 | if("normal" in mode): 23 | for truncation in truncation_modes: 24 | file = open("classificationResults/SingleWebsite_%s_%s.csv"%(truncation, website),"w") 25 | 
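# One results csv per (website, truncation mode); each row holds the metrics measured for a single bin width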
file.write("BinWidth, Accuracy, FalsePositiveRate, FalseNegativeRate\n") 26 | 27 | for binWidth in binWidths: 28 | 29 | train_set_file = "TrainSet_%s_%s.arff"%(truncation, binWidth) 30 | train_set = "Data/%s/arff/%s"%(website, train_set_file) 31 | test_set = "Data/%s/arff/%s"%(website, train_set_file.replace("TrainSet", "TestSet")) 32 | 33 | print "Loading Datasets..." 34 | print "Train: " + train_set 35 | train_data = converters.load_any_file(train_set) 36 | print "Test: " + test_set 37 | test_data = converters.load_any_file(test_set) 38 | 39 | #Set class attribute 40 | train_data.class_is_last() 41 | test_data.class_is_last() 42 | print "Dataset Loaded!" 43 | 44 | 45 | classifier_name = "weka.classifiers.meta.FilteredClassifier" 46 | 47 | classifier = Classifier(classname=classifier_name, options=[ 48 | "-F", "weka.filters.unsupervised.attribute.StringToWordVector -R first-last -W 1000 -C -T -N 1 -stemmer weka.core.stemmers.NullStemmer -M 1 -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r\\\\n\\\\t.,;:\\\\\\\'\\\\\\\"()?!\\\"\"", 49 | "-W", "weka.classifiers.bayes.NaiveBayesMultinomial"]) 50 | 51 | start_train = time.time() 52 | classifier.build_classifier(train_data) 53 | end_train = time.time() 54 | print "Train\t%s\t%s"%(binWidth, end_train-start_train) 55 | 56 | for index, inst in enumerate(test_data): 57 | if(index == 0): 58 | start_sample = time.time() 59 | classifier.classify_instance(inst) 60 | end_sample = time.time() 61 | print "Sample\t%s\t%s"%(binWidth, end_sample-start_sample) 62 | 63 | print "Evaluating w/ Multinomial Naive Bayes classifier. BinWidth = %s"%(binWidth) 64 | evaluation = Evaluation(test_data) 65 | start_batch = time.time() 66 | evaluation.test_model(classifier, test_data) 67 | end_batch = time.time() 68 | print "Batch\t%s\t%s"%(binWidth,end_batch-start_batch) 69 | 70 | 71 | print evaluation.summary() 72 | print evaluation.matrix() 73 | #Just as an example, we're measuring the fpr and fnr of the website indexed as class 1 74 | 75 | tp = evaluation.num_true_positives(1) 76 | tn = evaluation.num_true_negatives(1) 77 | fp = evaluation.num_false_positives(1) 78 | fn = evaluation.num_false_negatives(1) 79 | 80 | acc = (tp+tn)/float(tp+tn+fp+fn) 81 | fpr = evaluation.false_positive_rate(1) 82 | fnr = evaluation.false_negative_rate(1) 83 | 84 | print "Accuracy: %s"%(acc) 85 | print "False Positive Rate: %s"%(fpr) 86 | print "False Negative Rate: %s"%(fnr) 87 | 88 | file.write("%s, %s, %s, %s\n"%(binWidth, acc, fpr, fnr)) 89 | file.close() 90 | 91 | 92 | 93 | def QuantizeAndCreateUnbalancedTrainTestDataset(truncate, website, binWidths): 94 | #2/3 train, 1/3 test (150 total, 100 -50) 95 | target_train_instances = 75 96 | target_test_instances = 75 97 | 98 | if(truncate): 99 | truncation = 0 100 | 101 | #Init bookeeping of truncated bins 102 | if not os.path.exists("truncationInfo"): 103 | os.makedirs("truncationInfo") 104 | file = open("truncationInfo/" + website + ".csv", "w") 105 | file.write("BinWidth, TruncatedBins\n") 106 | file.close() 107 | else: 108 | truncation = 1 109 | 110 | for binWidth in binWidths: 111 | simArgs = "python ParsingUtilities/CSVParseWebsiteUnbalanced.py %s %s %s %s %s %s"%(dataset_location, binWidth, website, target_train_instances, target_test_instances, truncation) 112 | print "Quantizing dataset. 
binWidth = %s"%(binWidth) + ", truncation = " + str(truncate) + ", website = " + website 113 | sub.call(simArgs, shell = True) 114 | 115 | 116 | 117 | def BuildQuantizedArffDatasets(website, mode): 118 | if not os.path.exists("Data/%s/arff"%(website)): 119 | os.makedirs("Data/%s/arff"%(website)) 120 | 121 | if("normal" in mode): 122 | for f in os.listdir("Data/%s"%(website)): 123 | if(".csv" in f and not f.startswith("CountMin")): 124 | 125 | simArgs = "python ParsingUtilities/CSVParseToWeka.py Data/%s/%s Data/%s/arff/%s %s"%(website, f, website, f[:-3] + "arff", website) 126 | print "Generating dataset. File = " + f[:-3] + "arff" 127 | sub.call(simArgs, shell = True) 128 | 129 | 130 | if __name__ == "__main__": 131 | modes = ["normal", "sketch"] 132 | 133 | TRUNCATION_MODES = [True, False] 134 | 135 | #Quantization 136 | BIN_WIDTH = [1, 4, 8, 16, 32, 64, 128, 256] 137 | 138 | WEBSITES = [ 139 | "www.citibank.de", 140 | "mail.google.com", 141 | "www.youtube.com", 142 | "www.amazon.com", 143 | "www.imdb.com", 144 | "www.flickr.com" 145 | ] 146 | 147 | jvm.start(max_heap_size="4096m") 148 | for website in WEBSITES: 149 | for truncate in TRUNCATION_MODES: 150 | # Generates the train and test dataset 151 | #Proportion should be set inside this function 152 | QuantizeAndCreateUnbalancedTrainTestDataset(truncate, website, BIN_WIDTH) 153 | 154 | BuildQuantizedArffDatasets(website, "normal") 155 | 156 | """#Delete raw datasets 157 | for file in os.listdir("Data/" + website): 158 | if(file.endswith(".csv")): 159 | os.remove("Data/" + website + "/" + file)""" 160 | 161 | #Classify 162 | ClassifyParam(website, "normal", BIN_WIDTH) 163 | 164 | """#Delete arff datasets 165 | for file in os.listdir("Data/"): 166 | if(file.endswith(".arff")): 167 | os.remove("Data/" + file)""" 168 | 169 | #Generate figures 170 | GenerateFigures() 171 | jvm.stop() 172 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/peershark/Flow.py: -------------------------------------------------------------------------------- 1 | from Packet import * 2 | 3 | #input: list of packets, timegap - real number 4 | #return val: list of flows 5 | # 6 | #merges collection of packets(objects) into collection of flows(many-to-one) 7 | #Working: group packets with same ip-pair(direction irrelevant) and merge all packets for 8 | #which |packet1.time - packet2.time| < threshold(timegap) 9 | def packetsToFlows(packets,timegap): 10 | #sanity check for 0 packets 11 | if len(packets) == 0: 12 | return None 13 | 14 | outputflows = [] 15 | 16 | #perform a radix-sort to group together packets 17 | #with same ip-pairs(packet.key represents an ip-pair) 18 | #and sort these packets according to timestamp 19 | packets.sort(key = lambda packet:packet.timestamp) 20 | packets.sort(key = lambda packet:packet.key) 21 | 22 | nextflow = Flow(None) 23 | for nextpacket in packets: 24 | #if ip-pairs dont match or time-difference of prev and current packet greater 25 | #than timegap, create a new flow 26 | if (nextflow.key != nextpacket.key) or ((nextpacket.timestamp - nextflow.getEnd()) > timegap): 27 | nextflow = Flow(nextpacket) 28 | outputflows.append(nextflow) 29 | #if not then add packet to previous flow 30 | else: 31 | nextflow.addPacket(nextpacket) 32 | 33 | return outputflows 34 | 35 | #same as function packetsToFlow but merges flows instead of packets 36 | def combineFlows(flows, flowgap): 37 | if len(flows) == 0: 38 | return None 39 | 40 | outputflows = [] 41 | 42 | flows.sort(key = 
lambda flow:flow.getStart()) 43 | flows.sort(key = lambda flow:flow.key) 44 | 45 | nextoutflow = Flow(None) 46 | for nextflow in flows: 47 | if (nextoutflow.key != nextflow.key) or ((nextflow.getStart() - nextoutflow.getEnd()) > flowgap): 48 | nextoutflow = nextflow 49 | outputflows.append(nextoutflow) 50 | else: 51 | nextoutflow.addFlow(nextflow) 52 | 53 | return outputflows 54 | 55 | def getCustomWeightedAvg(n1, w1, n2, w2): 56 | num = 0 57 | den = 0 58 | if w1 > 0: 59 | num += w1 * n1 60 | den += w1 61 | if w2 > 0: 62 | num += w2 * n2 63 | den += w2 64 | if den <= 0: 65 | den = 1 66 | return num / den 67 | 68 | 69 | #write list of flows into file in desired format 70 | def writeFlowsToFile(flowlist, filename): 71 | outfile = open(filename, 'w') 72 | 73 | to_write = [] 74 | for flow in flowlist: 75 | to_write.append( 76 | socket.inet_ntoa(flow.ip1) + ',' + 77 | socket.inet_ntoa(flow.ip2) + ',' + 78 | str(flow.n_packet1) + ',' + 79 | str(flow.n_byte1) + ',' + 80 | '%.6f'%flow.t_start1 + ',' + 81 | '%.6f'%flow.t_end1 + ',' + 82 | '%.6f'%flow.getInterArrivaltime1() + ',' + 83 | str(flow.n_packet2) + ',' + 84 | str(flow.n_byte2) + ',' + 85 | '%.6f'%flow.t_start2 + ',' + 86 | '%.6f'%flow.t_end2 + ',' + 87 | '%.6f'%flow.getInterArrivaltime2()) 88 | 89 | outfile.write("\n".join(to_write)) 90 | outfile.close() 91 | 92 | #class which defines the structure of flows 93 | class Flow: 94 | #constructor of default flow 95 | def __init__(self,firstpacket): 96 | if firstpacket == None: 97 | self.ip1 = None 98 | self.ip2 = None 99 | self.key = None 100 | self.n_packet1 = 0 101 | self.n_byte1 = 0 102 | self.t_start1 = 0 103 | self.t_end1 = 0 104 | self.t_interarrival1 = [] 105 | self.n_packet2 = 0 106 | self.n_byte2 = 0 107 | self.t_start2 = 0 108 | self.t_end2 = 0 109 | self.t_interarrival2 = [] 110 | else: 111 | if firstpacket.source < firstpacket.dest: 112 | self.ip1 = firstpacket.source 113 | self.ip2 = firstpacket.dest 114 | self.n_packet1 = 1 115 | self.n_byte1 = firstpacket.size 116 | self.t_start1 = firstpacket.timestamp 117 | self.t_end1 = firstpacket.timestamp 118 | self.t_interarrival1 = [] 119 | self.n_packet2 = 0 120 | self.n_byte2 = 0 121 | self.t_start2 = 0 122 | self.t_end2 = 0 123 | self.t_interarrival2 = [] 124 | else: 125 | self.ip1 = firstpacket.dest 126 | self.ip2 = firstpacket.source 127 | self.n_packet1 = 0 128 | self.n_byte1 = 0 129 | self.t_start1 = 0 130 | self.t_end1 = 0 131 | self.t_interarrival1 = [] 132 | self.n_packet2 = 1 133 | self.n_byte2 = firstpacket.size 134 | self.t_start2 = firstpacket.timestamp 135 | self.t_end2 = firstpacket.timestamp 136 | self.t_interarrival2 = [] 137 | self.key = firstpacket.key 138 | 139 | #add a flow to the current flow (by changing volume and duration) 140 | def addFlow(self,flow): 141 | self.t_interarrival1 += flow.t_interarrival1 142 | self.t_interarrival2 += flow.t_interarrival2 143 | self.n_packet1 += flow.n_packet1 144 | self.n_packet2 += flow.n_packet2 145 | self.n_byte1 += flow.n_byte1 146 | self.n_byte2 += flow.n_byte2 147 | 148 | temp = min(self.t_start1,flow.t_start1) 149 | if temp == 0: 150 | self.t_start1 = self.t_start1 + flow.t_start1 151 | else: 152 | self.t_start1 = temp 153 | 154 | temp = min(self.t_start2,flow.t_start2) 155 | if temp == 0: 156 | self.t_start2 = self.t_start2 + flow.t_start2 157 | else: 158 | self.t_start2 = temp 159 | 160 | if(self.t_end1 < flow.t_end1): 161 | self.t_end1 = flow.t_end1 162 | if(self.t_end2 < flow.t_end2): 163 | self.t_end2 = flow.t_end2 164 | 165 | #add a packet to the current flow (by 
changing volume and duration) 166 | def addPacket(self,packet): 167 | if packet.source == self.ip1 and packet.dest == self.ip2: 168 | 169 | #initialize flow if not initialized 170 | if self.n_packet1 == 0: 171 | self.t_start1 = packet.timestamp 172 | self.t_end1 = packet.timestamp 173 | self.n_packet1 += 1 174 | self.n_byte1 += packet.size 175 | return 176 | 177 | if self.t_end1 < packet.timestamp: 178 | self.t_interarrival1.append(packet.timestamp-self.t_end1) 179 | self.t_end1 = packet.timestamp 180 | elif self.t_start1 > packet.timestamp: 181 | self.t_interarrival1.append(self.t_start1-packet.timestamp) 182 | self.t_start1 = packet.timestamp 183 | self.n_packet1 += 1 184 | self.n_byte1 += packet.size 185 | 186 | elif packet.source == self.ip2 and packet.dest == self.ip1: 187 | 188 | #initialize flow if not initialized 189 | if self.n_packet2 == 0: 190 | self.t_start2 = packet.timestamp 191 | self.t_end2 = packet.timestamp 192 | self.n_packet2 += 1 193 | self.n_byte2 += packet.size 194 | return 195 | 196 | if self.t_end2 < packet.timestamp: 197 | self.t_interarrival2.append(packet.timestamp-self.t_end2) 198 | self.t_end2 = packet.timestamp 199 | elif self.t_start2 > packet.timestamp: 200 | self.t_interarrival2.append(self.t_start2-packet.timestamp) 201 | self.t_start2 = packet.timestamp 202 | self.n_packet2 += 1 203 | self.n_byte2 += packet.size 204 | 205 | else: 206 | raise Exception('packet does not belong to flow') 207 | 208 | def getDurationInSeconds(self): 209 | return self.getEnd() - self.getStart() 210 | 211 | def getInterArrivaltime(self): 212 | combined = (self.t_interarrival1+self.t_interarrival2).sort() 213 | if len(combined) > 0: 214 | return combined[len(combined)/2] 215 | return 0 216 | 217 | def getInterArrivaltime1(self): 218 | self.t_interarrival1.sort() 219 | if len(self.t_interarrival1) > 0: 220 | return self.t_interarrival1[len(self.t_interarrival1)/2] 221 | return 0 222 | 223 | def getInterArrivaltime2(self): 224 | self.t_interarrival2.sort() 225 | if len(self.t_interarrival2) > 0: 226 | return self.t_interarrival2[len(self.t_interarrival2)/2] 227 | return 0 228 | 229 | def getNoOfBytes(self): 230 | return self.n_byte1 + self.n_byte2 231 | 232 | def getNoOfPackets(self): 233 | return self.n_packet1 + self.n_packet2 234 | 235 | def getStart(self): 236 | temp = min(self.t_start1, self.t_start2) 237 | if temp == 0: 238 | return self.t_start1 + self.t_start2 239 | else: 240 | return temp 241 | 242 | def getEnd(self): 243 | return max(self.t_end1, self.t_end2) 244 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/BotnetAnalysis/runExperiment.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import math 4 | import subprocess as sub 5 | import shutil 6 | import csv 7 | import numpy as np 8 | import multiprocessing as MP 9 | import time 10 | 11 | import gc 12 | 13 | import warnings 14 | warnings.filterwarnings("ignore", category=FutureWarning) 15 | 16 | from sklearn.metrics import accuracy_score, confusion_matrix 17 | from sklearn.model_selection import train_test_split, StratifiedKFold 18 | from sklearn.ensemble import RandomForestClassifier 19 | from joblib import dump, load 20 | 21 | from peershark.GenerateFlows import runGenerateFlows 22 | from peershark.generateSuperFlows import runGenerateSuperFlows 23 | from peershark.createTrainingData import runTrainingDataGenerator 24 | from quantize import QuantizeDataset 25 | 26 | data_location = "Data/" 
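# Note on the metrics computed in Classify() below: with
# confusion_matrix(y_true, y_pred, labels=[neg, pos]).ravel(), the second label is
# treated as the positive class and the flattened 2x2 matrix is (TN, FP, FN, TP), so
#   precision = TP / (TP + FP)
#   recall    = TP / (TP + FN)
#   FPR       = FP / (FP + TN)
# Calling confusion_matrix twice with the label order swapped yields the per-class
# (benign and malicious) scores appended to classificationResults/results.csv.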
27 | 28 | 29 | def Classify(binWidth, ipt_bin_width): 30 | dataset_path = 'TrainingData/Datasets/Dataset_%s_%s.csv'%(binWidth, ipt_bin_width) 31 | with open(dataset_path, 'rb') as dataset_file: 32 | print "Loading Dataset: %s ..."%(dataset_path) 33 | 34 | attributes = [] 35 | labels = [] 36 | csv_reader = csv.reader(dataset_file) 37 | for n, row in enumerate(csv_reader): 38 | if(n == 0): 39 | continue 40 | else: 41 | attributes.append(row[:-1]) 42 | labels.append(row[-1]) 43 | 44 | #Split data in 66% train, 33% test 45 | train_x, test_x, train_y, test_y = train_test_split(attributes, labels, test_size=0.33, random_state=42, stratify=labels) 46 | 47 | #Define classifier 48 | classifier = RandomForestClassifier(random_state=42) 49 | 50 | #Train classifier 51 | #start_train = time.time() 52 | model = classifier.fit(np.asarray(train_x), np.asarray(train_y)) 53 | #end_train = time.time() 54 | #print "Model trained in %ss"%(end_train-start_train) 55 | 56 | #for sample in test_x: 57 | # start_sample = time.time() 58 | # model.predict(np.asarray(sample).reshape((1,-1))) 59 | # end_sample = time.time() 60 | # print "Sample predicted in %ss"%(end_sample-start_sample) 61 | 62 | #Perform predictions 63 | print "Predicting %s samples"%(len(test_x)) 64 | #start_batch = time.time() 65 | predictions = model.predict(np.asarray(test_x)) 66 | #end_batch = time.time() 67 | #print "Batch predicted in %ss"%(end_batch-start_batch) 68 | 69 | #Generate metrics (benign) 70 | TN, FP, FN, TP = confusion_matrix(np.asarray(test_y), predictions, labels=["malicious","benign"]).ravel() 71 | FPR_BENIGN = float(FP)/(float(FP)+float(TN)) 72 | RECALL_BENIGN = float(TP)/(float(TP) + float(FN)) 73 | PRECISION_BENIGN = float(TP)/(float(TP) + float(FP)) 74 | 75 | print "Model Precision (benign): " + "{0:.3f}".format(PRECISION_BENIGN) 76 | print "Model Recall (benign): " + "{0:.3f}".format(RECALL_BENIGN) 77 | print "Model FPR (benign): " + "{0:.3f}".format(FPR_BENIGN) 78 | 79 | 80 | #Generate metrics (malicious) 81 | TN, FP, FN, TP = confusion_matrix(np.asarray(test_y), predictions, labels=["benign","malicious"]).ravel() 82 | FPR_MALICIOUS = float(FP)/(float(FP)+float(TN)) 83 | RECALL_MALICIOUS = float(TP)/(float(TP) + float(FN)) 84 | PRECISION_MALICIOUS = float(TP)/(float(TP) + float(FP)) 85 | 86 | print "Model Precision (malicious): " + "{0:.3f}".format(PRECISION_MALICIOUS) 87 | print "Model Recall (malicious): " + "{0:.3f}".format(RECALL_MALICIOUS) 88 | print "Model FPR (malicious): " + "{0:.3f}".format(FPR_MALICIOUS) 89 | 90 | results_file = open("classificationResults/results.csv","a") 91 | results_file.write("%s, %s, %s, %s, %s, %s, %s, %s\n"%(binWidth, ipt_bin_width, "{0:.3f}".format(PRECISION_BENIGN), "{0:.3f}".format(RECALL_BENIGN), "{0:.3f}".format(FPR_BENIGN), "{0:.3f}".format(PRECISION_MALICIOUS), "{0:.3f}".format(RECALL_MALICIOUS), "{0:.3f}".format(FPR_MALICIOUS))) 92 | results_file.flush() 93 | results_file.close() 94 | print "" 95 | 96 | 97 | def GenerateDataset(datasets, binWidth, ipt_bin_width): 98 | if not os.path.exists('TrainingData/Datasets'): 99 | os.makedirs('TrainingData/Datasets') 100 | 101 | datasets_to_merge = [] 102 | for dataset in datasets: 103 | dataset = os.path.basename(dataset) 104 | datasets_to_merge.append('TrainingData/%s/trainingdata_%s_%s.csv'%(dataset, binWidth, ipt_bin_width)) 105 | 106 | #Merge datasets in a single file 107 | with open('TrainingData/Datasets/Dataset_%s_%s.csv'%(binWidth, ipt_bin_width), "w") as out_dataset: 108 | 
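# The merged dataset keeps the four PeerShark features per superflow (number of
# packets, bytes transmitted, median inter-packet time, conversation duration) and
# relabels each row by origin: P2PTraffic becomes "benign", Storm and Waledac
# become "malicious".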
out_dataset.write("NumberOfPackets,TotalBytesTransmitted,MedianIPT,ConversationDuration,class\n") 109 | for fname in datasets_to_merge: 110 | with open(fname, 'rb') as infile: 111 | csv_reader = csv.reader(infile) 112 | for row in csv_reader: 113 | new_row = row 114 | if(row[4] == "P2PTraffic"): 115 | new_row[4] = "benign" 116 | else: 117 | new_row[4] = "malicious" 118 | out_dataset.write(",".join(new_row) + "\n") 119 | 120 | 121 | def RunPeerShark(quantized_pcap_data_dir, flow_data_dir, super_flow_data_dir, training_data_dir, bin_width, ipt_bin_width): 122 | #create a semaphore so as not to exceed threadlimit 123 | n_processes = 4 124 | 125 | #Set TIMEGAP 126 | timegap = 2000 127 | 128 | print "Generating Flows with TIMEGAP = %s"%(timegap) 129 | runGenerateFlows(quantized_pcap_data_dir, flow_data_dir, n_processes, timegap) 130 | 131 | #Set FLOWGAP in seconds 132 | flowgap = 3600 133 | 134 | print "Generating SuperFlows with FLOWGAP = %s"%(flowgap) 135 | runGenerateSuperFlows(flow_data_dir, super_flow_data_dir, flowgap) 136 | 137 | print "Generating Training Data..." 138 | runTrainingDataGenerator(super_flow_data_dir, training_data_dir, bin_width, ipt_bin_width) 139 | 140 | 141 | def Experiment(datasets, bin_width, ipt_bin_width): 142 | 143 | if not os.path.exists('FeatureSets'): 144 | os.makedirs('FeatureSets') 145 | 146 | #Quantize datasets according to bin width 147 | #Generate training sets for quantization 148 | for dataset in datasets: 149 | quantized_pcap_data_dir = 'FeatureSets/' + os.path.basename(dataset) + "/" 150 | flow_data_dir = 'FlowData/' + os.path.basename(dataset) + "/" 151 | superflow_data_dir = 'SuperFlowData/' + os.path.basename(dataset) + "/" 152 | training_data_dir = 'TrainingData/' + os.path.basename(dataset) + "/" 153 | 154 | if not os.path.exists('FeatureSets/' + os.path.basename(dataset)): 155 | os.makedirs('FeatureSets/' + os.path.basename(dataset)) 156 | 157 | if not os.path.exists('FlowData/' + os.path.basename(dataset)): 158 | os.makedirs('FlowData/' + os.path.basename(dataset)) 159 | 160 | if not os.path.exists('SuperFlowData/' + os.path.basename(dataset)): 161 | os.makedirs('SuperFlowData/' + os.path.basename(dataset)) 162 | 163 | if not os.path.exists('TrainingData/' + os.path.basename(dataset)): 164 | os.makedirs('TrainingData/' + os.path.basename(dataset)) 165 | 166 | 167 | print "Quantizing %s with BinWidth = %s and IPT_BinWidth = %s"% (dataset, binWidth, ipt_bin_width) 168 | n_processes = 4 169 | QuantizeDataset(dataset, bin_width, ipt_bin_width, n_processes) 170 | RunPeerShark(quantized_pcap_data_dir, flow_data_dir, superflow_data_dir, training_data_dir, bin_width, ipt_bin_width) 171 | 172 | print "Building Dataset..." 173 | GenerateDataset(datasets, binWidth, ipt_bin_width) 174 | 175 | print "Performing Classification..." 
176 | Classify(binWidth, ipt_bin_width) 177 | 178 | start_collect = time.time() 179 | collected = gc.collect() 180 | end_collect = time.time() 181 | print "Time wasted on GC - Classification: %ss, collected %s objects"%(end_collect-start_collect, collected) 182 | 183 | shutil.rmtree('FeatureSets') 184 | shutil.rmtree('FlowData') 185 | shutil.rmtree('SuperFlowData') 186 | shutil.rmtree('TrainingData') 187 | 188 | 189 | 190 | if __name__ == "__main__": 191 | 192 | DATASETS = [ 193 | data_location + "Waledac", 194 | data_location + "Storm", 195 | data_location + "P2PTraffic" 196 | ] 197 | 198 | ### 199 | #The following parameters are now fed by the fullRun.sh shell script 200 | # Please run fullRun.sh instead of this file directly 201 | ### 202 | 203 | #Quantization (packet size) 204 | #BIN_WIDTH = [1, 16, 32, 64, 128, 256] 205 | 206 | #Quantization (IPT in seconds) 207 | #TIMEGAP IS 2000s, FLOWGAP IS 3600s 208 | #IPT_BIN_WIDTH = [0, 1, 10, 60, 300, 900] 209 | 210 | if not os.path.exists("classificationResults"): 211 | os.makedirs("classificationResults") 212 | results_file = open("classificationResults/results.csv","a+") 213 | results_file.write("BinWidth, IPT_BinWidth, Precision_Benign, Recall_Benign, FalsePositiveRate_Benign, Precision_Malicious, Recall_Malicious, FalsePositiveRate_Malicious\n") 214 | results_file.flush() 215 | results_file.close() 216 | 217 | 218 | binWidth = int(sys.argv[1]) 219 | ipt_bin_width = int(sys.argv[2]) 220 | 221 | print "Starting experiment with Bin width %s and IPT Bin Width %s"%(binWidth, ipt_bin_width) 222 | start_time = time.time() 223 | Experiment(DATASETS, binWidth, ipt_bin_width) 224 | end_time = time.time() 225 | time_elapsed_seconds = end_time - start_time 226 | print "Experiment finished in %sh\n"%("{0:.2f}".format(time_elapsed_seconds/60.0/60.0)) 227 | 228 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/SingleWebsiteAnalysis/ParsingUtilities/CSVParseWebsiteUnbalanced.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import sys 4 | from datetime import datetime, timedelta 5 | import collections 6 | import math 7 | from collections import defaultdict, OrderedDict 8 | import numpy as np 9 | 10 | def RoundToNearest(n, m): 11 | if (m == 1): 12 | return n 13 | if (n > 0): 14 | r = n % m 15 | return n + m - r if r + r >= m else n - r 16 | else: 17 | if (n < 0): 18 | return RoundToNearest(abs(n), m) * -1 19 | return 0 20 | 21 | def extractDistributionWithoutTruncation(argv): 22 | BASE_DIR = os.path.dirname(argv[0]) 23 | file = open(argv[0],'r') 24 | 25 | binWidth = int(argv[1]) 26 | websiteToClassify = argv[2] 27 | 28 | if not os.path.exists(BASE_DIR + "/" + websiteToClassify): 29 | os.makedirs(BASE_DIR + "/" + websiteToClassify) 30 | 31 | trainSet = open(BASE_DIR + "/" + websiteToClassify + "/TrainSet_full_" + str(binWidth) + ".csv", 'w') 32 | testSet = open(BASE_DIR + "/" + websiteToClassify + "/TestSet_full_" + str(binWidth) + ".csv", 'w') 33 | 34 | 35 | #Set for all possible quantized buckets 36 | binsUsedByWebsite = set() 37 | minBucket = RoundToNearest(-1500, binWidth) 38 | maxBucket = RoundToNearest(1500, binWidth) + 1 39 | for size in range(minBucket, maxBucket, binWidth): 40 | binsUsedByWebsite.add(RoundToNearest(size, binWidth)) 41 | 42 | 43 | websiteTrainInstances = int(argv[3]) 44 | websiteTestInstances = int(argv[4]) 45 | 46 | ################################################ 47 | #Build csv with quantized 
bins 48 | ################################################ 49 | 50 | # Write CSV datasets header (with bins used by the target website) 51 | for size in range(minBucket, maxBucket, binWidth): 52 | if (size in binsUsedByWebsite): 53 | trainSet.write("packetLengthBin_" + str(size) + ", ") 54 | testSet.write("packetLengthBin_" + str(size) + ", ") 55 | trainSet.write("class\n") 56 | testSet.write("class\n") 57 | 58 | 59 | file = open(argv[0],'r') 60 | l = file.readline() #Take out dataset header 61 | l = file.readline() #Take out dataset header 62 | trainCounter = 0 63 | testCounter = 0 64 | currWebsite = "" 65 | trainData = [] 66 | testData =[] 67 | 68 | for lineNumber, l in enumerate(file.readlines()): 69 | lineSplit = l.rstrip('\n').split(" ") 70 | if (lineNumber % 2 == 1): #Gather website data 71 | website = lineSplit[0][:-1] 72 | if(website != currWebsite): 73 | currWebsite = website 74 | trainCounter = 0 75 | testCounter = 0 76 | 77 | #Build container for sample distribution 78 | website_bin_distribution = OrderedDict() 79 | for i in sorted(binsUsedByWebsite): 80 | website_bin_distribution[i] = 0 81 | 82 | #Add useful bins to the sample distribution 83 | for packet_size in lineSplit[1:-1]: 84 | packet_size_binned = RoundToNearest(int(packet_size), binWidth) 85 | if(packet_size_binned in binsUsedByWebsite): 86 | website_bin_distribution[packet_size_binned] += 1 87 | 88 | 89 | if(trainCounter < websiteTrainInstances): 90 | bin_list = [] 91 | for i in website_bin_distribution: 92 | bin_list.append(str(website_bin_distribution[i])) 93 | trainData.append(",".join(bin_list) + ", " + currWebsite + "\n") 94 | trainCounter += 1 95 | elif(testCounter < websiteTestInstances): 96 | bin_list = [] 97 | for i in website_bin_distribution: 98 | bin_list.append(str(website_bin_distribution[i])) 99 | testData.append(",".join(bin_list) + ", " + currWebsite + "\n") 100 | #Account for processed sample 101 | testCounter += 1 102 | 103 | trainSet.write("".join(trainData)) 104 | testSet.write("".join(testData)) 105 | trainSet.close() 106 | testSet.close() 107 | 108 | 109 | def extractDistributionWithTruncation(argv): 110 | BASE_DIR = os.path.dirname(argv[0]) 111 | file = open(argv[0],'r') 112 | 113 | binWidth = int(argv[1]) 114 | websiteToClassify = argv[2] 115 | 116 | if not os.path.exists(BASE_DIR + "/" + websiteToClassify): 117 | os.makedirs(BASE_DIR + "/" + websiteToClassify) 118 | 119 | trainSet = open(BASE_DIR + "/" + websiteToClassify + "/TrainSet_truncated_" + str(binWidth) + ".csv", 'w') 120 | testSet = open(BASE_DIR + "/" + websiteToClassify + "/TestSet_truncated_" + str(binWidth) + ".csv", 'w') 121 | 122 | 123 | websiteTrainInstances = int(argv[3]) 124 | websiteTestInstances = int(argv[4]) 125 | 126 | trainInstancesCounter = 0 127 | binsUsedByWebsite = set() 128 | minBucket = RoundToNearest(-1500, binWidth) 129 | maxBucket = RoundToNearest(1500, binWidth) + 1 130 | 131 | ################################################ 132 | #Gather list of quantized buckets used by the target website in the training set (1st pass) 133 | ################################################ 134 | 135 | l = file.readline() #Take out dataset header 136 | l = file.readline() #Take out dataset header 137 | for lineNumber, l in enumerate(file.readlines()): 138 | lineSplit = l.rstrip('\n').split(" ") 139 | if (lineNumber % 2 == 1): #Gather website data 140 | website = lineSplit[0][:-1] 141 | if (website == websiteToClassify): 142 | if(trainInstancesCounter < websiteTrainInstances): 143 | for packet_size in lineSplit[1:-1]: 144 | 
binsUsedByWebsite.add(RoundToNearest(int(packet_size), binWidth)) 145 | trainInstancesCounter += 1 146 | else: 147 | break #We've analysed all training websiteToClassify samples 148 | 149 | 150 | #Get to know the amount of buckets used for measuring a given website 151 | print "Total number of buckets: " + str(int(math.floor(3000.0/binWidth))) 152 | print "Number of buckets after truncation: " + str(len(binsUsedByWebsite)) 153 | #Write these stats to a file 154 | file = open("truncationInfo/" + websiteToClassify + ".csv", "a") 155 | file.write("%s, %s\n"%(binWidth, len(binsUsedByWebsite))) 156 | file.close() 157 | 158 | ################################################ 159 | #Build csv with truncated bins (2nd pass) 160 | ################################################ 161 | 162 | # Write CSV datasets header (with bins used by the target website) 163 | for size in range(minBucket, maxBucket, binWidth): 164 | if (size in binsUsedByWebsite): 165 | trainSet.write("packetLengthBin_" + str(size) + ", ") 166 | testSet.write("packetLengthBin_" + str(size) + ", ") 167 | trainSet.write("class\n") 168 | testSet.write("class\n") 169 | 170 | 171 | file = open(argv[0],'r') 172 | l = file.readline() #Take out dataset header 173 | l = file.readline() #Take out dataset header 174 | trainCounter = 0 175 | testCounter = 0 176 | currWebsite = "" 177 | trainData = [] 178 | testData =[] 179 | 180 | for lineNumber, l in enumerate(file.readlines()): 181 | lineSplit = l.rstrip('\n').split(" ") 182 | if (lineNumber % 2 == 1): #Gather website data 183 | website = lineSplit[0][:-1] 184 | if(website != currWebsite): 185 | currWebsite = website 186 | trainCounter = 0 187 | testCounter = 0 188 | 189 | #Build container for sample distribution 190 | website_bin_distribution = OrderedDict() 191 | for i in sorted(binsUsedByWebsite): 192 | website_bin_distribution[i] = 0 193 | 194 | #Add useful bins to the sample distribution 195 | for packet_size in lineSplit[1:-1]: 196 | packet_size_binned = RoundToNearest(int(packet_size), binWidth) 197 | if(packet_size_binned in binsUsedByWebsite): 198 | website_bin_distribution[packet_size_binned] += 1 199 | 200 | 201 | if(trainCounter < websiteTrainInstances): 202 | bin_list = [] 203 | for i in website_bin_distribution: 204 | bin_list.append(str(website_bin_distribution[i])) 205 | trainData.append(",".join(bin_list) + ", " + currWebsite + "\n") 206 | trainCounter += 1 207 | elif(testCounter < websiteTestInstances): 208 | bin_list = [] 209 | for i in website_bin_distribution: 210 | bin_list.append(str(website_bin_distribution[i])) 211 | testData.append(",".join(bin_list) + ", " + currWebsite + "\n") 212 | #Account for processed sample 213 | testCounter += 1 214 | 215 | trainSet.write("".join(trainData)) 216 | testSet.write("".join(testData)) 217 | trainSet.close() 218 | testSet.close() 219 | 220 | if __name__ == "__main__": 221 | if (int(sys.argv[-1]) == 1): 222 | extractDistributionWithoutTruncation(sys.argv[1:-1]) 223 | else: 224 | extractDistributionWithTruncation(sys.argv[1:-1]) -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/FacetAnalysis/online_sketching.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | 4 | 5 | def CreateBinaryVectorRepresentation(BIN_WIDTH, TOPK, SKETCH_SIZE): 6 | 7 | for sketch_size in SKETCH_SIZE: 8 | for binWidth in BIN_WIDTH: 9 | for topk in TOPK: 10 | 11 | """ 12 | Generate random base vectors 13 | """ 14 | 15 
| if(topk != 1500): 16 | real_bucket_number = topk 17 | else: 18 | real_bucket_number = 1500/binWidth 19 | 20 | random_base_vectors = [] 21 | for i in range(0, sketch_size): 22 | random_base_vector = (2*np.random.randint(0,2,size=(real_bucket_number))-1) 23 | random_base_vectors.append(random_base_vector) 24 | 25 | n_bits = range(0, sketch_size) 26 | 27 | """ 28 | Process Phase 1 Data 29 | """ 30 | 31 | feature_set = 'PL_60_' + str(binWidth) + '_' + str(topk) 32 | data_folder = 'FeatureSets/' + feature_set + '/' 33 | 34 | #Regular Traffic 35 | print "Online_Sketch: Phase 1, Regular - " + feature_set + "/Online_regularTraffic_phase1_" + str(sketch_size) + "_dataset.csv" 36 | output = open(data_folder + "Online_regularTraffic_phase1_" + str(sketch_size) + "_dataset.csv", "w") 37 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r') 38 | reader = csv.reader(f, delimiter=',') 39 | 40 | #Process data row 41 | for n, row in enumerate(reader): 42 | if(n == 0): 43 | output.write(",".join(str(x) for x in n_bits) + "\n") 44 | else: 45 | #Gather the packet vector array (v_f) 46 | packet_count_vector = [] 47 | for i in row[:-1]: 48 | packet_count_vector.append(int(i)) 49 | 50 | #Compute the integer array (c_f) 51 | integer_array = [] 52 | for i in range(0, sketch_size): 53 | c_f_i = 0 54 | for j in range(0, real_bucket_number): 55 | #print "Random_base_vector: " + str(random_base_vectors[i]) 56 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j] 57 | integer_array.append(c_f_i) 58 | 59 | #Compute the binary array (s_f) 60 | binary_array = [] 61 | for i in integer_array: 62 | if(i <= 0): 63 | binary_array.append(0) 64 | else: 65 | binary_array.append(1) 66 | 67 | #print "Binary array: " + str(binary_array) 68 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n") 69 | output.close() 70 | 71 | 72 | #Facet Traffic 73 | print "Online_Sketch: Phase 1, Facet - " + feature_set + "/Online_facetTraffic_phase1_" + str(sketch_size) + "_dataset.csv" 74 | output = open(data_folder + "Online_facetTraffic_phase1_" + str(sketch_size) + "_dataset.csv", "w") 75 | f = open(data_folder + "FacetTraffic_50_phase1_dataset.csv", 'r') 76 | reader = csv.reader(f, delimiter=',') 77 | 78 | #Process data row 79 | for n, row in enumerate(reader): 80 | if(n == 0): 81 | output.write(",".join(str(x) for x in n_bits) + "\n") 82 | else: 83 | #Gather the packet vector array (v_f) 84 | packet_count_vector = [] 85 | for i in row[:-1]: 86 | packet_count_vector.append(int(i)) 87 | 88 | #Compute the integer array (c_f) 89 | integer_array = [] 90 | for i in range(0, sketch_size): 91 | c_f_i = 0 92 | for j in range(0, real_bucket_number): 93 | #print "Random_base_vector: " + str(random_base_vectors[i]) 94 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j] 95 | integer_array.append(c_f_i) 96 | 97 | #Compute the binary array (s_f) 98 | binary_array = [] 99 | for i in integer_array: 100 | if(i <= 0): 101 | binary_array.append(0) 102 | else: 103 | binary_array.append(1) 104 | 105 | #print "Binary array: " + str(binary_array) 106 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n") 107 | output.close() 108 | 109 | ######################################################################################## 110 | ######################################################################################## 111 | ######################################################################################## 112 | 113 | 114 | """ 115 | Process Phase 2 Data 116 | """ 117 | 118 | 
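# Phase 2 reuses the random +/-1 base vectors drawn above, so the flows used to build
# the model (phase 1) and the flows classified later (phase 2) are mapped into the
# same projected space. For each flow, the loops below compute the binary sketch
# sign(R · v) from its per-bucket packet-count vector v. A minimal numpy equivalent
# (illustrative only; R and v are stand-ins for the random_base_vectors matrix and
# one data row):
#
#   R = 2 * np.random.randint(0, 2, size=(sketch_size, real_bucket_number)) - 1
#   s = (R.dot(v) > 0).astype(int)   # 1 where the projection is positive, else 0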
feature_set = 'PL_60_' + str(binWidth) + '_' + str(topk) 119 | data_folder = 'FeatureSets/' + feature_set + '/' 120 | 121 | #Regular Traffic 122 | print "Online_Sketch: Phase 2, Regular - " + feature_set + "/Online_regularTraffic_phase2_" + str(sketch_size) + "_dataset.csv" 123 | output = open(data_folder + "Online_regularTraffic_phase2_" + str(sketch_size) + "_dataset.csv", "w") 124 | f = open(data_folder + "RegularTraffic_phase2_dataset.csv", 'r') 125 | reader = csv.reader(f, delimiter=',') 126 | 127 | #Process data row 128 | for n, row in enumerate(reader): 129 | if(n == 0): 130 | output.write(",".join(str(x) for x in n_bits) + "\n") 131 | else: 132 | #Gather the packet vector array (v_f) 133 | packet_count_vector = [] 134 | for i in row[:-1]: 135 | packet_count_vector.append(int(i)) 136 | 137 | #Compute the integer array (c_f) 138 | integer_array = [] 139 | for i in range(0, sketch_size): 140 | c_f_i = 0 141 | for j in range(0, real_bucket_number): 142 | #print "Random_base_vector: " + str(random_base_vectors[i]) 143 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j] 144 | integer_array.append(c_f_i) 145 | 146 | #Compute the binary array (s_f) 147 | binary_array = [] 148 | for i in integer_array: 149 | if(i <= 0): 150 | binary_array.append(0) 151 | else: 152 | binary_array.append(1) 153 | 154 | #print "Binary array: " + str(binary_array) 155 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n") 156 | output.close() 157 | 158 | 159 | #Facet Traffic 160 | print "Online_Sketch: Phase 2, Facet - " + feature_set + "/Online_facetTraffic_phase2_" + str(sketch_size) + "_dataset.csv" 161 | output = open(data_folder + "Online_facetTraffic_phase2_" + str(sketch_size) + "_dataset.csv", "w") 162 | f = open(data_folder + "FacetTraffic_50_phase2_dataset.csv", 'r') 163 | reader = csv.reader(f, delimiter=',') 164 | 165 | #Process data row 166 | for n, row in enumerate(reader): 167 | if(n == 0): 168 | output.write(",".join(str(x) for x in n_bits) + "\n") 169 | else: 170 | #Gather the packet vector array (v_f) 171 | packet_count_vector = [] 172 | for i in row[:-1]: 173 | packet_count_vector.append(int(i)) 174 | 175 | #Compute the integer array (c_f) 176 | integer_array = [] 177 | for i in range(0, sketch_size): 178 | c_f_i = 0 179 | for j in range(0, real_bucket_number): 180 | #print "Random_base_vector: " + str(random_base_vectors[i]) 181 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j] 182 | integer_array.append(c_f_i) 183 | 184 | #Compute the binary array (s_f) 185 | binary_array = [] 186 | for i in integer_array: 187 | if(i <= 0): 188 | binary_array.append(0) 189 | else: 190 | binary_array.append(1) 191 | 192 | #print "Binary array: " + str(binary_array) 193 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n") 194 | output.close() 195 | 196 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/DeltaShaperAnalysis/online_sketching.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | 4 | 5 | def CreateBinaryVectorRepresentation(BIN_WIDTH, TOPK, SKETCH_SIZE): 6 | 7 | for sketch_size in SKETCH_SIZE: 8 | for binWidth in BIN_WIDTH: 9 | for topk in TOPK: 10 | 11 | """ 12 | Generate random base vectors 13 | """ 14 | 15 | if(topk != 1500): 16 | real_bucket_number = topk 17 | else: 18 | real_bucket_number = 1500/binWidth 19 | 20 | random_base_vectors = [] 21 | for i in range(0, sketch_size): 22 | 
random_base_vector = (2*np.random.randint(0,2,size=(real_bucket_number))-1) 23 | random_base_vectors.append(random_base_vector) 24 | 25 | n_bits = range(0, sketch_size) 26 | 27 | """ 28 | Process Phase 1 Data 29 | """ 30 | 31 | feature_set = 'PL_60_' + str(binWidth) + '_' + str(topk) 32 | data_folder = 'FeatureSets/' + feature_set + '/' 33 | 34 | #Regular Traffic 35 | print "Online_Sketch: Phase 1, Regular - " + feature_set + "/Online_regularTraffic_phase1_" + str(sketch_size) + "_dataset.csv" 36 | output = open(data_folder + "Online_regularTraffic_phase1_" + str(sketch_size) + "_dataset.csv", "w") 37 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r') 38 | reader = csv.reader(f, delimiter=',') 39 | 40 | #Process data row 41 | for n, row in enumerate(reader): 42 | if(n == 0): 43 | output.write(",".join(str(x) for x in n_bits) + "," + row[-1] + "\n") 44 | else: 45 | #Gather the packet vector array (v_f) 46 | packet_count_vector = [] 47 | for i in row[:-1]: 48 | packet_count_vector.append(int(i)) 49 | 50 | #Compute the integer array (c_f) 51 | integer_array = [] 52 | for i in range(0, sketch_size): 53 | c_f_i = 0 54 | for j in range(0, real_bucket_number): 55 | #print "Random_base_vector: " + str(random_base_vectors[i]) 56 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j] 57 | integer_array.append(c_f_i) 58 | 59 | #Compute the binary array (s_f) 60 | binary_array = [] 61 | for i in integer_array: 62 | if(i <= 0): 63 | binary_array.append(0) 64 | else: 65 | binary_array.append(1) 66 | 67 | #print "Binary array: " + str(binary_array) 68 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n") 69 | output.close() 70 | 71 | 72 | #DeltaShaper Traffic 73 | print "Online_Sketch: Phase 1, DeltaShaper - " + feature_set + "/Online_deltashaperTraffic_phase1_" + str(sketch_size) + "_dataset.csv" 74 | output = open(data_folder + "Online_deltashaperTraffic_phase1_" + str(sketch_size) + "_dataset.csv", "w") 75 | f = open(data_folder + "DeltaShaperTraffic_320_phase1_dataset.csv", 'r') 76 | reader = csv.reader(f, delimiter=',') 77 | 78 | #Process data row 79 | for n, row in enumerate(reader): 80 | if(n == 0): 81 | output.write(",".join(str(x) for x in n_bits) + "," + row[-1] + "\n") 82 | else: 83 | #Gather the packet vector array (v_f) 84 | packet_count_vector = [] 85 | for i in row[:-1]: 86 | packet_count_vector.append(int(i)) 87 | 88 | #Compute the integer array (c_f) 89 | integer_array = [] 90 | for i in range(0, sketch_size): 91 | c_f_i = 0 92 | for j in range(0, real_bucket_number): 93 | #print "Random_base_vector: " + str(random_base_vectors[i]) 94 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j] 95 | integer_array.append(c_f_i) 96 | 97 | #Compute the binary array (s_f) 98 | binary_array = [] 99 | for i in integer_array: 100 | if(i <= 0): 101 | binary_array.append(0) 102 | else: 103 | binary_array.append(1) 104 | 105 | #print "Binary array: " + str(binary_array) 106 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n") 107 | output.close() 108 | 109 | ######################################################################################## 110 | ######################################################################################## 111 | ######################################################################################## 112 | 113 | 114 | """ 115 | Process Phase 2 Data 116 | """ 117 | 118 | feature_set = 'PL_60_' + str(binWidth) + '_' + str(topk) 119 | data_folder = 'FeatureSets/' + feature_set + '/' 120 | 121 | 
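# As in phase 1, each flow's bucket-count vector is reduced to sketch_size sign bits
# using the base vectors generated at the top of this configuration loop; reusing the
# same vectors across phases keeps the phase 2 signatures comparable with those the
# model was built on.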
#Regular Traffic 122 | print "Online_Sketch: Phase 2, Regular - " + feature_set + "/Online_regularTraffic_phase2_" + str(sketch_size) + "_dataset.csv" 123 | output = open(data_folder + "Online_regularTraffic_phase2_" + str(sketch_size) + "_dataset.csv", "w") 124 | f = open(data_folder + "RegularTraffic_phase2_dataset.csv", 'r') 125 | reader = csv.reader(f, delimiter=',') 126 | 127 | #Process data row 128 | for n, row in enumerate(reader): 129 | if(n == 0): 130 | output.write(",".join(str(x) for x in n_bits) + "," + row[-1] + "\n") 131 | else: 132 | #Gather the packet vector array (v_f) 133 | packet_count_vector = [] 134 | for i in row[:-1]: 135 | packet_count_vector.append(int(i)) 136 | 137 | #Compute the integer array (c_f) 138 | integer_array = [] 139 | for i in range(0, sketch_size): 140 | c_f_i = 0 141 | for j in range(0, real_bucket_number): 142 | #print "Random_base_vector: " + str(random_base_vectors[i]) 143 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j] 144 | integer_array.append(c_f_i) 145 | 146 | #Compute the binary array (s_f) 147 | binary_array = [] 148 | for i in integer_array: 149 | if(i <= 0): 150 | binary_array.append(0) 151 | else: 152 | binary_array.append(1) 153 | 154 | #print "Binary array: " + str(binary_array) 155 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n") 156 | output.close() 157 | 158 | 159 | #DeltaShaper Traffic 160 | print "Online_Sketch: Phase 2, DeltaShaper - " + feature_set + "/Online_deltashaperTraffic_phase2_" + str(sketch_size) + "_dataset.csv" 161 | output = open(data_folder + "Online_deltashaperTraffic_phase2_" + str(sketch_size) + "_dataset.csv", "w") 162 | f = open(data_folder + "DeltaShaperTraffic_320_phase2_dataset.csv", 'r') 163 | reader = csv.reader(f, delimiter=',') 164 | 165 | #Process data row 166 | for n, row in enumerate(reader): 167 | if(n == 0): 168 | output.write(",".join(str(x) for x in n_bits) + "," + row[-1] + "\n") 169 | else: 170 | #Gather the packet vector array (v_f) 171 | packet_count_vector = [] 172 | for i in row[:-1]: 173 | packet_count_vector.append(int(i)) 174 | 175 | #Compute the integer array (c_f) 176 | integer_array = [] 177 | for i in range(0, sketch_size): 178 | c_f_i = 0 179 | for j in range(0, real_bucket_number): 180 | #print "Random_base_vector: " + str(random_base_vectors[i]) 181 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j] 182 | integer_array.append(c_f_i) 183 | 184 | #Compute the binary array (s_f) 185 | binary_array = [] 186 | for i in integer_array: 187 | if(i <= 0): 188 | binary_array.append(0) 189 | else: 190 | binary_array.append(1) 191 | 192 | #print "Binary array: " + str(binary_array) 193 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n") 194 | output.close() 195 | 196 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/WFAnalysis/SingleWebsiteAnalysis/generateFigures.py: -------------------------------------------------------------------------------- 1 | import os 2 | from decimal import Decimal 3 | import numpy as np 4 | import csv 5 | 6 | import matplotlib 7 | if os.environ.get('DISPLAY','') == '': 8 | print('no display found. 
Using non-interactive Agg backend') 9 | matplotlib.use('Agg') 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | colors = ["0.8", "0.6", "0.2", "0.0"] 14 | colors = ["salmon", "lightsteelblue", "darkseagreen", "thistle", "wheat", "khaki", "skyblue"] 15 | 16 | """ 17 | Attach a text label above each bar displaying its height 18 | """ 19 | def autolabel(rects, ax): 20 | for rect in rects: 21 | height = rect.get_height() 22 | ax.text(rect.get_x() + rect.get_width()/2., 1.005*height, # original height was 1.005*height 23 | "{0:.2f}".format(float(height)), fontsize=7, ha='center', va='bottom') 24 | 25 | 26 | def PlotSingleWebsiteStats(): 27 | 28 | for profile in os.listdir("classificationResults/"): 29 | if(".DS_Store" in profile): 30 | continue 31 | 32 | profile_data = open("classificationResults/" + profile, 'rb') 33 | csv_reader = csv.reader(profile_data, delimiter=',') 34 | 35 | binWidth = [] 36 | acc = [] 37 | fpr = [] 38 | fnr = [] 39 | 40 | for n, row in enumerate(csv_reader): 41 | if(n == 0): 42 | continue 43 | binWidth.append(row[0]) 44 | acc.append(float(row[1])) 45 | fpr.append(float(row[2])) 46 | fnr.append(float(row[3])) 47 | 48 | 49 | fig = plt.figure() 50 | ax1 = fig.add_subplot(111) 51 | 52 | print "Current feature set: "+ str(binWidth) 53 | 54 | 55 | ind = np.arange(len(binWidth)) # the x locations for the groups 56 | width = 0.20 57 | 58 | rects0 = ax1.bar(ind - width, acc, width, colors[0], label='Acc') 59 | rects1 = ax1.bar(ind, fpr, width, colors[1], label='FPR') 60 | rects2 = ax1.bar(ind + width, fnr, width, colors[2], label='FNR') 61 | 62 | 63 | ax1.yaxis.grid(color='black', linestyle='dotted') 64 | ax1.set_title('Scores for Quantization') 65 | ax1.set_yscale("log") 66 | ax1.set_xticks(ind) 67 | labels = binWidth 68 | ax1.set_xticklabels(labels) 69 | ax1.legend() 70 | 71 | 72 | plt.tight_layout() 73 | #plt.ylim(0, 1) 74 | 75 | fig.savefig('WF_%s.pdf'%(profile[:-4])) # save the figure to file 76 | fig.savefig('WF_%s.png'%(profile[:-4])) # save the figure to file 77 | plt.close(fig) 78 | profile_data.close() 79 | 80 | 81 | def PlotNormalFPRComparison(): 82 | websites = set() 83 | 84 | #Compute the set of websites to compare 85 | for profile in os.listdir("classificationResults/"): 86 | if(".DS_Store" in profile): 87 | continue 88 | website = profile.split("_")[2] 89 | website = website[:-4] 90 | websites.add(website) 91 | 92 | 93 | for website in websites: 94 | if not os.path.exists("Figures/%s"%(website)): 95 | os.makedirs("Figures/%s"%(website)) 96 | 97 | #Gather results for full distribution 98 | profile_data_full = open("classificationResults/SingleWebsite_full_" + website + ".csv", 'rb') 99 | csv_reader_full = csv.reader(profile_data_full, delimiter=',') 100 | 101 | binWidth_full = [] 102 | acc_full = [] 103 | fpr_full = [] 104 | fnr_full = [] 105 | 106 | for n, row in enumerate(csv_reader_full): 107 | if(n == 0): 108 | continue 109 | binWidth_full.append(row[0]) 110 | acc_full.append(round(Decimal(float(row[1])), 4)) 111 | fpr_full.append(round(Decimal(float(row[2])), 9)) 112 | fnr_full.append(round(Decimal(float(row[3])), 4)) 113 | 114 | 115 | #Gather results for truncated distribution 116 | profile_data_truncated = open("classificationResults/SingleWebsite_truncated_" + website + ".csv", 'rb') 117 | csv_reader_truncated = csv.reader(profile_data_truncated, delimiter=',') 118 | 119 | binWidth_truncated = [] 120 | acc_truncated = [] 121 | fpr_truncated = [] 122 | fnr_truncated = [] 123 | 124 | for n, row in enumerate(csv_reader_truncated): 125 | if(n == 0): 126 | 
continue 127 | binWidth_truncated.append(row[0]) 128 | acc_truncated.append(round(Decimal(float(row[1])), 4)) 129 | fpr_truncated.append(round(Decimal(float(row[2])), 9)) 130 | fnr_truncated.append(round(Decimal(float(row[3])), 4)) 131 | 132 | #Gather number of bins used in the truncation 133 | truncated_info_file = open("truncationInfo/" + website + ".csv", 'r') 134 | truncation_info = csv.reader(truncated_info_file, delimiter=',') 135 | truncated_bins = [] 136 | 137 | for n, row in enumerate(truncation_info): 138 | if(n == 0): 139 | continue 140 | truncated_bins.append(row[1]) 141 | 142 | #Generate plot 143 | fig = plt.figure() 144 | ax1 = fig.add_subplot(111) 145 | 146 | print "Current feature set: "+ str(binWidth_full) 147 | print "FPR-Full: " + str(fpr_full) 148 | print "FPR-Truncated: " + str(fpr_truncated) 149 | 150 | ind = np.arange(len(binWidth_full)) # the x locations for the groups 151 | width = 0.40 152 | 153 | rects1 = ax1.bar(ind - width, fpr_full, width, color=colors[0], label='FPR-Full') 154 | #autolabel(rects1,ax1) 155 | rects2 = ax1.bar(ind, fpr_truncated, width, color=colors[1], label='FPR-Truncated') 156 | #autolabel(rects2,ax1) 157 | 158 | 159 | ax1.yaxis.grid(color='black', linestyle='dotted') 160 | ax1.set_title('Truncation effect on FPR - %s'%(website), fontsize = 10) 161 | 162 | ax1.set_xticks(ind) 163 | labels = ["K = " + x + "\nBins = " + str(truncated_bins[n]) for n, x in enumerate(binWidth_full)] 164 | ax1.set_xticklabels(labels) 165 | ax1.legend() 166 | 167 | plt.xticks(fontsize=7) 168 | plt.tight_layout() 169 | #plt.ylim(0, 1) 170 | fig.savefig('Figures/%s/WF_FPR_normal_%s.pdf'%(website, website)) # save the figure to file 171 | fig.savefig('Figures/%s/WF_FPR_normal_%s.png'%(website, website)) # save the figure to file 172 | plt.close(fig) 173 | profile_data_full.close() 174 | profile_data_truncated.close() 175 | 176 | 177 | def PlotNormalFNRComparison(): 178 | websites = set() 179 | 180 | #Compute the set of websites to compare 181 | for profile in os.listdir("classificationResults/"): 182 | if(".DS_Store" in profile): 183 | continue 184 | website = profile.split("_")[2] 185 | website = website[:-4] 186 | websites.add(website) 187 | 188 | 189 | for website in websites: 190 | if not os.path.exists("Figures/%s"%(website)): 191 | os.makedirs("Figures/%s"%(website)) 192 | 193 | #Gather results for full distribution 194 | profile_data_full = open("classificationResults/SingleWebsite_full_" + website + ".csv", 'rb') 195 | csv_reader_full = csv.reader(profile_data_full, delimiter=',') 196 | 197 | binWidth_full = [] 198 | acc_full = [] 199 | fpr_full = [] 200 | fnr_full = [] 201 | 202 | for n, row in enumerate(csv_reader_full): 203 | if(n == 0): 204 | continue 205 | binWidth_full.append(row[0]) 206 | acc_full.append(round(Decimal(float(row[1])), 4)) 207 | fpr_full.append(round(Decimal(float(row[2])), 4)) 208 | fnr_full.append(round(Decimal(float(row[3])), 4)) 209 | 210 | 211 | #Gather results for truncated distribution 212 | profile_data_truncated = open("classificationResults/SingleWebsite_truncated_" + website + ".csv", 'rb') 213 | csv_reader_truncated = csv.reader(profile_data_truncated, delimiter=',') 214 | 215 | binWidth_truncated = [] 216 | acc_truncated = [] 217 | fpr_truncated = [] 218 | fnr_truncated = [] 219 | 220 | for n, row in enumerate(csv_reader_truncated): 221 | if(n == 0): 222 | continue 223 | binWidth_truncated.append(row[0]) 224 | acc_truncated.append(round(Decimal(float(row[1])), 4)) 225 | fpr_truncated.append(round(Decimal(float(row[2])), 4)) 
226 | fnr_truncated.append(round(Decimal(float(row[3])), 4)) 227 | 228 | 229 | #Gather number of bins used in the truncation 230 | truncated_info_file = open("truncationInfo/" + website + ".csv", 'r') 231 | truncation_info = csv.reader(truncated_info_file, delimiter=',') 232 | truncated_bins = [] 233 | 234 | for n, row in enumerate(truncation_info): 235 | if(n == 0): 236 | continue 237 | truncated_bins.append(row[1]) 238 | 239 | 240 | #Generate plot 241 | fig = plt.figure() 242 | ax1 = fig.add_subplot(111) 243 | 244 | print "Current feature set: "+ str(binWidth_full) 245 | print "FNR-Full: " + str(fnr_full) 246 | print "FNR-Truncated: " + str(fnr_truncated) 247 | 248 | ind = np.arange(len(binWidth_full)) # the x locations for the groups 249 | width = 0.40 250 | 251 | rects1 = ax1.bar(ind - width, fnr_full, width, color=colors[0], label='FNR-Full') 252 | autolabel(rects1,ax1) 253 | rects2 = ax1.bar(ind, fnr_truncated, width, color=colors[1], label='FNR-Truncated') 254 | autolabel(rects2,ax1) 255 | 256 | 257 | ax1.yaxis.grid(color='black', linestyle='dotted') 258 | ax1.set_title('Truncation effect on FNR - %s'%(website), fontsize = 10) 259 | 260 | ax1.set_xticks(ind) 261 | labels = ["K = " + x + "\nBins = " + str(truncated_bins[n]) for n, x in enumerate(binWidth_full)] 262 | ax1.set_xticklabels(labels) 263 | ax1.legend() 264 | 265 | plt.xticks(fontsize=7) 266 | plt.tight_layout() 267 | plt.ylim(0, 1) 268 | fig.savefig('Figures/%s/WF_FNR_normal_%s.pdf'%(website, website)) # save the figure to file 269 | fig.savefig('Figures/%s/WF_FNR_normal_%s.png'%(website, website)) # save the figure to file 270 | plt.close(fig) 271 | profile_data_full.close() 272 | profile_data_truncated.close() 273 | 274 | 275 | 276 | def GenerateFigures(): 277 | if not os.path.exists("Figures"): 278 | os.makedirs("Figures") 279 | 280 | PlotNormalFNRComparison() 281 | PlotNormalFPRComparison() -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/FacetAnalysis/generateFigures.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import math 4 | 5 | import matplotlib 6 | if os.environ.get('DISPLAY','') == '': 7 | print('no display found. 
Using non-interactive Agg backend') 8 | matplotlib.use('Agg') 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | colors = ["0.8", "0.6", "0.2", "0.0"] 13 | colors = ["salmon", "lightsteelblue", "darkseagreen", "thistle", "wheat", "khaki", "skyblue"] 14 | 15 | """ 16 | Attach a text label above each bar displaying its height 17 | """ 18 | def autolabel(rects, ax): 19 | for rect in rects: 20 | height = rect.get_height() 21 | ax.text(rect.get_x() + rect.get_width()/2., 1.005*height, # original height was 1.005*height 22 | "{0:.2f}".format(float(height)), fontsize=7, ha='center', va='bottom') 23 | 24 | 25 | def PlotQuantization(binWidths, n_flows): 26 | print "PlotQuantization" 27 | feature_sets = [] 28 | set_acc = [] 29 | set_fpr =[] 30 | set_fnr = [] 31 | 32 | for binWidth in binWidths: 33 | 34 | feature_folder = 'PL_60_' + str(binWidth) + '_1500' 35 | #print feature_folder 36 | 37 | #Load configuration results 38 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy" 39 | results = np.load(data_folder) 40 | set_acc.append(results[0]) 41 | set_fpr.append(results[1]) 42 | set_fnr.append(results[2]) 43 | feature_sets.append(feature_folder) 44 | 45 | 46 | max_acc = 0 47 | max_fset = "" 48 | for i, f_set in enumerate(feature_sets): 49 | if set_acc[i] > max_acc: 50 | max_acc = set_acc[i] 51 | max_fset = f_set 52 | print "Max acc: %s, Best quantization set: %s"%(max_acc, max_fset) 53 | 54 | fig = plt.figure(figsize=(10,4)) 55 | ax1 = fig.add_subplot(111) 56 | 57 | curr_fset = feature_sets 58 | curr_acc = set_acc 59 | curr_fpr = set_fpr 60 | curr_fnr = set_fnr 61 | #print "Current feature set: "+ str(curr_fset) 62 | 63 | ind = np.arange(len(curr_fset)) # the x locations for the groups 64 | width = 0.20 65 | 66 | rects0 = ax1.bar(ind - width - width/2, curr_acc, width, color=colors[0], label='Acc') 67 | autolabel(rects0,ax1) 68 | rects1 = ax1.bar(ind - width/2 , curr_fpr, width, color=colors[1], label='FPR') 69 | autolabel(rects1,ax1) 70 | rects2 = ax1.bar(ind + width - width/2, curr_fnr, width, color=colors[2], label='FNR') 71 | autolabel(rects2,ax1) 72 | 73 | 74 | ax1.yaxis.grid(color='black', linestyle='dotted') 75 | ax1.set_title('Scores for Quantization') 76 | ax1.set_xticks(ind) 77 | labels = ["K = " + str(int(x.split('_')[2])) + " -> " + str(1500/int(x.split('_')[2])) + " bins" + "\n(PerFlow = " + str(int(1500/int(x.split('_')[2]))*4) + " B)" + "\n(CGMem = " + str((n_flows * int(1500/int(x.split('_')[2]))*4)/1024) + " KB)" for x in feature_sets] 78 | ax1.set_xticklabels(labels) 79 | plt.xticks(fontsize=7) 80 | ax1.legend() 81 | 82 | plt.ylim(top=1) 83 | plt.legend(loc='upper right', fontsize=8) 84 | plt.tight_layout() 85 | fig.savefig('Figures/Facet_bin_NoSketch.pdf') # save the figure to file 86 | fig.savefig('Figures/Facet_bin_NoSketch.png') # save the figure to file 87 | plt.close(fig) 88 | 89 | 90 | def PlotQuantizationLines(binWidths, n_flows): 91 | print "PlotQuantizationLines" 92 | feature_sets = [] 93 | set_acc = [] 94 | 95 | for binWidth in binWidths: 96 | 97 | feature_folder = 'PL_60_' + str(binWidth) + '_1500' 98 | #print feature_folder 99 | 100 | #Load configuration results 101 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy" 102 | results = np.load(data_folder) 103 | set_acc.append(results[3]) 104 | feature_sets.append(feature_folder) 105 | 106 | 107 | 108 | fig = plt.figure(figsize=(10,4)) 109 | ax1 = fig.add_subplot(111) 110 | 111 | curr_fset = feature_sets 112 | 
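# Note: despite the variable name, set_acc/curr_acc hold results[3], the AUC score
# plotted as a line below; indices 0-2 (accuracy, FPR, FNR) are the bar values used
# by PlotQuantization above.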
curr_acc = set_acc 113 | 114 | ind = np.arange(len(curr_fset)) # the x locations for the groups 115 | print curr_acc 116 | ax1.plot(curr_acc, color=colors[0], marker=".", markersize=12, lw=3, label='AUC') 117 | ax1.hlines(0.99, 0, len(ind)-1, lw=2, label='Baseline, AUC = 0.99') 118 | 119 | for i,j in zip(ind,curr_acc): 120 | ax1.annotate("{0:.2f}".format(j),xy=(i-0.15,j-0.08)) 121 | 122 | 123 | ax1.yaxis.grid(color='black', linestyle='dotted') 124 | plt.yticks(fontsize=14) 125 | plt.ylim(bottom=0,top=1) 126 | plt.ylabel("AUC Score", fontsize=14) 127 | 128 | 129 | plt.xlim(-0.3, len(ind)-1+0.3) 130 | ax1.set_xticks(ind) 131 | labels = [str(int(x.split('_')[2])) for x in feature_sets] 132 | #labels = ["K = " + str(int(x.split('_')[2])) + " -> " + str(1500/int(x.split('_')[2])) + " bins" + "\n(PerFlow = " + str(int(1500/int(x.split('_')[2]))*4) + " B)" + "\n(CGMem = " + str((n_flows * int(1500/int(x.split('_')[2]))*4)/1024) + " KB)" for x in feature_sets] 133 | #labels = ["K = " + str(int(x.split('_')[2])) + "\nPF = " + str(int(1500/int(x.split('_')[2]))*4) + " B" + "\nTM = " + str((n_flows * int(1500/int(x.split('_')[2]))*4)/1024) + " KB" for x in feature_sets] 134 | ax1.set_xticklabels(labels) 135 | plt.xticks(fontsize=11) 136 | plt.xlabel("Quantization Factor", fontsize=14) 137 | ax1.legend() 138 | 139 | 140 | plt.legend(loc='lower right', fontsize=12) 141 | plt.tight_layout() 142 | fig.savefig('Figures/Facet_bin_NoSketch_Lines.pdf') # save the figure to file 143 | fig.savefig('Figures/Facet_bin_NoSketch_Lines.png') # save the figure to file 144 | plt.close(fig) 145 | 146 | 147 | def PlotKQuantizationAndTruncation(binWidths, topk_features, n_flows): 148 | print "PlotKQuantizationAndTruncation" 149 | if not os.path.exists('Figures/Truncation_comparison'): 150 | os.makedirs('Figures/Truncation_comparison') 151 | 152 | for binWidth in binWidths: 153 | feature_sets = [] 154 | set_acc = [] 155 | set_fpr =[] 156 | set_fnr = [] 157 | 158 | for topk in topk_features: 159 | 160 | feature_folder = 'PL_60_' + str(binWidth) + '_' + str(topk) 161 | #print feature_folder 162 | 163 | if(topk != 1500 and topk > 1500/binWidth): 164 | #print "Skipping sample, invalid configuration. 
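# ---------------------------------------------------------------------------
# Note (ours): a truncated feature set is only meaningful when the requested
# top-k does not exceed the number of bins that survive quantization, i.e.
# topk <= 1500 / binWidth (the special value 1500 means "no truncation").
# For example, binWidth = 64 leaves only 1500 / 64 = 23 bins, so the
# top-30/40/50 configurations are recorded as zeros and skipped, exactly as
# the check above does.
# ---------------------------------------------------------------------------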
TopK = " + str(topk) + " Total Features = " + str(1500/binWidth) 165 | set_acc.append(0) 166 | set_fpr.append(0) 167 | set_fnr.append(0) 168 | feature_sets.append(feature_folder) 169 | continue 170 | 171 | #Load configuration results 172 | #if(topk == 1500): 173 | # data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase1_NoSketch.npy" 174 | #else: 175 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy" 176 | results = np.load(data_folder) 177 | set_acc.append(results[0]) 178 | set_fpr.append(results[1]) 179 | set_fnr.append(results[2]) 180 | feature_sets.append(feature_folder) 181 | 182 | 183 | #Check best truncation value 184 | max_acc = 0 185 | max_fset = "" 186 | for i, f_set in enumerate(feature_sets[:-1]): 187 | if set_acc[i] > max_acc: 188 | max_acc = set_acc[i] 189 | max_fset = f_set 190 | print "K = " + str(binWidth) + ", Max acc: %s, Best Truncation: %s"%(max_acc, max_fset) 191 | 192 | 193 | #Plot figures 194 | fig = plt.figure(figsize=(10,4)) 195 | ax1 = fig.add_subplot(111) 196 | 197 | curr_fset = feature_sets 198 | curr_acc = set_acc 199 | curr_fpr = set_fpr 200 | curr_fnr = set_fnr 201 | #print "Current feature set: "+ str(curr_fset) 202 | 203 | ind = np.arange(len(curr_fset)) # the x locations for the groups 204 | width = 0.20 205 | 206 | rects0 = ax1.bar(ind - width - width/2, curr_acc, width, color=colors[0], label='Acc') 207 | autolabel(rects0,ax1) 208 | rects1 = ax1.bar(ind - width/2 , curr_fpr, width, color=colors[1], label='FPR') 209 | autolabel(rects1,ax1) 210 | rects2 = ax1.bar(ind + width - width/2, curr_fnr, width, color=colors[2], label='FNR') 211 | autolabel(rects2,ax1) 212 | 213 | ax1.yaxis.grid(color='black', linestyle='dotted') 214 | ax1.set_title('Truncation Scores for K ='+str(binWidth)) 215 | ax1.set_xticks(ind) 216 | print feature_sets 217 | labels = ["Top-k= " + str(int(x.split('_')[3])) + "\n(PerFlow = " + str(int(x.split('_')[3])*4) + " B)" + "\n(CGMem = " + str((n_flows * int(x.split('_')[3]) * 4)/1024) + " KB)" for x in feature_sets] 218 | labels[len(topk_features)-1] = str(int(1500/binWidth)) + " features\n(PerFlow = " + str(int(1500/binWidth)*4) + " B)" + "\n(CGMem = " + str(int((n_flows * int(1500/binWidth) * 4)/1024)) + " KB)" 219 | ax1.set_xticklabels(labels) 220 | plt.xticks(fontsize=9) 221 | ax1.legend() 222 | 223 | plt.ylim(top=1) 224 | plt.legend(loc='upper right', fontsize=10) 225 | plt.tight_layout() 226 | fig.savefig('Figures/Truncation_comparison/Facet_bin' + str(binWidth) + '_topk_NoSketch.pdf') # save the figure to file 227 | fig.savefig('Figures/Truncation_comparison/Facet_bin' + str(binWidth) + '_topk_NoSketch.png') # save the figure to file 228 | plt.close(fig) 229 | 230 | def PlotKQuantizationAndTruncationLines(binWidths, topk_features, n_flows): 231 | print "PlotKQuantizationAndTruncation" 232 | if not os.path.exists('Figures/Truncation_comparison'): 233 | os.makedirs('Figures/Truncation_comparison') 234 | 235 | for binWidth in binWidths: 236 | feature_sets = [] 237 | set_acc = [] 238 | 239 | for topk in topk_features: 240 | 241 | feature_folder = 'PL_60_' + str(binWidth) + '_' + str(topk) 242 | #print feature_folder 243 | 244 | if(topk != 1500 and topk > 1500/binWidth): 245 | #print "Skipping sample, invalid configuration. 
TopK = " + str(topk) + " Total Features = " + str(1500/binWidth) 246 | set_acc.append(0) 247 | feature_sets.append(feature_folder) 248 | continue 249 | 250 | #Load configuration results 251 | #if(topk == 1500): 252 | # data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase1_NoSketch.npy" 253 | #else: 254 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy" 255 | results = np.load(data_folder) 256 | set_acc.append(results[3]) 257 | feature_sets.append(feature_folder) 258 | 259 | 260 | #Plot figures 261 | fig = plt.figure(figsize=(10,4)) 262 | ax1 = fig.add_subplot(111) 263 | 264 | curr_fset = feature_sets 265 | curr_acc = set_acc 266 | 267 | #print "Current feature set: "+ str(curr_fset) 268 | 269 | ind = np.arange(len(curr_fset)) # the x locations for the groups 270 | 271 | 272 | ax1.plot(curr_acc, color=colors[0], marker=".", markersize=12, lw=3, label='AUC') 273 | ax1.hlines(0.99, 0, len(ind)-1, lw=3, label='Baseline, AUC = 0.99') 274 | 275 | for i,j in zip(ind,curr_acc): 276 | ax1.annotate("{0:.2f}".format(j),xy=(i-0.1,j-0.08)) 277 | 278 | plt.xlim(-0.3, len(ind)-1+0.3) 279 | ax1.yaxis.grid(color='black', linestyle='dotted') 280 | 281 | ax1.set_xticks(ind) 282 | print feature_sets 283 | labels = [str(int(x.split('_')[3])) for x in feature_sets] 284 | #labels = ["Top-n= " + str(int(x.split('_')[3])) + "\nPF = " + str(int(x.split('_')[3])*4) + " B" + "\nTM = " + str((n_flows * int(x.split('_')[3]) * 4)/1024) + " KB" for x in feature_sets] 285 | #labels[len(topk_features)-1] = str(int(1500/binWidth)) + " features\n(PF = " + str(int(1500/binWidth)*4) + " B)" + "\n(TMem = " + str(int((n_flows * int(1500/binWidth) * 4)/1024)) + " KB)" 286 | ax1.set_xticklabels(labels) 287 | plt.xticks(fontsize=9) 288 | plt.xlabel("Truncation Factor", fontsize=12) 289 | ax1.legend() 290 | 291 | 292 | plt.yticks(fontsize=12) 293 | plt.ylim(bottom=0,top=1) 294 | plt.ylabel("AUC Score", fontsize=12) 295 | 296 | plt.legend(loc='lower right', fontsize=12) 297 | plt.tight_layout() 298 | fig.savefig('Figures/Truncation_comparison/Facet_bin' + str(binWidth) + '_topk_NoSketch_Lines.pdf') # save the figure to file 299 | fig.savefig('Figures/Truncation_comparison/Facet_bin' + str(binWidth) + '_topk_NoSketch_Lines.png') # save the figure to file 300 | plt.close(fig) 301 | 302 | 303 | def GenerateFigures(binWidths, topk_features, nFlows): 304 | if not os.path.exists('Figures'): 305 | os.makedirs('Figures') 306 | 307 | PlotQuantization(binWidths, nFlows) 308 | PlotKQuantizationAndTruncation(binWidths, topk_features, nFlows) 309 | 310 | 311 | 312 | def GenerateFiguresLine(binWidths, topk_features, nFlows): 313 | if not os.path.exists('Figures'): 314 | os.makedirs('Figures') 315 | 316 | TOPK = [10, 20, 30, 40, 50] 317 | PlotQuantizationLines(binWidths, nFlows) 318 | PlotKQuantizationAndTruncationLines(binWidths, TOPK, nFlows) 319 | 320 | 321 | 322 | if __name__ == "__main__": 323 | 324 | #Quantization 325 | BIN_WIDTH = [1, 4, 8, 16, 32, 64, 128, 256] 326 | 327 | #Truncation Top-K features 328 | TOPK = [5, 10, 20, 30, 40, 50, 1500] 329 | TOPK = [10, 20, 30, 40, 50] 330 | 331 | #Total amount of flows per dataset 332 | N_FLOWS = 1000 333 | 334 | PlotQuantizationLines(BIN_WIDTH, N_FLOWS) 335 | PlotKQuantizationAndTruncationLines(BIN_WIDTH, TOPK, N_FLOWS) -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/DeltaShaperAnalysis/generateFigures.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import math 4 | 5 | import matplotlib 6 | if os.environ.get('DISPLAY','') == '': 7 | print('no display found. Using non-interactive Agg backend') 8 | matplotlib.use('Agg') 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | colors = ["0.8", "0.6", "0.2", "0.0"] 13 | colors = ["salmon", "lightsteelblue", "darkseagreen", "thistle", "wheat", "khaki", "skyblue"] 14 | 15 | """ 16 | Attach a text label above each bar displaying its height 17 | """ 18 | def autolabel(rects, ax): 19 | for rect in rects: 20 | height = rect.get_height() 21 | ax.text(rect.get_x() + rect.get_width()/2., 1.005*height, # original height was 1.005*height 22 | "{0:.2f}".format(float(height)), fontsize=7, ha='center', va='bottom') 23 | 24 | 25 | def PlotQuantization(binWidths, n_flows): 26 | print "PlotQuantization" 27 | feature_sets = [] 28 | set_acc = [] 29 | set_fpr =[] 30 | set_fnr = [] 31 | 32 | for binWidth in binWidths: 33 | 34 | feature_folder = 'PL_60_' + str(binWidth) + '_1500' 35 | #print feature_folder 36 | 37 | #Load configuration results 38 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy" 39 | results = np.load(data_folder) 40 | set_acc.append(results[0]) 41 | set_fpr.append(results[1]) 42 | set_fnr.append(results[2]) 43 | feature_sets.append(feature_folder) 44 | 45 | 46 | max_acc = 0 47 | max_fset = "" 48 | for i, f_set in enumerate(feature_sets): 49 | if set_acc[i] > max_acc: 50 | max_acc = set_acc[i] 51 | max_fset = f_set 52 | print "Max acc: %s, Best quantization set: %s"%(max_acc, max_fset) 53 | 54 | fig = plt.figure(figsize=(10,4)) 55 | ax1 = fig.add_subplot(111) 56 | 57 | curr_fset = feature_sets 58 | curr_acc = set_acc 59 | curr_fpr = set_fpr 60 | curr_fnr = set_fnr 61 | #print "Current feature set: "+ str(curr_fset) 62 | 63 | ind = np.arange(len(curr_fset)) # the x locations for the groups 64 | width = 0.20 65 | 66 | rects0 = ax1.bar(ind - width - width/2, curr_acc, width, color=colors[0], label='Acc') 67 | autolabel(rects0,ax1) 68 | rects1 = ax1.bar(ind - width/2 , curr_fpr, width, color=colors[1], label='FPR') 69 | autolabel(rects1,ax1) 70 | rects2 = ax1.bar(ind + width - width/2, curr_fnr, width, color=colors[2], label='FNR') 71 | autolabel(rects2,ax1) 72 | 73 | 74 | ax1.yaxis.grid(color='black', linestyle='dotted') 75 | ax1.set_title('Scores for Quantization') 76 | ax1.set_xticks(ind) 77 | labels = ["K = " + str(int(x.split('_')[2])) + " -> " + str(1500/int(x.split('_')[2])) + " bins" + "\n(PerFlow = " + str(int(1500/int(x.split('_')[2]))*4) + " B)" + "\n(CGMem = " + str((n_flows * int(1500/int(x.split('_')[2]))*4)/1024) + " KB)" for x in feature_sets] 78 | ax1.set_xticklabels(labels) 79 | plt.xticks(fontsize=7) 80 | ax1.legend() 81 | 82 | plt.ylim(top=1) 83 | plt.legend(loc='upper right', fontsize=8) 84 | plt.tight_layout() 85 | fig.savefig('Figures/DeltaShaper_bin_NoSketch.pdf') # save the figure to file 86 | fig.savefig('Figures/DeltaShaper_bin_NoSketch.png') # save the figure to file 87 | plt.close(fig) 88 | 89 | 90 | def PlotQuantizationLines(binWidths, n_flows): 91 | print "PlotQuantizationLines" 92 | feature_sets = [] 93 | set_acc = [] 94 | 95 | for binWidth in binWidths: 96 | 97 | feature_folder = 'PL_60_' + str(binWidth) + '_1500' 98 | #print feature_folder 99 | 100 | #Load configuration results 101 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy" 
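# Note (ours, inferred from how the array is indexed throughout these
# scripts): each classificationResults_*.npy file holds a small result
# vector where index 0 is the accuracy, 1 the false-positive rate, 2 the
# false-negative rate, and 3 the AUC; the line plots below use only the AUC
# entry.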
102 | results = np.load(data_folder) 103 | set_acc.append(results[3]) 104 | feature_sets.append(feature_folder) 105 | 106 | 107 | 108 | fig = plt.figure(figsize=(10,4)) 109 | ax1 = fig.add_subplot(111) 110 | 111 | curr_fset = feature_sets 112 | curr_acc = set_acc 113 | 114 | ind = np.arange(len(curr_fset)) # the x locations for the groups 115 | print curr_acc 116 | ax1.plot(curr_acc, color=colors[0], marker=".", markersize=12, lw=3, label='AUC') 117 | ax1.hlines(0.95, 0, len(ind)-1, lw=3, label='Baseline, AUC = 0.95') 118 | 119 | for i,j in zip(ind,curr_acc): 120 | ax1.annotate("{0:.2f}".format(j),xy=(i-0.15,j+0.03)) 121 | 122 | ax1.yaxis.grid(color='black', linestyle='dotted') 123 | plt.yticks(fontsize=14) 124 | plt.ylim(bottom=0,top=1) 125 | plt.ylabel("AUC Score", fontsize=14) 126 | 127 | 128 | plt.xlim(-0.3, len(ind)-1+0.3) 129 | ax1.set_xticks(ind) 130 | #labels = ["K = " + str(int(x.split('_')[2])) + " -> " + str(1500/int(x.split('_')[2])) + " bins" + "\n(PerFlow = " + str(int(1500/int(x.split('_')[2]))*4) + " B)" + "\n(CGMem = " + str((n_flows * int(1500/int(x.split('_')[2]))*4)/1024) + " KB)" for x in feature_sets] 131 | #labels = ["K = " + str(int(x.split('_')[2])) + "\nPF = " + str(int(1500/int(x.split('_')[2]))*4) + " B" + "\nTM = " + str((n_flows * int(1500/int(x.split('_')[2]))*4)/1024) + " KB" for x in feature_sets] 132 | labels = [str(int(x.split('_')[2])) for x in feature_sets] 133 | 134 | ax1.set_xticklabels(labels) 135 | plt.xticks(fontsize=9) 136 | plt.xlabel("DeltaShaper Quantization Factor (K)", fontsize=12) 137 | ax1.legend() 138 | 139 | 140 | plt.legend(loc='lower right', fontsize=12) 141 | plt.tight_layout() 142 | fig.savefig('Figures/DeltaShaper_bin_NoSketch_Lines.pdf') # save the figure to file 143 | fig.savefig('Figures/DeltaShaper_bin_NoSketch_Lines.png') # save the figure to file 144 | plt.close(fig) 145 | 146 | 147 | def PlotKQuantizationAndTruncation(binWidths, topk_features, n_flows): 148 | print "PlotKQuantizationAndTruncation" 149 | if not os.path.exists('Figures/Truncation_comparison'): 150 | os.makedirs('Figures/Truncation_comparison') 151 | 152 | for binWidth in binWidths: 153 | feature_sets = [] 154 | set_acc = [] 155 | set_fpr =[] 156 | set_fnr = [] 157 | 158 | for topk in topk_features: 159 | 160 | feature_folder = 'PL_60_' + str(binWidth) + '_' + str(topk) 161 | #print feature_folder 162 | 163 | if(topk != 1500 and topk > 1500/binWidth): 164 | #print "Skipping sample, invalid configuration. 
TopK = " + str(topk) + " Total Features = " + str(1500/binWidth) 165 | set_acc.append(0) 166 | set_fpr.append(0) 167 | set_fnr.append(0) 168 | feature_sets.append(feature_folder) 169 | continue 170 | 171 | #Load configuration results 172 | #if(topk == 1500): 173 | # data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase1_NoSketch.npy" 174 | #else: 175 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy" 176 | results = np.load(data_folder) 177 | set_acc.append(results[0]) 178 | set_fpr.append(results[1]) 179 | set_fnr.append(results[2]) 180 | feature_sets.append(feature_folder) 181 | 182 | 183 | #Check best truncation value 184 | max_acc = 0 185 | max_fset = "" 186 | for i, f_set in enumerate(feature_sets[:-1]): 187 | if set_acc[i] > max_acc: 188 | max_acc = set_acc[i] 189 | max_fset = f_set 190 | print "K = " + str(binWidth) + ", Max acc: %s, Best Truncation: %s"%(max_acc, max_fset) 191 | 192 | 193 | #Plot figures 194 | fig = plt.figure(figsize=(10,4)) 195 | ax1 = fig.add_subplot(111) 196 | 197 | curr_fset = feature_sets 198 | curr_acc = set_acc 199 | curr_fpr = set_fpr 200 | curr_fnr = set_fnr 201 | #print "Current feature set: "+ str(curr_fset) 202 | 203 | ind = np.arange(len(curr_fset)) # the x locations for the groups 204 | width = 0.20 205 | 206 | rects0 = ax1.bar(ind - width - width/2, curr_acc, width, color=colors[0], label='Acc') 207 | autolabel(rects0,ax1) 208 | rects1 = ax1.bar(ind - width/2 , curr_fpr, width, color=colors[1], label='FPR') 209 | autolabel(rects1,ax1) 210 | rects2 = ax1.bar(ind + width - width/2, curr_fnr, width, color=colors[2], label='FNR') 211 | autolabel(rects2,ax1) 212 | 213 | ax1.yaxis.grid(color='black', linestyle='dotted') 214 | ax1.set_title('Truncation Scores for K ='+str(binWidth)) 215 | ax1.set_xticks(ind) 216 | print feature_sets 217 | labels = ["Top-k= " + str(int(x.split('_')[3])) + "\n(PerFlow = " + str(int(x.split('_')[3])*4) + " B)" + "\n(CGMem = " + str((n_flows * int(x.split('_')[3]) * 4)/1024) + " KB)" for x in feature_sets] 218 | labels[len(topk_features)-1] = str(int(1500/binWidth)) + " features\n(PerFlow = " + str(int(1500/binWidth)*4) + " B)" + "\n(CGMem = " + str(int((n_flows * int(1500/binWidth) * 4)/1024)) + " KB)" 219 | ax1.set_xticklabels(labels) 220 | plt.xticks(fontsize=9) 221 | ax1.legend() 222 | 223 | plt.ylim(top=1) 224 | plt.legend(loc='upper right', fontsize=10) 225 | plt.tight_layout() 226 | fig.savefig('Figures/Truncation_comparison/DeltaShaper_bin' + str(binWidth) + '_topk_NoSketch.pdf') # save the figure to file 227 | fig.savefig('Figures/Truncation_comparison/DeltaShaper_bin' + str(binWidth) + '_topk_NoSketch.png') # save the figure to file 228 | plt.close(fig) 229 | 230 | def PlotKQuantizationAndTruncationLines(binWidths, topk_features, n_flows): 231 | print "PlotKQuantizationAndTruncation" 232 | if not os.path.exists('Figures/Truncation_comparison'): 233 | os.makedirs('Figures/Truncation_comparison') 234 | 235 | for binWidth in binWidths: 236 | feature_sets = [] 237 | set_acc = [] 238 | 239 | for topk in topk_features: 240 | 241 | feature_folder = 'PL_60_' + str(binWidth) + '_' + str(topk) 242 | #print feature_folder 243 | 244 | if(topk != 1500 and topk > 1500/binWidth): 245 | #print "Skipping sample, invalid configuration. 
TopK = " + str(topk) + " Total Features = " + str(1500/binWidth) 246 | set_acc.append(0) 247 | feature_sets.append(feature_folder) 248 | continue 249 | 250 | #Load configuration results 251 | #if(topk == 1500): 252 | # data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase1_NoSketch.npy" 253 | #else: 254 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy" 255 | results = np.load(data_folder) 256 | set_acc.append(results[3]) 257 | feature_sets.append(feature_folder) 258 | 259 | 260 | #Plot figures 261 | fig = plt.figure(figsize=(10,4)) 262 | ax1 = fig.add_subplot(111) 263 | 264 | curr_fset = feature_sets 265 | curr_acc = set_acc 266 | 267 | #print "Current feature set: "+ str(curr_fset) 268 | 269 | ind = np.arange(len(curr_fset)) # the x locations for the groups 270 | 271 | 272 | ax1.plot(curr_acc, color=colors[0], marker=".", markersize=12, lw=3, label='AUC') 273 | ax1.hlines(0.95, 0, len(ind)-1, lw=3, label='Baseline, AUC = 0.95') 274 | 275 | for i,j in zip(ind,curr_acc): 276 | ax1.annotate("{0:.2f}".format(j),xy=(i-0.1,j+0.03)) 277 | 278 | plt.xlim(-0.3, len(ind)-1+0.3) 279 | ax1.yaxis.grid(color='black', linestyle='dotted') 280 | 281 | ax1.set_xticks(ind) 282 | print feature_sets 283 | labels = [str(int(x.split('_')[3])) for x in feature_sets] 284 | #labels = ["Top-n= " + str(int(x.split('_')[3])) + "\nPF = " + str(int(x.split('_')[3])*4) + " B" + "\nTM = " + str((n_flows * int(x.split('_')[3]) * 4)/1024) + " KB" for x in feature_sets] 285 | #labels[len(topk_features)-1] = str(int(1500/binWidth)) + " features\n(PF = " + str(int(1500/binWidth)*4) + " B)" + "\n(TMem = " + str(int((n_flows * int(1500/binWidth) * 4)/1024)) + " KB)" 286 | ax1.set_xticklabels(labels) 287 | plt.xticks(fontsize=9) 288 | plt.xlabel("Truncation Factor", fontsize=12) 289 | ax1.legend() 290 | 291 | 292 | plt.yticks(fontsize=12) 293 | plt.ylim(bottom=0,top=1) 294 | plt.ylabel("AUC Score", fontsize=12) 295 | 296 | plt.legend(loc='lower right', fontsize=12) 297 | plt.tight_layout() 298 | fig.savefig('Figures/Truncation_comparison/DeltaShaper_bin' + str(binWidth) + '_topk_NoSketch_Lines.pdf') # save the figure to file 299 | fig.savefig('Figures/Truncation_comparison/DeltaShaper_bin' + str(binWidth) + '_topk_NoSketch_Lines.png') # save the figure to file 300 | plt.close(fig) 301 | 302 | 303 | 304 | def GenerateFigures(binWidths, topk_features, nFlows): 305 | if not os.path.exists('Figures'): 306 | os.makedirs('Figures') 307 | 308 | PlotQuantization(binWidths, nFlows) 309 | PlotKQuantizationAndTruncation(binWidths, topk_features, nFlows) 310 | 311 | 312 | def GenerateFiguresLines(binWidths, topk_features, nFlows): 313 | if not os.path.exists('Figures'): 314 | os.makedirs('Figures') 315 | 316 | TOPK = [10, 20, 30, 40, 50] 317 | PlotQuantizationLines(binWidths, nFlows) 318 | PlotKQuantizationAndTruncationLines(binWidths, TOPK, nFlows) 319 | 320 | 321 | 322 | if __name__ == "__main__": 323 | 324 | #Quantization 325 | BIN_WIDTH = [1, 4, 8, 16, 32, 64, 128, 256] 326 | 327 | #Truncation Top-K features 328 | TOPK = [5, 10, 20, 30, 40, 50, 1500] 329 | TOPK = [10, 20, 30, 40, 50] 330 | 331 | 332 | #Total amount of flows per dataset 333 | N_FLOWS = 300 334 | 335 | PlotQuantizationLines(BIN_WIDTH, N_FLOWS) 336 | PlotKQuantizationAndTruncationLines(BIN_WIDTH, TOPK, N_FLOWS) -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/FacetAnalysis/compressive_ta.py: 
-------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | import os 4 | import math 5 | 6 | 7 | def CreateCompressiveRepresentation(MODE, BIN_WIDTH, TOPK, SIGMA_PARAM, COMPRESSIVE_RATIO): 8 | 9 | for compressive_ratio in COMPRESSIVE_RATIO: 10 | for binWidth in BIN_WIDTH: 11 | for topk in TOPK: 12 | feature_set = 'PL_60_' + str(binWidth) + '_' + str(topk) 13 | data_folder = 'FeatureSets/' + feature_set + '/' 14 | 15 | #Sensing matrix parameters 16 | N = 0 17 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r') 18 | reader = csv.reader(f, delimiter=',') 19 | for n, row in enumerate(reader): 20 | if(n == 0): 21 | N = len(row) -1 #Read number of bins from file 22 | f.close() 23 | 24 | M = N/compressive_ratio 25 | 26 | if(M < 1): 27 | print "Cannot compress further(features = %d, ratio = %d), only 1 feature left"%(N, compressive_ratio) 28 | continue 29 | 30 | np.random.seed(1) 31 | 32 | print "Compressive Ratio: %d"%(compressive_ratio) 33 | print "M: %d"%(M) 34 | print "N: %d"%(N) 35 | 36 | ###################################### 37 | # GAUSSIAN SENSING MATRIX 38 | ###################################### 39 | if MODE == "compressive_gaussian": 40 | print "Start Compressive Gaussian Representation" 41 | for sigma_param in SIGMA_PARAM: 42 | 43 | """ 44 | Generate sensing matrix 45 | """ 46 | 47 | sensing_matrix = np.random.normal(0,1,(M,N)) 48 | 49 | """ 50 | Process Phase 1 Data 51 | """ 52 | 53 | #Regular Traffic 54 | print "Compressive Gaussian: Phase 1, Regular - " + feature_set + "/CompressiveGaussian_regularTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv" 55 | output = open(data_folder + "CompressiveGaussian_regularTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w") 56 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r') 57 | reader = csv.reader(f, delimiter=',') 58 | 59 | #Process data row 60 | for n, row in enumerate(reader): 61 | if(n == 0): 62 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 63 | else: 64 | #Gather the first n packets array 65 | first_n_packets_vector = [] 66 | for i in row[:-1]: 67 | first_n_packets_vector.append(int(i)) 68 | 69 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 70 | 71 | #print "Compressed vector: " + str(compressed_vector) 72 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 73 | output.close() 74 | 75 | 76 | #Facet Traffic 77 | print "Compressive Gaussian: Phase 1, Facet - " + feature_set + "/CompressiveGaussian_facetTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv" 78 | output = open(data_folder + "CompressiveGaussian_facetTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w") 79 | f = open(data_folder + "FacetTraffic_50_phase1_dataset.csv", 'r') 80 | reader = csv.reader(f, delimiter=',') 81 | 82 | #Process data row 83 | for n, row in enumerate(reader): 84 | if(n == 0): 85 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 86 | else: 87 | #Gather the first n packets array 88 | first_n_packets_vector = [] 89 | for i in row[:-1]: 90 | first_n_packets_vector.append(int(i)) 91 | 92 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 93 | 94 | #print "Compressed vector: " + str(compressed_vector) 95 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 96 | output.close() 
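# ---------------------------------------------------------------------------
# Illustrative sketch (ours, not part of the original pipeline): the loops
# above compress each N-bin packet-length histogram x into M = N /
# compressive_ratio measurements y through a random Gaussian sensing matrix,
# y = Phi @ x.  Note that Phi is drawn from N(0, 1) regardless of
# sigma_param, which only ends up in the output file name.  A minimal,
# self-contained version of the projection:

def _gaussian_projection_sketch(histogram, compressive_ratio, seed=1):
    import numpy as _np
    x = _np.asarray(histogram, dtype=float)
    n = len(x)
    m = n // compressive_ratio             # number of compressed features
    _np.random.seed(seed)
    phi = _np.random.normal(0, 1, (m, n))  # Gaussian sensing matrix
    return _np.matmul(phi, x)              # compressed representation, length m

# Example: for a QL=16 feature set (93 bins) and ratio 4, the sketch returns
# a 23-dimensional vector per flow.
# ---------------------------------------------------------------------------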
97 | 98 | ######################################################################################## 99 | ######################################################################################## 100 | ######################################################################################## 101 | 102 | 103 | """ 104 | Process Phase 2 Data 105 | """ 106 | 107 | #Regular Traffic 108 | print "Compressive Gaussian: Phase 2, Regular - " + feature_set + "/CompressiveGaussian_regularTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv" 109 | output = open(data_folder + "CompressiveGaussian_regularTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w") 110 | f = open(data_folder + "RegularTraffic_phase2_dataset.csv", 'r') 111 | reader = csv.reader(f, delimiter=',') 112 | 113 | #Process data row 114 | for n, row in enumerate(reader): 115 | if(n == 0): 116 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 117 | else: 118 | #Gather the first n packets array 119 | first_n_packets_vector = [] 120 | for i in row[:-1]: 121 | first_n_packets_vector.append(int(i)) 122 | 123 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 124 | 125 | #print "Compressed vector: " + str(compressed_vector) 126 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 127 | output.close() 128 | 129 | 130 | #Facet Traffic 131 | print "Compressive Gaussian Phase 2, Facet - " + feature_set + "/CompressiveGaussian_facetTraffic_phase2_" + str(sigma_param) + "_" + "_" + str(compressive_ratio) + "_dataset.csv" 132 | output = open(data_folder + "CompressiveGaussian_facetTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w") 133 | f = open(data_folder + "FacetTraffic_50_phase2_dataset.csv", 'r') 134 | reader = csv.reader(f, delimiter=',') 135 | 136 | #Process data row 137 | for n, row in enumerate(reader): 138 | if(n == 0): 139 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 140 | else: 141 | #Gather the first n packets array 142 | first_n_packets_vector = [] 143 | for i in row[:-1]: 144 | first_n_packets_vector.append(int(i)) 145 | 146 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 147 | 148 | #print "Compressed vector: " + str(compressed_vector) 149 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 150 | output.close() 151 | 152 | ###################################### 153 | # BERNOULLI SENSING MATRIX 154 | ###################################### 155 | elif MODE == "compressive_bernoulli": 156 | print "Start Compressive Bernoulli Representation" 157 | 158 | """ 159 | Generate sensing matrix 160 | """ 161 | values = [-1/float(math.sqrt(N)), 1/float(math.sqrt(N))] 162 | sensing_matrix = np.random.choice(values,(M,N), p=[0.5, 0.5]) 163 | 164 | 165 | """ 166 | Process Phase 1 Data 167 | """ 168 | 169 | #Regular Traffic 170 | print "Compressive Bernoulli: Phase 1, Regular - " + feature_set + "/CompressiveBernoulli_regularTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv" 171 | output = open(data_folder + "CompressiveBernoulli_regularTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv", "w") 172 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r') 173 | reader = csv.reader(f, delimiter=',') 174 | 175 | #Process data row 176 | for n, row in enumerate(reader): 177 | if(n == 0): 178 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 179 | 
else: 180 | #Gather the first n packets array 181 | first_n_packets_vector = [] 182 | for i in row[:-1]: 183 | first_n_packets_vector.append(int(i)) 184 | 185 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 186 | 187 | #print "Compressed vector: " + str(compressed_vector) 188 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 189 | output.close() 190 | 191 | 192 | #Facet Traffic 193 | print "Compressive Bernoulli: Phase 1, Facet - " + feature_set + "/CompressiveBernoulli_facetTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv" 194 | output = open(data_folder + "CompressiveBernoulli_facetTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv", "w") 195 | f = open(data_folder + "FacetTraffic_50_phase1_dataset.csv", 'r') 196 | reader = csv.reader(f, delimiter=',') 197 | 198 | #Process data row 199 | for n, row in enumerate(reader): 200 | if(n == 0): 201 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 202 | else: 203 | #Gather the first n packets array 204 | first_n_packets_vector = [] 205 | for i in row[:-1]: 206 | first_n_packets_vector.append(int(i)) 207 | 208 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 209 | 210 | #print "Compressed vector: " + str(compressed_vector) 211 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 212 | output.close() 213 | 214 | ######################################################################################## 215 | ######################################################################################## 216 | ######################################################################################## 217 | 218 | 219 | """ 220 | Process Phase 2 Data 221 | """ 222 | 223 | #Regular Traffic 224 | print "Compressive Bernoulli: Phase 2, Regular - " + feature_set + "/CompressiveBernoulli_regularTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv" 225 | output = open(data_folder + "CompressiveBernoulli_regularTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv", "w") 226 | f = open(data_folder + "RegularTraffic_phase2_dataset.csv", 'r') 227 | reader = csv.reader(f, delimiter=',') 228 | 229 | #Process data row 230 | for n, row in enumerate(reader): 231 | if(n == 0): 232 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 233 | else: 234 | #Gather the first n packets array 235 | first_n_packets_vector = [] 236 | for i in row[:-1]: 237 | first_n_packets_vector.append(int(i)) 238 | 239 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 240 | 241 | #print "Compressed vector: " + str(compressed_vector) 242 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 243 | output.close() 244 | 245 | 246 | #Facet Traffic 247 | print "Compressive Bernoulli Phase 2, Facet - " + feature_set + "/CompressiveBernoulli_facetTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv" 248 | output = open(data_folder + "CompressiveBernoulli_facetTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv", "w") 249 | f = open(data_folder + "FacetTraffic_50_phase2_dataset.csv", 'r') 250 | reader = csv.reader(f, delimiter=',') 251 | 252 | #Process data row 253 | for n, row in enumerate(reader): 254 | if(n == 0): 255 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 256 | else: 257 | #Gather the first n packets array 258 | first_n_packets_vector = [] 259 | for i in row[:-1]: 260 | first_n_packets_vector.append(int(i)) 261 | 262 | 
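# ---------------------------------------------------------------------------
# Sketch (ours): the Bernoulli sensing matrix built for this branch draws
# every entry uniformly from {-1/sqrt(N), +1/sqrt(N)}; unlike the Gaussian
# variant it is generated once per configuration, since there is no
# sigma_param to sweep.  A minimal reproduction of that construction:
#
#   import math
#   import numpy as np
#   N, M = 93, 23                               # e.g. QL=16 bins, ratio 4
#   values = [-1 / math.sqrt(N), 1 / math.sqrt(N)]
#   phi = np.random.choice(values, (M, N), p=[0.5, 0.5])
# ---------------------------------------------------------------------------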
compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 263 | 264 | #print "Compressed vector: " + str(compressed_vector) 265 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 266 | output.close() -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/DeltaShaperAnalysis/compressive_ta.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | import os 4 | import math 5 | 6 | 7 | def CreateCompressiveRepresentation(MODE, BIN_WIDTH, TOPK, SIGMA_PARAM, COMPRESSIVE_RATIO): 8 | 9 | for compressive_ratio in COMPRESSIVE_RATIO: 10 | for binWidth in BIN_WIDTH: 11 | for topk in TOPK: 12 | feature_set = 'PL_60_' + str(binWidth) + '_' + str(topk) 13 | data_folder = 'FeatureSets/' + feature_set + '/' 14 | 15 | #Sensing matrix parameters 16 | N = 0 17 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r') 18 | reader = csv.reader(f, delimiter=',') 19 | for n, row in enumerate(reader): 20 | if(n == 0): 21 | N = len(row) -1 #Read number of bins from file 22 | f.close() 23 | 24 | M = N/compressive_ratio 25 | 26 | if(M < 1): 27 | print "Cannot compress further(features = %d, ratio = %d), only 1 feature left"%(N, compressive_ratio) 28 | continue 29 | 30 | np.random.seed(1) 31 | 32 | print "Compressive Ratio: %d"%(compressive_ratio) 33 | print "M: %d"%(M) 34 | print "N: %d"%(N) 35 | 36 | ###################################### 37 | # GAUSSIAN SENSING MATRIX 38 | ###################################### 39 | if MODE == "compressive_gaussian": 40 | print "Start Compressive Gaussian Representation" 41 | for sigma_param in SIGMA_PARAM: 42 | 43 | """ 44 | Generate sensing matrix 45 | """ 46 | 47 | sensing_matrix = np.random.normal(0,1,(M,N)) 48 | 49 | """ 50 | Process Phase 1 Data 51 | """ 52 | 53 | #Regular Traffic 54 | print "Compressive Gaussian: Phase 1, Regular - " + feature_set + "/CompressiveGaussian_regularTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv" 55 | output = open(data_folder + "CompressiveGaussian_regularTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w") 56 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r') 57 | reader = csv.reader(f, delimiter=',') 58 | 59 | #Process data row 60 | for n, row in enumerate(reader): 61 | if(n == 0): 62 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 63 | else: 64 | #Gather the first n packets array 65 | first_n_packets_vector = [] 66 | for i in row[:-1]: 67 | first_n_packets_vector.append(int(i)) 68 | 69 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 70 | 71 | #print "Compressed vector: " + str(compressed_vector) 72 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 73 | output.close() 74 | 75 | 76 | #DeltaShaper Traffic 77 | print "Compressive Gaussian: Phase 1, DeltaShaper - " + feature_set + "/CompressiveGaussian_deltashaperTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv" 78 | output = open(data_folder + "CompressiveGaussian_deltashaperTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w") 79 | f = open(data_folder + "DeltaShaperTraffic_320_phase1_dataset.csv", 'r') 80 | reader = csv.reader(f, delimiter=',') 81 | 82 | #Process data row 83 | for n, row in enumerate(reader): 84 | if(n == 0): 85 | output.write(",".join(str(x) for x in 
range(0,M)) + "," + row[-1] + "\n") 86 | else: 87 | #Gather the first n packets array 88 | first_n_packets_vector = [] 89 | for i in row[:-1]: 90 | first_n_packets_vector.append(int(i)) 91 | 92 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 93 | 94 | #print "Compressed vector: " + str(compressed_vector) 95 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 96 | output.close() 97 | 98 | ######################################################################################## 99 | ######################################################################################## 100 | ######################################################################################## 101 | 102 | 103 | """ 104 | Process Phase 2 Data 105 | """ 106 | 107 | #Regular Traffic 108 | print "Compressive Gaussian: Phase 2, Regular - " + feature_set + "/CompressiveGaussian_regularTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv" 109 | output = open(data_folder + "CompressiveGaussian_regularTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w") 110 | f = open(data_folder + "RegularTraffic_phase2_dataset.csv", 'r') 111 | reader = csv.reader(f, delimiter=',') 112 | 113 | #Process data row 114 | for n, row in enumerate(reader): 115 | if(n == 0): 116 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 117 | else: 118 | #Gather the first n packets array 119 | first_n_packets_vector = [] 120 | for i in row[:-1]: 121 | first_n_packets_vector.append(int(i)) 122 | 123 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 124 | 125 | #print "Compressed vector: " + str(compressed_vector) 126 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 127 | output.close() 128 | 129 | 130 | #DeltaShaper Traffic 131 | print "Compressive Gaussian Phase 2, DeltaShaper - " + feature_set + "/CompressiveGaussian_deltashaperTraffic_phase2_" + str(sigma_param) + "_" + "_" + str(compressive_ratio) + "_dataset.csv" 132 | output = open(data_folder + "CompressiveGaussian_deltashaperTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w") 133 | f = open(data_folder + "DeltaShaperTraffic_320_phase2_dataset.csv", 'r') 134 | reader = csv.reader(f, delimiter=',') 135 | 136 | #Process data row 137 | for n, row in enumerate(reader): 138 | if(n == 0): 139 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 140 | else: 141 | #Gather the first n packets array 142 | first_n_packets_vector = [] 143 | for i in row[:-1]: 144 | first_n_packets_vector.append(int(i)) 145 | 146 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 147 | 148 | #print "Compressed vector: " + str(compressed_vector) 149 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 150 | output.close() 151 | 152 | ###################################### 153 | # BERNOULLI SENSING MATRIX 154 | ###################################### 155 | elif MODE == "compressive_bernoulli": 156 | print "Start Compressive Bernoulli Representation" 157 | 158 | """ 159 | Generate sensing matrix 160 | """ 161 | values = [-1/float(math.sqrt(N)), 1/float(math.sqrt(N))] 162 | sensing_matrix = np.random.choice(values,(M,N), p=[0.5, 0.5]) 163 | 164 | 165 | """ 166 | Process Phase 1 Data 167 | """ 168 | 169 | #Regular Traffic 170 | print "Compressive Bernoulli: Phase 1, Regular - " + feature_set + 
"/CompressiveBernoulli_regularTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv" 171 | output = open(data_folder + "CompressiveBernoulli_regularTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv", "w") 172 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r') 173 | reader = csv.reader(f, delimiter=',') 174 | 175 | #Process data row 176 | for n, row in enumerate(reader): 177 | if(n == 0): 178 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 179 | else: 180 | #Gather the first n packets array 181 | first_n_packets_vector = [] 182 | for i in row[:-1]: 183 | first_n_packets_vector.append(int(i)) 184 | 185 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 186 | 187 | #print "Compressed vector: " + str(compressed_vector) 188 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 189 | output.close() 190 | 191 | 192 | #DeltaShaper Traffic 193 | print "Compressive Bernoulli: Phase 1, DeltaShaper - " + feature_set + "/CompressiveBernoulli_deltashaperTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv" 194 | output = open(data_folder + "CompressiveBernoulli_deltashaperTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv", "w") 195 | f = open(data_folder + "DeltaShaperTraffic_320_phase1_dataset.csv", 'r') 196 | reader = csv.reader(f, delimiter=',') 197 | 198 | #Process data row 199 | for n, row in enumerate(reader): 200 | if(n == 0): 201 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 202 | else: 203 | #Gather the first n packets array 204 | first_n_packets_vector = [] 205 | for i in row[:-1]: 206 | first_n_packets_vector.append(int(i)) 207 | 208 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 209 | 210 | #print "Compressed vector: " + str(compressed_vector) 211 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 212 | output.close() 213 | 214 | ######################################################################################## 215 | ######################################################################################## 216 | ######################################################################################## 217 | 218 | 219 | """ 220 | Process Phase 2 Data 221 | """ 222 | 223 | #Regular Traffic 224 | print "Compressive Bernoulli: Phase 2, Regular - " + feature_set + "/CompressiveBernoulli_regularTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv" 225 | output = open(data_folder + "CompressiveBernoulli_regularTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv", "w") 226 | f = open(data_folder + "RegularTraffic_phase2_dataset.csv", 'r') 227 | reader = csv.reader(f, delimiter=',') 228 | 229 | #Process data row 230 | for n, row in enumerate(reader): 231 | if(n == 0): 232 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 233 | else: 234 | #Gather the first n packets array 235 | first_n_packets_vector = [] 236 | for i in row[:-1]: 237 | first_n_packets_vector.append(int(i)) 238 | 239 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 240 | 241 | #print "Compressed vector: " + str(compressed_vector) 242 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 243 | output.close() 244 | 245 | 246 | #DeltaShaper Traffic 247 | print "Compressive Bernoulli Phase 2, DeltaShaper - " + feature_set + "/CompressiveBernoulli_deltashaperTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv" 248 | output = open(data_folder 
+ "CompressiveBernoulli_deltashaperTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv", "w") 249 | f = open(data_folder + "DeltaShaperTraffic_320_phase2_dataset.csv", 'r') 250 | reader = csv.reader(f, delimiter=',') 251 | 252 | #Process data row 253 | for n, row in enumerate(reader): 254 | if(n == 0): 255 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n") 256 | else: 257 | #Gather the first n packets array 258 | first_n_packets_vector = [] 259 | for i in row[:-1]: 260 | first_n_packets_vector.append(int(i)) 261 | 262 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector) 263 | 264 | #print "Compressed vector: " + str(compressed_vector) 265 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n") 266 | output.close() -------------------------------------------------------------------------------- /Flow Marker Accumulator/flowlens-v1model.p4: -------------------------------------------------------------------------------- 1 | /* -*- P4_16 -*- */ 2 | #include 3 | #include 4 | 5 | /*Set number of shifts according to the quantization level 6 | QL=2, 1 7 | QL=4, 2 8 | QL=8, 3 9 | QL=16, 4 10 | QL=32, 5 11 | QL=64, 6 12 | QL=128, 7 13 | QL=256, 8 14 | */ 15 | 16 | /* In our running example, we will use QL=16 */ 17 | const bit<8> BIN_WIDTH_SHIFT = 4; 18 | 19 | /* Number of counters for each flow */ 20 | const bit<32> FLOW_BINS = 1500 >> BIN_WIDTH_SHIFT; //94 flow counters for QL=16 21 | 22 | /* Number of flows in each partition */ 23 | const bit<32> FLOWS_PER_PARTITION = 3000; 24 | 25 | const bit<32> PARTITION_SIZE = FLOWS_PER_PARTITION*FLOW_BINS; 26 | 27 | /* Number of packet sizes considered for truncation */ 28 | const bit<32> NUM_PKT_SIZES = 1500; 29 | 30 | /* To flag bins not to be counted */ 31 | const bit<1> NOBIN_FLAG = 0; 32 | 33 | 34 | typedef bit<9> egressSpec_t; 35 | typedef bit<48> macAddr_t; 36 | typedef bit<32> ip4Addr_t; 37 | const bit<16> TYPE_IPV4 = 0x800; 38 | typedef bit<8> ip_protocol_t; 39 | const ip_protocol_t IP_PROTOCOLS_TCP = 6; 40 | const ip_protocol_t IP_PROTOCOLS_UDP = 17; 41 | 42 | /************************************************************************* 43 | *********************** H E A D E R S *********************************** 44 | *************************************************************************/ 45 | 46 | header ethernet_t { 47 | macAddr_t dstAddr; 48 | macAddr_t srcAddr; 49 | bit<16> etherType; 50 | } 51 | 52 | header ipv4_t { 53 | bit<4> version; 54 | bit<4> ihl; 55 | bit<8> diffserv; 56 | bit<16> totalLen; 57 | bit<16> identification; 58 | bit<3> flags; 59 | bit<13> fragOffset; 60 | bit<8> ttl; 61 | bit<8> protocol; 62 | bit<16> hdrChecksum; 63 | ip4Addr_t srcAddr; 64 | ip4Addr_t dstAddr; 65 | } 66 | 67 | header tcp_t { 68 | bit<16> srcPort; 69 | bit<16> dstPort; 70 | bit<32> seqNo; 71 | bit<32> ackNo; 72 | bit<4> dataOffset; 73 | bit<3> res; 74 | bit<3> ecn; 75 | bit<6> ctrl; 76 | bit<16> window; 77 | bit<16> checksum; 78 | bit<16> urgentPtr; 79 | } 80 | 81 | header udp_t { 82 | bit<16> srcPort; 83 | bit<16> dstPort; 84 | bit<16> length_; 85 | bit<16> checksum; 86 | } 87 | 88 | //User-defined metadata 89 | struct metadata { 90 | bit truncation_flag; // marks whether or not the current pkt has to be counted 91 | bit<32> rg_bin_offset; // this is computed by adding the binIndex_posTruncation to the flow_offset 92 | bit<32> binIndex_preTruncation; 93 | bit<32> binIndex_posTruncation; 94 | } 95 | 96 | struct headers { 97 | ethernet_t ethernet; 98 | ipv4_t ipv4; 99 | tcp_t tcp; 
100 | udp_t udp; 101 | } 102 | 103 | /************************************************************************* 104 | *********************** P A R S E R *********************************** 105 | *************************************************************************/ 106 | 107 | parser MyParser(packet_in packet, 108 | out headers hdr, 109 | inout metadata meta, 110 | inout standard_metadata_t standard_metadata) { 111 | 112 | // Initial state of the parser 113 | state start { 114 | transition parse_ethernet; 115 | } 116 | 117 | state parse_ethernet { 118 | packet.extract(hdr.ethernet); 119 | transition select(hdr.ethernet.etherType) { 120 | TYPE_IPV4: parse_ipv4; 121 | default: accept; 122 | } 123 | } 124 | 125 | state parse_ipv4 { 126 | packet.extract(hdr.ipv4); 127 | transition select(hdr.ipv4.protocol) { 128 | 6: parse_tcp; 129 | 17: parse_udp; 130 | default: accept; 131 | } 132 | } 133 | 134 | state parse_tcp { 135 | packet.extract(hdr.tcp); 136 | transition accept; 137 | } 138 | 139 | state parse_udp { 140 | packet.extract(hdr.udp); 141 | transition accept; 142 | } 143 | } 144 | 145 | 146 | /************************************************************************* 147 | ************ C H E C K S U M V E R I F I C A T I O N ************* 148 | *************************************************************************/ 149 | 150 | control MyVerifyChecksum(inout headers hdr, inout metadata meta) { 151 | apply { } 152 | } 153 | 154 | /************************************************************************* 155 | ************** I N G R E S S P R O C E S S I N G ******************* 156 | *************************************************************************/ 157 | 158 | control MyIngress(inout headers hdr, 159 | inout metadata meta, 160 | inout standard_metadata_t standard_metadata) { 161 | 162 | action drop() { 163 | mark_to_drop(standard_metadata); 164 | } 165 | 166 | 167 | /////////////////////////////////////////////////////// 168 | //Set ipv4 forwarding for packets traversing the switch 169 | /////////////////////////////////////////////////////// 170 | action ipv4_forward(macAddr_t dstAddr, egressSpec_t port) { 171 | standard_metadata.egress_spec = port; //Sets the egress port for the next hop. 172 | hdr.ethernet.srcAddr = hdr.ethernet.dstAddr; //Updates the ethernet destination address with the address of the next hop. 173 | hdr.ethernet.dstAddr = dstAddr; //Updates the ethernet source address with the address of the switch. 
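        // Note (added for clarity): taken together, the two assignments above
        // perform the usual L2 rewrite for the next hop -- the outgoing
        // source MAC is set to the address the frame previously carried as
        // its destination (i.e. this switch's interface), and the destination
        // MAC becomes the next-hop address supplied by the control plane.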
174 | hdr.ipv4.ttl = hdr.ipv4.ttl - 1; //Decrements time to live 175 | } 176 | 177 | 178 | table ipv4_lpm { 179 | key = { 180 | hdr.ipv4.dstAddr: exact; 181 | hdr.ipv4.srcAddr: exact; 182 | } 183 | actions = { 184 | ipv4_forward; 185 | drop; 186 | } 187 | size = 1024; 188 | default_action = drop(); 189 | } 190 | 191 | apply { 192 | 193 | if (hdr.ipv4.isValid()) { 194 | 195 | ipv4_lpm.apply(); 196 | 197 | } 198 | else { 199 | drop(); 200 | } 201 | } 202 | } 203 | 204 | /************************************************************************* 205 | **************** E G R E S S P R O C E S S I N G ******************* 206 | *************************************************************************/ 207 | 208 | control MyEgress(inout headers hdr, 209 | inout metadata meta, 210 | inout standard_metadata_t standard_metadata) { 211 | 212 | register>(PARTITION_SIZE) reg_grid0; 213 | register>(PARTITION_SIZE) reg_grid1; 214 | register>(PARTITION_SIZE) reg_grid2; 215 | register>(PARTITION_SIZE) reg_grid3; 216 | register>(PARTITION_SIZE) reg_grid4; 217 | register>(PARTITION_SIZE) reg_grid5; 218 | register>(PARTITION_SIZE) reg_grid6; 219 | register>(PARTITION_SIZE) reg_grid7; 220 | register>(PARTITION_SIZE) reg_grid8; 221 | 222 | 223 | //****************** Register Actions Definition************************ 224 | action reg_grid0_action() { 225 | bit<16> value; 226 | reg_grid0.read(value, meta.rg_bin_offset); 227 | value = value+1; 228 | reg_grid0.write(meta.rg_bin_offset, value); 229 | } 230 | 231 | action reg_grid1_action() { 232 | bit<16> value; 233 | reg_grid1.read(value, meta.rg_bin_offset); 234 | value = value+1; 235 | reg_grid1.write(meta.rg_bin_offset, value); 236 | } 237 | 238 | action reg_grid2_action() { 239 | bit<16> value; 240 | reg_grid2.read(value, meta.rg_bin_offset); 241 | value = value+1; 242 | reg_grid2.write(meta.rg_bin_offset, value); 243 | } 244 | 245 | action reg_grid3_action() { 246 | bit<16> value; 247 | reg_grid3.read(value, meta.rg_bin_offset); 248 | value = value+1; 249 | reg_grid3.write(meta.rg_bin_offset, value); 250 | } 251 | 252 | action reg_grid4_action() { 253 | bit<16> value; 254 | reg_grid4.read(value, meta.rg_bin_offset); 255 | value = value+1; 256 | reg_grid4.write(meta.rg_bin_offset, value); 257 | } 258 | 259 | action reg_grid5_action() { 260 | bit<16> value; 261 | reg_grid5.read(value, meta.rg_bin_offset); 262 | value = value+1; 263 | reg_grid5.write(meta.rg_bin_offset, value); 264 | } 265 | 266 | action reg_grid6_action() { 267 | bit<16> value; 268 | reg_grid6.read(value, meta.rg_bin_offset); 269 | value = value+1; 270 | reg_grid6.write(meta.rg_bin_offset, value); 271 | } 272 | 273 | action reg_grid7_action() { 274 | bit<16> value; 275 | reg_grid7.read(value, meta.rg_bin_offset); 276 | value = value+1; 277 | reg_grid7.write(meta.rg_bin_offset, value); 278 | } 279 | 280 | action reg_grid8_action() { 281 | bit<16> value; 282 | reg_grid8.read(value, meta.rg_bin_offset); 283 | value = value+1; 284 | reg_grid8.write(meta.rg_bin_offset, value); 285 | } 286 | 287 | //******************End Register Actions Definition********************* 288 | 289 | //****************** Other Actions Definition************************ 290 | 291 | // flow_offset: is used for indexing the flow within a bin of the reg_grid 292 | action set_flow_data(bit<32> flow_offset) { 293 | meta.rg_bin_offset = flow_offset + meta.binIndex_posTruncation; 294 | } 295 | 296 | action quantization_act(){ 297 | meta.binIndex_preTruncation = (bit<32>) (standard_metadata.packet_length >> BIN_WIDTH_SHIFT); 298 
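    // Worked example (illustrative): with QL = 16 (BIN_WIDTH_SHIFT = 4), a
    // 1280-byte packet is quantized to bin 1280 >> 4 = 80, and FLOW_BINS =
    // 1500 >> 4 = 93 counters are kept per flow.  After truncation_tbl maps
    // the bin to its post-truncation index, the register slot that gets
    // incremented is rg_bin_offset = flow_offset + binIndex_posTruncation,
    // where flow_offset is the control-plane-assigned base of the flow
    // inside its partition (typically a multiple of the per-flow bin count
    // so that flows do not overlap).  One partition of 3000 flows therefore
    // holds 3000 * 93 = 279,000 16-bit counters, roughly 545 KB of register
    // memory.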
| } 299 | 300 | action truncate_binIndex(bit<32> new_index, bit flag) { 301 | meta.binIndex_posTruncation = new_index; 302 | meta.truncation_flag = flag; 303 | } 304 | 305 | 306 | //******************End Other Actions Definition********************* 307 | 308 | //******************Tables Definition************************** 309 | 310 | table flow_tbl0 { 311 | key = { 312 | hdr.ipv4.dstAddr: exact; 313 | hdr.ipv4.srcAddr: exact; 314 | meta.truncation_flag : exact; 315 | } 316 | actions = { 317 | set_flow_data; 318 | NoAction(); 319 | } 320 | default_action = NoAction(); 321 | size = FLOWS_PER_PARTITION; 322 | } 323 | 324 | table flow_tbl1 { 325 | key = { 326 | hdr.ipv4.dstAddr: exact; 327 | hdr.ipv4.srcAddr: exact; 328 | meta.truncation_flag : exact; 329 | } 330 | actions = { 331 | set_flow_data; 332 | NoAction(); 333 | } 334 | default_action = NoAction(); 335 | size = FLOWS_PER_PARTITION; 336 | } 337 | 338 | table flow_tbl2 { 339 | key = { 340 | hdr.ipv4.dstAddr: exact; 341 | hdr.ipv4.srcAddr: exact; 342 | meta.truncation_flag : exact; 343 | } 344 | actions = { 345 | set_flow_data; 346 | NoAction(); 347 | } 348 | default_action = NoAction(); 349 | size = FLOWS_PER_PARTITION; 350 | } 351 | 352 | table flow_tbl3 { 353 | key = { 354 | hdr.ipv4.dstAddr: exact; 355 | hdr.ipv4.srcAddr: exact; 356 | meta.truncation_flag : exact; 357 | } 358 | actions = { 359 | set_flow_data; 360 | NoAction(); 361 | } 362 | default_action = NoAction(); 363 | size = FLOWS_PER_PARTITION; 364 | } 365 | 366 | table flow_tbl4 { 367 | key = { 368 | hdr.ipv4.dstAddr: exact; 369 | hdr.ipv4.srcAddr: exact; 370 | meta.truncation_flag : exact; 371 | } 372 | actions = { 373 | set_flow_data; 374 | NoAction(); 375 | } 376 | default_action = NoAction(); 377 | size = FLOWS_PER_PARTITION; 378 | } 379 | 380 | table flow_tbl5 { 381 | key = { 382 | hdr.ipv4.dstAddr: exact; 383 | hdr.ipv4.srcAddr: exact; 384 | meta.truncation_flag : exact; 385 | } 386 | actions = { 387 | set_flow_data; 388 | NoAction(); 389 | } 390 | default_action = NoAction(); 391 | size = FLOWS_PER_PARTITION; 392 | } 393 | 394 | table flow_tbl6 { 395 | key = { 396 | hdr.ipv4.dstAddr: exact; 397 | hdr.ipv4.srcAddr: exact; 398 | meta.truncation_flag : exact; 399 | } 400 | actions = { 401 | set_flow_data; 402 | NoAction(); 403 | } 404 | default_action = NoAction(); 405 | size = FLOWS_PER_PARTITION; 406 | } 407 | 408 | table flow_tbl7 { 409 | key = { 410 | hdr.ipv4.dstAddr: exact; 411 | hdr.ipv4.srcAddr: exact; 412 | meta.truncation_flag : exact; 413 | } 414 | actions = { 415 | set_flow_data; 416 | NoAction(); 417 | } 418 | default_action = NoAction(); 419 | size = FLOWS_PER_PARTITION; 420 | } 421 | 422 | table flow_tbl8 { 423 | key = { 424 | hdr.ipv4.dstAddr: exact; 425 | hdr.ipv4.srcAddr: exact; 426 | meta.truncation_flag : exact; 427 | } 428 | actions = { 429 | set_flow_data; 430 | NoAction(); 431 | } 432 | default_action = NoAction(); 433 | size = FLOWS_PER_PARTITION; 434 | } 435 | 436 | table truncation_tbl { 437 | key = { 438 | meta.binIndex_preTruncation: exact; 439 | } 440 | actions = { 441 | truncate_binIndex(); 442 | NoAction(); 443 | } 444 | default_action = truncate_binIndex(0, NOBIN_FLAG); 445 | size = NUM_PKT_SIZES; 446 | } 447 | 448 | 449 | //******************End Tables Definition*********************** 450 | 451 | 452 | apply { 453 | 454 | quantization_act(); 455 | 456 | truncation_tbl.apply(); 457 | 458 | if(flow_tbl0.apply().hit) { 459 | reg_grid0_action(); 460 | } 461 | else { 462 | if(flow_tbl1.apply().hit) { 463 | reg_grid1_action(); 464 | } 465 | 
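                // Control-plane sketch (hypothetical, assuming the BMv2
                // simple_switch_CLI and that counted bins are installed with
                // truncation_flag = 1; addresses and offsets are made up):
                //
                //   # keep quantized bin 80 and remap it to compact index 3
                //   table_add truncation_tbl truncate_binIndex 80 => 3 1
                //   # pin flow (10.0.0.1 -> 10.0.0.2) to slot 50 of partition 0
                //   table_add flow_tbl0 set_flow_data 10.0.0.2 10.0.0.1 1 => 4650
                //
                // where 4650 = 50 * 93 would be that flow's base offset into
                // reg_grid0 for QL = 16.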
else { 466 | if(flow_tbl2.apply().hit) { 467 | reg_grid2_action(); 468 | } 469 | else { 470 | if(flow_tbl3.apply().hit) { 471 | reg_grid3_action(); 472 | } 473 | else { 474 | if(flow_tbl4.apply().hit) { 475 | reg_grid4_action(); 476 | } 477 | else { 478 | if(flow_tbl5.apply().hit) { 479 | reg_grid5_action(); 480 | } 481 | else { 482 | if(flow_tbl6.apply().hit) { 483 | reg_grid6_action(); 484 | } 485 | else { 486 | if(flow_tbl7.apply().hit) { 487 | reg_grid7_action(); 488 | } 489 | else { 490 | if(flow_tbl8.apply().hit) { 491 | reg_grid8_action(); 492 | } 493 | } 494 | } 495 | } 496 | } 497 | } 498 | } 499 | } 500 | } 501 | 502 | } // end of the apply block 503 | 504 | } 505 | 506 | /************************************************************************* 507 | ************* C H E C K S U M C O M P U T A T I O N ************** 508 | *************************************************************************/ 509 | 510 | control MyComputeChecksum(inout headers hdr, inout metadata meta) { 511 | apply { 512 | update_checksum( 513 | hdr.ipv4.isValid(), 514 | { hdr.ipv4.version, 515 | hdr.ipv4.ihl, 516 | hdr.ipv4.diffserv, 517 | hdr.ipv4.totalLen, 518 | hdr.ipv4.identification, 519 | hdr.ipv4.flags, 520 | hdr.ipv4.fragOffset, 521 | hdr.ipv4.ttl, 522 | hdr.ipv4.protocol, 523 | hdr.ipv4.srcAddr, 524 | hdr.ipv4.dstAddr }, 525 | hdr.ipv4.hdrChecksum, 526 | HashAlgorithm.csum16); 527 | } 528 | } 529 | 530 | 531 | /************************************************************************* 532 | *********************** D E P A R S E R ******************************* 533 | *************************************************************************/ 534 | 535 | control MyDeparser(packet_out packet, in headers hdr) { 536 | 537 | //deparser that selects the order in which fields inserted into the outgoing packet. 538 | apply { 539 | packet.emit(hdr.ethernet); 540 | packet.emit(hdr.ipv4); 541 | packet.emit(hdr.tcp); 542 | packet.emit(hdr.udp); 543 | } 544 | } 545 | 546 | /************************************************************************* 547 | *********************** S W I T C H ******************************* 548 | *************************************************************************/ 549 | 550 | V1Switch( 551 | MyParser(), 552 | MyVerifyChecksum(), 553 | MyIngress(), 554 | MyEgress(), 555 | MyComputeChecksum(), 556 | MyDeparser() 557 | ) main; 558 | -------------------------------------------------------------------------------- /Security Tasks Evaluation/MPTAnalysis/FacetAnalysis/generateFeatures.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import collections 3 | import dpkt 4 | import subprocess 5 | import socket 6 | import os 7 | import sys 8 | import math 9 | import csv 10 | import numpy as np 11 | from itertools import product 12 | from scipy.stats import kurtosis, skew 13 | import time 14 | import glob 15 | 16 | 17 | DEST_IP = '172.31.0.2' 18 | SOURCE_IP = '172.31.0.19' 19 | 20 | def MergeDatasets(data_folder): 21 | if(os.path.exists(data_folder + '/full_dataset.csv')): 22 | os.remove(data_folder + '/full_dataset.csv') 23 | 24 | features_files = [data_folder + "facet_dataset.csv", data_folder + "RegularTraffic_dataset.csv"] 25 | 26 | print "Merging full dataset..." 
27 | header_saved = False 28 | with open(data_folder + '/full_dataset.csv','wb') as fout: 29 | for filename in features_files: 30 | print "merging " + filename 31 | with open(filename) as fin: 32 | header = next(fin) 33 | if not header_saved: 34 | fout.write(header) 35 | header_saved = True 36 | for line in fin: 37 | fout.write(line) 38 | print "Dataset merged!" 39 | 40 | 41 | def CombinedMerging(data_folder): 42 | if(os.path.exists(data_folder + '/regular_50_dataset.csv')): 43 | os.remove(data_folder + '/regular_50_dataset.csv') 44 | 45 | features_files = [data_folder + "FacetTraffic_50_dataset.csv", data_folder + "RegularTraffic_dataset.csv"] 46 | 47 | print "Merging dataset..." 48 | header_saved = False 49 | with open(data_folder + '/regular_50_dataset.csv','wb') as fout: 50 | for filename in features_files: 51 | print "merging " + filename 52 | with open(filename) as fin: 53 | header = next(fin) 54 | if not header_saved: 55 | fout.write(header) 56 | header_saved = True 57 | for line in fin: 58 | fout.write(line) 59 | print "Dataset merged!" 60 | 61 | 62 | def MergeSamples(data_folder): 63 | #Generate training dataset 64 | facet_files = glob.glob(data_folder + "/FacetTraffic_*.csv") 65 | 66 | header_saved = False 67 | with open(data_folder + '/facet_dataset.csv','wb') as fout: 68 | for filename in facet_files: 69 | with open(filename) as fin: 70 | header = next(fin) 71 | if not header_saved: 72 | fout.write(header) 73 | header_saved = True 74 | for line in fin: 75 | fout.write(line) 76 | 77 | 78 | def GenerateDatasets(data_folder): 79 | MergeSamples(data_folder) 80 | CombinedMerging(data_folder) 81 | #MergeDatasets(data_folder) 82 | 83 | 84 | def RoundToNearest(n, m): 85 | r = n % m 86 | return n + m - r if r + r >= m else n - r 87 | 88 | 89 | def FeatureExtractionPLBenchmark(sampleFolder, binWidth, topk): 90 | #Bucket importance in decreasing order 91 | BUCKETS_TO_MEASURE = [] 92 | 93 | #Measure interesting buckets 94 | if(topk != 1500): 95 | #Buckets in decreasing importance order 96 | f_imp = np.load('classificationResults/PL_60_' + str(binWidth) + '_1500/FeatureImportance_XGBoost_FacetTraffic_50.npy') 97 | #Print top k 98 | #for f in f_imp: 99 | # print str(f[1]) + " " + str(f[2]) 100 | 101 | if(topk > len(f_imp)): 102 | print "Skipping, not enough features to accomodate for. 
TopK = " + str(topk) + " Features = " + str(len(f_imp)) 103 | return 104 | for i in range(0,topk): 105 | b = int(f_imp[i][2].split("_")[1]) 106 | print "Top-" + str(i) + " = " + str(b) 107 | BUCKETS_TO_MEASURE.append(b) 108 | 109 | #Measure all buckets 110 | elif(topk == 1500): 111 | for i in range(0,1500,binWidth): 112 | BUCKETS_TO_MEASURE.append(i/binWidth) 113 | 114 | 115 | quantized_buckets_to_measure = sorted(BUCKETS_TO_MEASURE) 116 | print "Quantized buckets to measure = " + str(quantized_buckets_to_measure) 117 | print "Number of buckets to measure = " + str(len(quantized_buckets_to_measure)) 118 | 119 | 120 | traceInterval = 60 #Amount of time in packet trace to consider for feature extraction 121 | feature_set_folder = 'FeatureSets/PL_' + str(traceInterval) + "_" + str(binWidth) + "_" + str(topk) 122 | print feature_set_folder 123 | 124 | if not os.path.exists(feature_set_folder): 125 | os.makedirs(feature_set_folder) 126 | arff_path = feature_set_folder + '/' + os.path.basename(sampleFolder) + '_dataset.csv' 127 | arff = open(arff_path, 'wb') 128 | written_header = False 129 | 130 | 131 | for sample in os.listdir(sampleFolder): 132 | f = open(sampleFolder + "/" + sample + "/" + sample + ".pcap") 133 | pcap = dpkt.pcap.Reader(f) 134 | 135 | #Analyse packets transmited 136 | bin_dict = {} 137 | 138 | 139 | for i in quantized_buckets_to_measure: 140 | bin_dict[i] = 0 141 | 142 | 143 | firstTime = 0.0 144 | setFirst = False 145 | for ts, buf in pcap: 146 | if(not(setFirst)): 147 | firstTime = ts 148 | setFirst = True 149 | 150 | if(ts < (firstTime + traceInterval)): 151 | 152 | eth = dpkt.ethernet.Ethernet(buf) 153 | ip_hdr = eth.data 154 | try: 155 | src_ip_addr_str = socket.inet_ntoa(ip_hdr.src) 156 | #Target UDP communication between both cluster machines 157 | if (ip_hdr.p == 17 and src_ip_addr_str == SOURCE_IP): 158 | binned = RoundToNearest(len(buf),binWidth) 159 | if(binned/binWidth in quantized_buckets_to_measure): 160 | bin_dict[binned/binWidth]+=1 161 | except: 162 | pass 163 | f.close() 164 | 165 | od_dict = collections.OrderedDict(sorted(bin_dict.items(), key=lambda t: float(t[0]))) 166 | bin_list = [] 167 | for i in od_dict: 168 | bin_list.append(od_dict[i]) 169 | 170 | 171 | label = os.path.basename(sampleFolder) 172 | if('Regular' in sampleFolder): 173 | label = 'Regular' 174 | 175 | #Write sample features to the csv file 176 | f_names = [] 177 | f_values = [] 178 | 179 | for i, b in enumerate(bin_list): 180 | f_names.append('packetLengthBin_' + str(quantized_buckets_to_measure[i])) 181 | f_values.append(b) 182 | 183 | 184 | #print len(f_names) 185 | f_names.append('Class') 186 | f_values.append(label) 187 | 188 | if(not written_header): 189 | arff.write(', '.join(f_names)) 190 | arff.write('\n') 191 | print "Writing header" 192 | written_header = True 193 | 194 | l = [] 195 | for v in f_values: 196 | l.append(str(v)) 197 | arff.write(', '.join(l)) 198 | arff.write('\n') 199 | arff.close() 200 | return feature_set_folder 201 | 202 | 203 | def CompressFeatures(BIN_WIDTH, TOPK): 204 | sampleFolders = [ 205 | "TrafficCaptures/240Resolution/FacetTraffic_50", 206 | "TrafficCaptures/240Resolution/RegularTraffic", 207 | ] 208 | 209 | 210 | if not os.path.exists('FeatureSets'): 211 | os.makedirs('FeatureSets') 212 | 213 | for topk in TOPK: 214 | for binWidth in BIN_WIDTH: 215 | start = time.time() 216 | print "\n#####################################" 217 | print "Generating Dataset based on Binned Packet Length Features" 218 | for sampleFolder in sampleFolders: 219 | print 
"\n#############################" 220 | print "Parsing " + sampleFolder 221 | print "#############################" 222 | feature_set_folder = FeatureExtractionPLBenchmark(sampleFolder, binWidth, topk) 223 | if(feature_set_folder is not None): 224 | GenerateDatasets(feature_set_folder + '/') 225 | end = time.time() 226 | print "Optimize_compress_bin_%s_topk_%s_time_%s"%(binWidth, topk, end-start) 227 | 228 | 229 | def SplitDataset(DATASET_SPLIT, N_FLOWS, COVERT_FLOWS_PERC): 230 | 231 | print "Splitting datasets with DATASET_SPLIT= %s, N_FLOWS = %s, REG_FLOWS_PROP = %s"%(DATASET_SPLIT, N_FLOWS, COVERT_FLOWS_PERC) 232 | split_value = DATASET_SPLIT * N_FLOWS #samples 233 | covert_split_value = COVERT_FLOWS_PERC * split_value 234 | 235 | print "SPLIT_VALUE = %s"%(split_value) 236 | print "COVERT_SAMPLES_VALUE = %s"%(covert_split_value) 237 | 238 | for feature_folder in os.listdir("FeatureSets"): 239 | if(".DS_Store" not in feature_folder): 240 | start = time.time() 241 | print "Splitting %s"%("FeatureSets/" + feature_folder + "/RegularTraffic_dataset.csv") 242 | #Split RegularFlows 243 | RegularFile = open("FeatureSets/" + feature_folder + "/RegularTraffic_dataset.csv", 'rb') 244 | csv_reader = csv.reader(RegularFile, delimiter=',') 245 | 246 | PhaseOneRegularFile = open("FeatureSets/" + feature_folder + "/RegularTraffic_phase1_dataset.csv", 'w') 247 | PhaseTwoRegularFile = open("FeatureSets/" + feature_folder + "/RegularTraffic_phase2_dataset.csv", 'w') 248 | 249 | for n, row in enumerate(csv_reader): 250 | if(n == 0): 251 | row_string = ",".join(row) + "\n" 252 | PhaseOneRegularFile.write(row_string) 253 | PhaseTwoRegularFile.write(row_string) 254 | elif(n < split_value): 255 | row_string = ",".join(row) + "\n" 256 | PhaseOneRegularFile.write(row_string) 257 | else: 258 | row_string = ",".join(row) + "\n" 259 | PhaseTwoRegularFile.write(row_string) 260 | 261 | RegularFile.close() 262 | PhaseOneRegularFile.close() 263 | PhaseTwoRegularFile.close() 264 | 265 | 266 | #Split CovertFlows 267 | print "Splitting %s"%("FeatureSets/" + feature_folder + "/FacetTraffic_50_dataset.csv") 268 | CovertFile = open("FeatureSets/" + feature_folder + "/FacetTraffic_50_dataset.csv", "rb") 269 | csv_reader = csv.reader(CovertFile, delimiter=',') 270 | 271 | PhaseOneCovertFile = open("FeatureSets/" + feature_folder + "/FacetTraffic_50_phase1_dataset.csv", "w") 272 | PhaseTwoCovertFile = open("FeatureSets/" + feature_folder + "/FacetTraffic_50_phase2_dataset.csv", "w") 273 | 274 | for n, row in enumerate(csv_reader): 275 | if(n == 0): 276 | row_string = ",".join(row) + "\n" 277 | PhaseOneCovertFile.write(row_string) 278 | PhaseTwoCovertFile.write(row_string) 279 | elif(n < split_value): 280 | row_string = ",".join(row) + "\n" 281 | PhaseOneCovertFile.write(row_string) 282 | else: 283 | row_string = ",".join(row) + "\n" 284 | PhaseTwoCovertFile.write(row_string) 285 | 286 | CovertFile.close() 287 | PhaseOneCovertFile.close() 288 | PhaseTwoCovertFile.close() 289 | end = time.time() 290 | binWidth = feature_folder.split("_")[2] 291 | topk = feature_folder.split("_")[3] 292 | print "Optimize_split_bin_%s_topk_%s_time_%s"%(binWidth, topk, end-start) 293 | 294 | 295 | def MergeTestData(): 296 | for feature_folder in os.listdir("FeatureSets"): 297 | if(".DS_Store" not in feature_folder): 298 | print "Merging %s"%("FeatureSets/" + feature_folder + "/RegularTraffic_phase2_dataset.csv") 299 | print "Merging %s"%("FeatureSets/" + feature_folder + "/FacetTraffic_50_phase2_dataset.csv") 300 | 301 | #Merging Phase2 302 | 
PhaseTwoFile = open("FeatureSets/" + feature_folder + "/Phase2_dataset.csv", 'w') 303 | 304 | PhaseTwoRegularFile = open("FeatureSets/" + feature_folder + "/RegularTraffic_phase2_dataset.csv", 'rb') 305 | PhaseTwoCovertFile = open("FeatureSets/" + feature_folder + "/FacetTraffic_50_phase2_dataset.csv", "rb") 306 | 307 | 308 | #Write data from the regular file 309 | csv_reader = csv.reader(PhaseTwoRegularFile, delimiter=',') 310 | for n, row in enumerate(csv_reader): 311 | row_string = ",".join(row) + "\n" 312 | PhaseTwoFile.write(row_string) 313 | 314 | #Write data from the covert file 315 | csv_reader = csv.reader(PhaseTwoCovertFile, delimiter=',') 316 | for n, row in enumerate(csv_reader): 317 | if(n == 0): 318 | continue 319 | row_string = ",".join(row) + "\n" 320 | PhaseTwoFile.write(row_string) 321 | 322 | PhaseTwoFile.close() 323 | PhaseTwoRegularFile.close() 324 | PhaseTwoCovertFile.close() 325 | 326 | 327 | 328 | def FeatureExtractionPLBenchmarkBasedOnTrainData(sampleFolder, binWidth, topk): 329 | #Bucket importance in decreasing order 330 | BUCKETS_TO_MEASURE = [] 331 | 332 | #Measure interesting buckets 333 | if(topk != 1500): 334 | #Buckets in decreasing importance order 335 | f_imp = np.load('classificationResults/PL_60_' + str(binWidth) + '_1500/FeatureImportance_XGBoost_FacetTraffic_50_phase1.npy') 336 | #Print top k 337 | #for f in f_imp: 338 | # print str(f[1]) + " " + str(f[2]) 339 | 340 | if(topk > len(f_imp)): 341 | print "Skipping, not enough features to accomodate for. TopK = " + str(topk) + " Features = " + str(len(f_imp)) 342 | return 343 | for i in range(0,topk): 344 | b = int(f_imp[i][2].split("_")[1]) 345 | print "Top-" + str(i) + " = " + str(b) 346 | BUCKETS_TO_MEASURE.append(b) 347 | 348 | #Measure all buckets 349 | elif(topk == 1500): 350 | print "Measuring all buckets according to quantization" 351 | for i in range(0,1500,binWidth): 352 | BUCKETS_TO_MEASURE.append(i/binWidth) 353 | 354 | 355 | quantized_buckets_to_measure = sorted(BUCKETS_TO_MEASURE) 356 | print "Quantized buckets to measure = " + str(quantized_buckets_to_measure) 357 | print "Number of buckets to measure = " + str(len(quantized_buckets_to_measure)) 358 | 359 | 360 | traceInterval = 60 #Amount of time in packet trace to consider for feature extraction 361 | feature_set_folder = 'FeatureSets/PL_' + str(traceInterval) + "_" + str(binWidth) + "_" + str(topk) 362 | print feature_set_folder 363 | 364 | if not os.path.exists(feature_set_folder): 365 | os.makedirs(feature_set_folder) 366 | arff_path = feature_set_folder + '/' + os.path.basename(sampleFolder) + '_dataset.csv' 367 | arff = open(arff_path, 'wb') 368 | written_header = False 369 | 370 | 371 | for sample in os.listdir(sampleFolder): 372 | if(".DS_Store" in sample): 373 | continue 374 | f = open(sampleFolder + "/" + sample + "/" + sample + ".pcap") 375 | pcap = dpkt.pcap.Reader(f) 376 | 377 | #Analyse packets transmited 378 | packetSizesIn = [] 379 | packetSizesOut = [] 380 | bin_dict = {} 381 | 382 | 383 | for i in quantized_buckets_to_measure: 384 | bin_dict[i] = 0 385 | 386 | 387 | firstTime = 0.0 388 | setFirst = False 389 | for ts, buf in pcap: 390 | if(not(setFirst)): 391 | firstTime = ts 392 | setFirst = True 393 | 394 | if(ts < (firstTime + traceInterval)): 395 | 396 | eth = dpkt.ethernet.Ethernet(buf) 397 | ip_hdr = eth.data 398 | try: 399 | src_ip_addr_str = socket.inet_ntoa(ip_hdr.src) 400 | dst_ip_addr_str = socket.inet_ntoa(ip_hdr.dst) 401 | #Target UDP communication between both cluster machines 402 | if (ip_hdr.p == 17 and 
src_ip_addr_str == SOURCE_IP): 403 | binned = RoundToNearest(len(buf),binWidth) 404 | if(binned/binWidth in quantized_buckets_to_measure): 405 | bin_dict[binned/binWidth]+=1 406 | except: 407 | pass 408 | f.close() 409 | 410 | od_dict = collections.OrderedDict(sorted(bin_dict.items(), key=lambda t: float(t[0]))) 411 | bin_list = [] 412 | for i in od_dict: 413 | bin_list.append(od_dict[i]) 414 | 415 | 416 | label = os.path.basename(sampleFolder) 417 | if('Regular' in sampleFolder): 418 | label = 'Regular' 419 | 420 | #Write sample features to the csv file 421 | f_names = [] 422 | f_values = [] 423 | 424 | for i, b in enumerate(bin_list): 425 | f_names.append('packetLengthBin_' + str(quantized_buckets_to_measure[i])) 426 | f_values.append(b) 427 | 428 | 429 | #print len(f_names) 430 | f_names.append('Class') 431 | f_values.append(label) 432 | 433 | if(not written_header): 434 | arff.write(', '.join(f_names)) 435 | arff.write('\n') 436 | print "Writing header" 437 | written_header = True 438 | 439 | l = [] 440 | for v in f_values: 441 | l.append(str(v)) 442 | arff.write(', '.join(l)) 443 | arff.write('\n') 444 | arff.close() 445 | return feature_set_folder 446 | 447 | 448 | 449 | def CompressFeaturesBasedOnTrainData(BIN_WIDTH, TOPK): 450 | sampleFolders = [ 451 | "TrafficCaptures/240Resolution/FacetTraffic_50", 452 | "TrafficCaptures/240Resolution/RegularTraffic", 453 | ] 454 | 455 | 456 | if not os.path.exists('FeatureSets'): 457 | os.makedirs('FeatureSets') 458 | 459 | for topk in TOPK: 460 | for binWidth in BIN_WIDTH: 461 | start = time.time() 462 | print "\n#####################################" 463 | print "Generating Dataset based on Binned Packet Length Features" 464 | for sampleFolder in sampleFolders: 465 | print "\n#############################" 466 | print "Parsing " + sampleFolder 467 | print "#############################" 468 | feature_set_folder = FeatureExtractionPLBenchmarkBasedOnTrainData(sampleFolder, binWidth, topk) 469 | if(feature_set_folder is not None): 470 | GenerateDatasets(feature_set_folder + '/') 471 | end = time.time() 472 | print "Optimize_compress_bin_%s_topk_%s_time_%s"%(binWidth, topk, end-start) 473 | 474 | 475 | 476 | 477 | def ExtractFirstNPackets(sampleFolder, number_of_packets): 478 | 479 | traceInterval = 60 #Amount of time in packet trace to consider for feature extraction 480 | feature_set_folder = 'FeatureSets/First_%d_packets'%(number_of_packets) 481 | print feature_set_folder 482 | 483 | if not os.path.exists(feature_set_folder): 484 | os.makedirs(feature_set_folder) 485 | arff_path = feature_set_folder + '/' + os.path.basename(sampleFolder) + '_dataset.csv' 486 | arff = open(arff_path, 'wb') 487 | written_header = False 488 | 489 | 490 | for sample in os.listdir(sampleFolder): 491 | if(".DS_Store" in sample): 492 | continue 493 | f = open(sampleFolder + "/" + sample + "/" + sample + ".pcap") 494 | pcap = dpkt.pcap.Reader(f) 495 | 496 | 497 | packet_array1 = [] 498 | packet_array2 = [] 499 | firstTime = 0.0 500 | setFirst = False 501 | for ts, buf in pcap: 502 | if(len(packet_array1) >= number_of_packets and len(packet_array2) >= number_of_packets): 503 | break 504 | 505 | if(not(setFirst)): 506 | firstTime = ts 507 | setFirst = True 508 | 509 | if(ts < (firstTime + traceInterval)): 510 | 511 | eth = dpkt.ethernet.Ethernet(buf) 512 | ip_hdr = eth.data 513 | try: 514 | src_ip_addr_str = socket.inet_ntoa(ip_hdr.src) 515 | dst_ip_addr_str = socket.inet_ntoa(ip_hdr.dst) 516 | #Target UDP communication between both cluster machines 517 | if (ip_hdr.p == 
17 and src_ip_addr_str == SOURCE_IP): 518 | if(len(packet_array1) < number_of_packets): 519 | packet_array1.append(len(buf)) 520 | elif(ip_hdr.p == 17 and src_ip_addr_str != SOURCE_IP): 521 | if(len(packet_array2) < number_of_packets): 522 | packet_array2.append(len(buf)) 523 | except: 524 | pass 525 | f.close() 526 | 527 | label = os.path.basename(sampleFolder) 528 | if('Regular' in sampleFolder): 529 | label = 'Regular' 530 | 531 | if(len(packet_array1) >= number_of_packets and len(packet_array2) >= number_of_packets): 532 | #Write sample features to the csv file 533 | f_names = [] 534 | f_values = [] 535 | 536 | for i, b in enumerate(packet_array1): 537 | f_names.append('packetNumberOut_' + str(i)) 538 | f_values.append(b) 539 | 540 | for i, b in enumerate(packet_array2): 541 | f_names.append('packetNumberIn_' + str(i)) 542 | f_values.append(b) 543 | 544 | 545 | f_names.append('Class') 546 | f_values.append(label) 547 | 548 | if(not written_header): 549 | arff.write(', '.join(f_names)) 550 | arff.write('\n') 551 | print "Writing header" 552 | written_header = True 553 | 554 | l = [] 555 | for v in f_values: 556 | l.append(str(v)) 557 | arff.write(', '.join(l)) 558 | arff.write('\n') 559 | else: 560 | print "Sample %s has not enough packets"%(sampleFolder + "/" + sample + "/" + sample + ".pcap") 561 | arff.close() 562 | return feature_set_folder 563 | 564 | 565 | def ExtractPacketSample(NUMBER_OF_PACKETS): 566 | sampleFolders = [ 567 | "TrafficCaptures/240Resolution/FacetTraffic_50", 568 | "TrafficCaptures/240Resolution/RegularTraffic", 569 | ] 570 | 571 | if not os.path.exists('FeatureSets'): 572 | os.makedirs('FeatureSets') 573 | 574 | for number_of_packets in NUMBER_OF_PACKETS: 575 | print "\n#####################################" 576 | print "Extracting first %d packet sizes"%(number_of_packets) 577 | 578 | for sampleFolder in sampleFolders: 579 | print "\n#############################" 580 | print "Parsing " + sampleFolder 581 | print "#############################" 582 | feature_set_folder = ExtractFirstNPackets(sampleFolder, number_of_packets) 583 | if(feature_set_folder is not None): 584 | GenerateDatasets(feature_set_folder + '/') --------------------------------------------------------------------------------
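For reference, the following is a minimal sketch of the binned packet-length feature idea implemented in generateFeatures.py above: packet sizes are rounded to the nearest multiple of a bin width (as in RoundToNearest) and the per-bin counts become the packetLengthBin_<index> columns of the generated CSVs. It assumes an in-memory list of packet sizes rather than the dpkt/pcap parsing used in the script; the helper names and example sizes are illustrative only, not part of the repository.

def round_to_nearest(n, m):
    # Round n to the nearest multiple of m (ties round up), mirroring RoundToNearest().
    r = n % m
    return n + m - r if r + r >= m else n - r

def bin_packet_sizes(packet_sizes, bin_width):
    # Count how many packets fall into each quantized length bin.
    counts = {}
    for size in packet_sizes:
        bucket = round_to_nearest(size, bin_width) // bin_width
        counts[bucket] = counts.get(bucket, 0) + 1
    return counts

if __name__ == '__main__':
    sizes = [60, 70, 1400, 1416, 1500]   # hypothetical packet lengths in bytes
    print(bin_packet_sizes(sizes, 64))   # {1: 2, 22: 2, 23: 1}

In the script itself, when topk is not 1500 only the bins ranked most important by a previously computed XGBoost feature-importance file are counted; with topk equal to 1500 every bin up to 1500 bytes is measured.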