├── .gitignore
├── Security Tasks Evaluation
│ ├── BotnetAnalysis
│ │ ├── peershark
│ │ │ ├── __init__.py
│ │ │ ├── PcapInputFiles.txt
│ │ │ ├── TsharkOptions.txt
│ │ │ ├── P2P_CONSTANTS.py
│ │ │ ├── Packet.py
│ │ │ ├── createTrainingData.py
│ │ │ ├── generateSuperFlows.py
│ │ │ ├── FilterPackets.py
│ │ │ ├── SuperFlow.py
│ │ │ ├── GenerateFlows.py
│ │ │ ├── FilterPacketsHelper.py
│ │ │ ├── README.md
│ │ │ └── Flow.py
│ │ ├── Data
│ │ │ ├── Storm
│ │ │ │ └── placeholder.csv
│ │ │ ├── P2PTraffic
│ │ │ │ └── placeholder.csv
│ │ │ └── Waledac
│ │ │   └── placeholder.csv
│ │ ├── TrafficCaptures
│ │ │ ├── Storm
│ │ │ │ └── placeholder.pcap
│ │ │ ├── Waledac
│ │ │ │ └── placeholder.pcap
│ │ │ └── P2PTraffic
│ │ │   └── placeholder.pcap
│ │ ├── fullRun.sh
│ │ ├── README.md
│ │ ├── quantize.py
│ │ └── runExperiment.py
│ ├── WFAnalysis
│ │ ├── AllWebsiteAnalysis
│ │ │ ├── Data
│ │ │ │ └── placeholder.data
│ │ │ ├── ParsingUtilities
│ │ │ │ ├── CSVParseToWeka.py
│ │ │ │ ├── CSVParseToSimulateHerrman.py
│ │ │ │ └── CSVParseWebsiteUnbalanced.py
│ │ │ ├── generateFigures.py
│ │ │ └── runExperiment.py
│ │ ├── SingleWebsiteAnalysis
│ │ │ ├── Data
│ │ │ │ └── placeholder.data
│ │ │ ├── ParsingUtilities
│ │ │ │ ├── CSVParseToWeka.py
│ │ │ │ └── CSVParseWebsiteUnbalanced.py
│ │ │ ├── runExperiment.py
│ │ │ └── generateFigures.py
│ │ └── README.md
│ └── MPTAnalysis
│   ├── README.md
│   ├── FacetAnalysis
│   │ ├── runExperiment.py
│   │ ├── online_sketching.py
│   │ ├── generateFigures.py
│   │ ├── compressive_ta.py
│   │ └── generateFeatures.py
│   └── DeltaShaperAnalysis
│     ├── runExperiment.py
│     ├── online_sketching.py
│     ├── generateFigures.py
│     └── compressive_ta.py
├── README.md
└── Flow Marker Accumulator
  └── flowlens-v1model.p4
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 |
--------------------------------------------------------------------------------
/Security Tasks Evaluation/BotnetAnalysis/peershark/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Security Tasks Evaluation/BotnetAnalysis/Data/Storm/placeholder.csv:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Security Tasks Evaluation/BotnetAnalysis/Data/P2PTraffic/placeholder.csv:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Security Tasks Evaluation/BotnetAnalysis/Data/Waledac/placeholder.csv:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Security Tasks Evaluation/BotnetAnalysis/TrafficCaptures/Storm/placeholder.pcap:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Security Tasks Evaluation/BotnetAnalysis/TrafficCaptures/Waledac/placeholder.pcap:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Security Tasks Evaluation/BotnetAnalysis/TrafficCaptures/P2PTraffic/placeholder.pcap:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Security Tasks Evaluation/WFAnalysis/AllWebsiteAnalysis/Data/placeholder.data:
--------------------------------------------------------------------------------
1 | Place openssh.data here
--------------------------------------------------------------------------------
/Security Tasks Evaluation/WFAnalysis/SingleWebsiteAnalysis/Data/placeholder.data:
--------------------------------------------------------------------------------
1 | Place openssh.data here
--------------------------------------------------------------------------------
/Security Tasks Evaluation/BotnetAnalysis/peershark/PcapInputFiles.txt:
--------------------------------------------------------------------------------
1 | /Users/dmbb/Desktop/flowscope/BotnetAnalysis/Data/P2PTraffic
2 |
--------------------------------------------------------------------------------
/Security Tasks Evaluation/BotnetAnalysis/peershark/TsharkOptions.txt:
--------------------------------------------------------------------------------
1 | -t e
2 | -T fields
3 | -E separator=,
4 | -e ip.src -e ip.dst -e ip.proto -e frame.time_epoch -e tcp.len -e udp.length
5 | -Y "(ip.proto==6)||(ip.proto==17)"
6 |
--------------------------------------------------------------------------------
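The options above are joined onto a `tshark -r <capture>` invocation by `contructTsharkCommand` in `peershark/FilterPacketsHelper.py` (shown further below), with the output redirected to a CSV file under `PCAPDATADIR`. A minimal sketch of what gets executed for a single capture, assuming the working directory is `peershark/` and using a placeholder capture path:

```python
# Sketch: build the tshark command that FilterPackets.py runs for one capture.
# Assumes the current directory is peershark/ (so TsharkOptions.txt is found);
# the capture path below is a placeholder.
from FilterPacketsHelper import contructTsharkCommand, getTsharkOptions

command, outfilename = contructTsharkCommand(
    '../TrafficCaptures/P2PTraffic/example.pcap', getTsharkOptions())
print(command)
# -> tshark -r ../TrafficCaptures/P2PTraffic/example.pcap -t e -T fields ... >./pcapdata/example.pcap.csv
```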
/Security Tasks Evaluation/BotnetAnalysis/fullRun.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | for BIN_WIDTH in 1 16 32 64 128 256
4 | do
5 | for IPT_BIN_WIDTH in 0 1 10 60 300 900
6 | do
7 | python runExperiment.py $BIN_WIDTH $IPT_BIN_WIDTH
8 | done
9 | done
--------------------------------------------------------------------------------
/Security Tasks Evaluation/BotnetAnalysis/peershark/P2P_CONSTANTS.py:
--------------------------------------------------------------------------------
1 | PCAPDATADIR = './pcapdata/'
2 | PCAPFILES = 'PcapInputFiles.txt'
3 | TSHARKOPTIONSFILE = 'TsharkOptions.txt'
4 | TCP_PROTO = '6'
5 | UDP_PROTO = '17'
6 | UDP_HEADERLENGTH = 8
7 |
8 | #utility functions
9 | import os
10 | def getCSVFiles(dirname):
11 | csvfiles = []
12 | for eachfile in os.listdir(dirname):
13 | if eachfile.endswith('.csv'):
14 | csvfiles.append(dirname + eachfile)
15 | return csvfiles
--------------------------------------------------------------------------------
/Security Tasks Evaluation/WFAnalysis/README.md:
--------------------------------------------------------------------------------
1 | ## Dependencies and Data
2 |
3 |
4 | ### Website Fingerprinting
5 |
6 | - Download the OpenSSH dataset (parsed and obtained from the original Herrmann MySQL database), available [here](https://turbina.gsd.inesc-id.pt/resources/openSSH_herrman/openssh_data.tar.gz)
7 | - Place it inside `WFAnalysis/AllWebsiteAnalysis/Data` and `WFAnalysis/SingleWebsiteAnalysis/Data`.
8 |
9 |
10 |
11 | ### Running the code
12 |
13 | - Execute the `runExperiment.py` script in each of the two website fingerprinting settings (`AllWebsiteAnalysis` and `SingleWebsiteAnalysis`)
--------------------------------------------------------------------------------
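A minimal sketch of driving both settings from the `WFAnalysis/` folder once `openssh.data` is in place (folder and script names are taken from the tree above; the scripts themselves target Python 2):

```python
# Sketch: run both website fingerprinting settings in sequence.
# Assumes openssh.data has already been placed in each Data/ folder.
import subprocess

for setting in ['AllWebsiteAnalysis', 'SingleWebsiteAnalysis']:
    print('Running the %s experiment...' % setting)
    subprocess.call(['python', 'runExperiment.py'], cwd=setting)
```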
/Security Tasks Evaluation/MPTAnalysis/README.md:
--------------------------------------------------------------------------------
1 | ## Dependencies and Data
2 |
3 |
4 | ### Multimedia Protocol Tunneling
5 |
6 | - Download the traffic captures of covert channel tools available [here](https://turbina.gsd.inesc-id.pt/resources/mpt_detection/)
7 | - Place them in `MPTAnalysis/DeltaShaperAnalysis/TrafficCaptures` and `MPTAnalysis/FacetAnalysis/TrafficCaptures`, respectively.
8 |
9 |
10 | ### Running the code
11 |
12 | - Execute the `runExperiment.py` script in the folder of each covert channel tool (`FacetAnalysis` and `DeltaShaperAnalysis`). Then execute `generateFigures.py` to generate figures from the obtained results.
--------------------------------------------------------------------------------
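A minimal sketch of the same steps, driven from the `MPTAnalysis/` folder (folder and script names are taken from the tree above; assumes the traffic captures have already been placed):

```python
# Sketch: run the experiment and then the figure generation for each tool.
import subprocess

for tool in ['FacetAnalysis', 'DeltaShaperAnalysis']:
    subprocess.call(['python', 'runExperiment.py'], cwd=tool)
    subprocess.call(['python', 'generateFigures.py'], cwd=tool)
```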
/Security Tasks Evaluation/BotnetAnalysis/peershark/Packet.py:
--------------------------------------------------------------------------------
1 | import socket
2 | #defines properties of a packet
3 | class Packet:
4 | def __init__(self,fields):
5 | if fields == None:
6 | self.source = None
7 | self.dest = None
8 | self.timestamp = None
9 | self.size = 0
10 | self.key = None
11 | else:
12 | self.source = socket.inet_aton(fields[0])
13 | self.dest = socket.inet_aton(fields[1])
14 | self.timestamp = float(fields[2])
15 | self.size = int(fields[3])
16 | if self.source < self.dest:
17 | self.key = self.source + self.dest
18 | else:
19 | self.key = self.dest + self.source
20 |
21 |
--------------------------------------------------------------------------------
/Security Tasks Evaluation/BotnetAnalysis/README.md:
--------------------------------------------------------------------------------
1 | ## Dependencies and Data
2 |
3 |
4 | ### Botnets
5 |
6 | - Download the P2P and botnet datasets gathered for PeerRush, available [here](http://peerrush.cs.uga.edu/peerrush/)
7 | - Place them inside `BotnetAnalysis/Data`
8 | - Botnet detection code by Pratik Narang is available [here](https://github.com/pratiknarang/peershark)
9 |
10 | ### Parse Original Captures Used in PeerShark
11 |
12 | - For each dataset (Waledac, Storm, P2P)
13 | - Run `peershark/FilterPackets.py`
14 | - Retrieve the parsed output of each `.pcap` file from the `pcapdata` folder
15 |
16 | *Note: the Storm data samples must first be renamed so that they carry a `.pcap` extension:*
17 | `for f in *; do mv "$f" "$f.pcap"; done`
18 |
19 | ### Run the FlowLens botnet detection experiment
20 |
21 | Run `fullRun.sh`, which applies different combinations of quantization parameters to the packet length (PL) and inter-packet timing (IPT) features of P2P packet flows.
--------------------------------------------------------------------------------
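The parameter sweep performed by `fullRun.sh` (shown earlier) is reproduced here as a Python sketch, to make explicit which values are passed to `runExperiment.py`; the values come from the script, and `runExperiment.py` is assumed to take the PL bin width and the IPT bin width as its two command-line arguments:

```python
# Sketch: Python equivalent of fullRun.sh.
import subprocess

for bin_width in [1, 16, 32, 64, 128, 256]:        # packet length (PL) bin widths
    for ipt_bin_width in [0, 1, 10, 60, 300, 900]: # inter-packet timing (IPT) bin widths (0 = no IPT quantization)
        subprocess.call(['python', 'runExperiment.py',
                         str(bin_width), str(ipt_bin_width)])
```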
/Security Tasks Evaluation/BotnetAnalysis/peershark/createTrainingData.py:
--------------------------------------------------------------------------------
1 | from P2P_CONSTANTS import *
2 |
3 |
4 | def runTrainingDataGenerator(super_flow_data_dir, training_data_dir, bin_width, ipt_bin_width):
5 | #takes 50,000 examples and puts them in the necessary format for training
6 | csvfiles = []
7 | if os.path.isdir(super_flow_data_dir):
8 | csvfiles += getCSVFiles(super_flow_data_dir)
9 |
10 | #print ".csv files to generate training data: %s"%(csvfiles)
11 |
12 | outfile = open(training_data_dir + 'trainingdata_' + str(bin_width) + "_" + str(ipt_bin_width) + '.csv','w')
13 | for filename in csvfiles:
14 | label = filename.split('/')[-2]
15 | inputfile = open(filename)
16 | line = inputfile.readline().strip()
17 | while line!='':
18 | fields = line.split(',')
19 | if float(fields[4])!=0 and float(fields[3])!=0 and float(fields[7])!=0:
20 | outfile.write(
21 | fields[2] + ',' +
22 | fields[3] + ',' +
23 | fields[4] + ',' +
24 | fields[7] + ',' +
25 | label + '\n')
26 | line = inputfile.readline().strip()
27 | inputfile.close()
28 | outfile.close()
--------------------------------------------------------------------------------
/Security Tasks Evaluation/BotnetAnalysis/peershark/generateSuperFlows.py:
--------------------------------------------------------------------------------
1 | from P2P_CONSTANTS import *
2 | import socket
3 | import Flow
4 | import SuperFlow
5 | import sys
6 |
7 |
8 | def runGenerateSuperFlows(flow_data_dir, super_flow_data_dir, flowgap):
9 | #TIMEGAP IN SECONDS
10 | csvfiles = getCSVFiles(flow_data_dir)
11 | #print csvfiles
12 |
13 | flowdata = []
14 | for filename in csvfiles:
15 | inputfile = open(filename)
16 | data = [line.strip() for line in inputfile]
17 | inputfile.close()
18 |
19 | for eachline in data:
20 | fields = eachline.split(',')
21 | flowdata.append(SuperFlow.SuperFlow(fields))
22 | print '\tNo. of flows to be processed: ' + str(len(flowdata))
23 |
24 |
25 | flowdata = Flow.combineFlows(flowdata, flowgap)
26 | print '\tSuperflows (Flows with flowgap = ' + str(flowgap) + ' sec) : ' + str(len(flowdata))
27 |
28 | outfile = open(super_flow_data_dir + str(flowgap) + '.csv', 'w')
29 |
30 | to_write = []
31 | for flow in flowdata:
32 | to_write.append(
33 | socket.inet_ntoa(flow.ip1) + ',' +
34 | socket.inet_ntoa(flow.ip2) + ',' +
35 | str(flow.getNoOfPackets()) + ',' +
36 | str(flow.getNoOfBytes()) + ',' +
37 | '%.6f'%flow.getInterArrivaltime() + ',' +
38 | '%.6f'%flow.getStart() + ',' +
39 | '%.6f'%flow.getEnd() + ',' +
40 | '%.6f'%flow.getDurationInSeconds())
41 | outfile.write("\n".join(to_write))
42 | outfile.close()
--------------------------------------------------------------------------------
/Security Tasks Evaluation/BotnetAnalysis/peershark/FilterPackets.py:
--------------------------------------------------------------------------------
1 | ## Module to obtain packet data from a pcap/dump file
2 | ## and save it in csv format using tshark.
3 | ## Filenames of input pcap files are taken from InputFiles.txt
4 | ## Tshark options are present in TsharkOptions.txt
5 | ## TsharkOptions.txt should not contain the -r option.
6 |
7 | ## usage: python FilterPackets.py
8 |
9 | #import global constants
10 | from P2P_CONSTANTS import *
11 | from FilterPacketsHelper import *
12 | import multiprocessing as MP
13 | import subprocess
14 |
15 | #execute a shell command as a child process
16 | def executeCommand(command,outfilename):
17 | sem.acquire()
18 |
19 | subprocess.call(command, shell = True)
20 |
21 | infile = open(outfilename, 'r')
22 | data = [eachline.strip() for eachline in infile]
23 | infile.close()
24 |
25 | data = preprocess(data)
26 |
27 | outfile = open(outfilename,'w')
28 | for eachcomponent in data:
29 | outfile.write(eachcomponent)
30 | outfile.close()
31 |
32 | print 'done processing : ' + outfilename
33 | sem.release()
34 |
35 | #obtain input parameters and pcapfilenames
36 | inputfiles = getPCapFileNames()
37 | print "Input Files: " + str(inputfiles)
38 | tsharkOptions = getTsharkOptions()
39 |
40 | #create a semaphore so as not to exceed threadlimit
41 | sem = MP.Semaphore(THREADLIMIT)
42 |
43 | #get tshark commands to be executed
44 | for filename in inputfiles:
45 | print filename
46 | (command,outfilename) = contructTsharkCommand(filename,tsharkOptions)
47 | task = MP.Process(target = executeCommand, args = (command, outfilename,))
48 | task.start()
--------------------------------------------------------------------------------
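`FilterPackets.py` bounds the number of concurrent tshark invocations with `MP.Semaphore(THREADLIMIT)` and expects `THREADLIMIT` to arrive through the star import of `P2P_CONSTANTS`. The copy of `P2P_CONSTANTS.py` above does not define it, so a definition along the following lines is needed for the module to run (the value is an assumption; pick it according to the available cores):

```python
# Assumed addition to peershark/P2P_CONSTANTS.py (value chosen arbitrarily):
# maximum number of tshark/preprocessing processes running at once.
THREADLIMIT = 4
```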
/Security Tasks Evaluation/WFAnalysis/AllWebsiteAnalysis/ParsingUtilities/CSVParseToWeka.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import csv
4 |
5 | def main(argv):
6 | OutputFile = open(argv[1], 'w')
7 | InputFile = open(argv[0])
8 |
9 | bin_dict = {}
10 |
11 | OutputFile.write("@relation\'WF\'\n\n")
12 | OutputFile.write("@attribute Text string\n")
13 | OutputFile.write("@attribute class {")
14 |
15 | csv_reader = csv.reader(InputFile, delimiter=',')
16 |
17 | csv_header = ""
18 | website_list = set()
19 | text = []
20 |
21 | for n, row in enumerate(csv_reader):
22 | if(n == 0):
23 | #Init bin dict
24 | csv_header = row
25 | prefix = "packetLengthBin_"
26 | for i in range(len(csv_header)-1):
27 | parsedBucketSize = csv_header[i][(len(prefix) + 1):]
28 | bin_dict[i] = parsedBucketSize
29 | continue
30 |
31 | currWebsite = row[-1]
32 | website_list.add(currWebsite)
33 | bin_list = row[:-1]
34 |
35 | text.append("\'")
36 | if("Online" in argv[1]): #Fix for online Sketching (Coskun et al.)
37 | for i in range(len(bin_list)):
38 | text.append(str(bin_list[i]) + " ")
39 | else: #For the others
40 | for i in range(len(bin_list)):
41 | for _ in range(int(bin_list[i])):
42 | text.append(str(bin_dict[i]) + " ")
43 |
44 | text.append("\'," + currWebsite + "\n")
45 |
46 | #Write classes on header
47 | OutputFile.write(",".join(website_list))
48 | OutputFile.write("}\n\n")
49 | #Write data
50 | OutputFile.write("@data\n\n")
51 | OutputFile.write("".join(text))
52 |
53 |
54 | OutputFile.close()
55 |
56 |
57 | if __name__ == "__main__":
58 | main(sys.argv[1:])
--------------------------------------------------------------------------------
/Security Tasks Evaluation/WFAnalysis/SingleWebsiteAnalysis/ParsingUtilities/CSVParseToWeka.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import csv
4 |
5 | def RoundToNearest(n, m):
6 | if (m == 1):
7 | return n
8 | if (n > 0):
9 | r = n % m
10 | return n + m - r if r + r >= m else n - r
11 | else:
12 | if (n < 0):
13 | return RoundToNearest(abs(n), m) * -1
14 | return 0
15 |
16 | def main(argv):
17 | OutputFile = open(argv[1], 'w')
18 | InputFile = open(argv[0], 'rb')
19 | website = argv[2]
20 |
21 | bin_dict = {}
22 |
23 | OutputFile.write("@relation\'WF\'\n\n")
24 | OutputFile.write("@attribute Text string\n")
25 | OutputFile.write("@attribute class {Nope,%s}\n\n"%(website))
26 | OutputFile.write("@data\n\n")
27 |
28 |
29 | csv_reader = csv.reader(InputFile, delimiter=',')
30 |
31 | csv_header = ""
32 | text = []
33 |
34 | for n, row in enumerate(csv_reader):
35 | if(n == 0):
36 | #Init bin dict
37 | csv_header = row
38 | prefix = "packetLengthBin_"
39 | for i in range(len(csv_header)-1):
40 | parsedBucketSize = csv_header[i][(len(prefix) + 1):]
41 | bin_dict[i] = parsedBucketSize
42 | continue
43 |
44 | currWebsite = row[-1]
45 | bin_list = row[:-1]
46 |
47 | text.append("\'")
48 | for i in range(len(bin_list)):
49 | for _ in range(int(bin_list[i])):
50 | text.append(str(bin_dict[i]) + " ")
51 |
52 | if (website not in currWebsite):
53 | text.append("\'," + "Nope" + "\n")
54 | else:
55 | text.append("\'," + website + "\n")
56 |
57 |
58 | OutputFile.write("".join(text))
59 | OutputFile.close()
60 |
61 | if __name__ == "__main__":
62 | main(sys.argv[1:])
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # FlowLens
2 |
3 | This repository holds the code for the paper "FlowLens: Enabling Efficient Flow Classification for ML-based Network Security Applications".
4 | If you end up using our code for your experiments, please cite our work as follows:
5 |
6 | ```
7 | @inproceedings{flowlens,
8 | title={FlowLens: Enabling Efficient Flow Classification for ML-based Network Security Applications},
9 | author={Barradas, Diogo and Santos, Nuno and Rodrigues, Lu{\'i}s and Signorello, Salvatore and Ramos, Fernando M. V. and Madeira, Andr{\'e}},
10 | booktitle={Proceedings of the 28th Network and Distributed System Security Symposium},
11 | year={2021},
12 | address={San Diego, CA, USA},
13 | }
14 | ```
15 |
16 | ## Dependencies and Data
17 |
18 |
19 | ### General Dependencies
20 |
21 | - Install WEKA
22 | - Run `pip install -r requirements.txt`
23 |
24 | ### Datasets
25 |
26 | - Please check the `README.md` in each specific security task folder
27 |
28 |
29 | ## How may I use your code?
30 |
31 | - The `Security Tasks Evaluation` folder includes the code we used for evaluating different ML-based security tasks when using FlowLens. The code applies different combinations of our quantization and truncation approaches and allows for checking the trade-offs between accuracy and memory footprint offered by FlowLens's flow markers.
32 |
33 | - The `Flow Marker Accumulator` folder includes an adaptation of the P4_16 code we used for implementing FlowLens's flow marker accumulator on a Barefoot Tofino switch. Due to NDA concerns, we make public only this adapted version of our code, which can be run on the BMv2 behavioral model.
34 |
35 |
36 | *Todo: Provide a full end-to-end dummy example of FlowLens running on BMv2, e.g., on the P4 tutorial VM.*
--------------------------------------------------------------------------------
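The experiment scripts drive WEKA through the `python-weka-wrapper` package. A minimal sketch of the pattern they follow, condensed from `WFAnalysis/AllWebsiteAnalysis/runExperiment.py` further below (the ARFF file names are placeholders):

```python
# Sketch: the WEKA-via-Python pattern used by the runExperiment.py scripts.
import weka.core.jvm as jvm
import weka.core.converters as converters
from weka.classifiers import Classifier, Evaluation

jvm.start(max_heap_size="4096m")

train_data = converters.load_any_file("TrainSet.arff")  # placeholder file names
test_data = converters.load_any_file("TestSet.arff")
train_data.class_is_last()                              # class label is the last attribute
test_data.class_is_last()

classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesMultinomial")
classifier.build_classifier(train_data)

evaluation = Evaluation(test_data)
evaluation.test_model(classifier, test_data)
print(evaluation.summary())
print(evaluation.percent_correct / 100.0)  # accuracy in [0, 1]

jvm.stop()
```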
/Security Tasks Evaluation/BotnetAnalysis/peershark/SuperFlow.py:
--------------------------------------------------------------------------------
1 | from Packet import *
2 | import socket
3 | import Flow
4 |
5 | #get median of interarrival time
6 | def getMedian(vallist):
7 | vallist.sort(key = lambda val:val[0])
8 | tot = 0
9 | cfreq = []
10 | for val in vallist:
11 | tot += val[1]
12 | cfreq.append(tot)
13 | medianindex = tot / 2
14 | i = 0
15 | while medianindex > cfreq[i]:
16 | i += 1
17 | return vallist[i][0]
18 |
19 | #defines a superflow
20 | class SuperFlow(Flow.Flow):
21 |
22 | def __init__(self, fields):
23 | if fields == None:
24 | self.ip1 = None
25 | self.ip2 = None
26 | self.key = None
27 | self.n_packet1 = 0
28 | self.n_byte1 = 0
29 | self.t_start1 = 0
30 | self.t_end1 = 0
31 | self.t_interarrival1 = []
32 | self.n_packet2 = 0
33 | self.n_byte2 = 0
34 | self.t_start2 = 0
35 | self.t_end2 = 0
36 | self.t_interarrival2 = []
37 | else:
38 | self.ip1 = socket.inet_aton(fields[0])
39 | self.ip2 = socket.inet_aton(fields[1])
40 | self.key = self.ip1 + self.ip2
41 | self.n_packet1 = int(fields[2])
42 | self.n_byte1 = int(fields[3])
43 | self.t_start1 = float(fields[4])
44 | self.t_end1 = float(fields[5])
45 | self.t_interarrival1 = [(float(fields[6]),self.n_packet1)]
46 | self.n_packet2 = int(fields[7])
47 | self.n_byte2 = int(fields[8])
48 | self.t_start2 = float(fields[9])
49 | self.t_end2 = float(fields[10])
50 | self.t_interarrival2 = [(float(fields[11]),self.n_packet2)]
51 |
52 | #get median of interarrival time irrespective of direction
53 | def getInterArrivaltime(self):
54 | combined = self.t_interarrival1 + self.t_interarrival2
55 | if len(combined) > 0:
56 | return getMedian(combined)
57 | return 0
58 |
59 | #interarrival time for direction1(arbitrary)
60 | def getInterArrivaltime1(self):
61 | if len(self.t_interarrival1) > 0:
62 | return getMedian(self.t_interarrival1)
63 | return 0
64 |
65 | #interarrival time for direction2(arbitrary)
66 | def getInterArrivaltime2(self):
67 | if len(self.t_interarrival2) > 0:
68 | return getMedian(self.t_interarrival2)
69 | return 0
70 |
--------------------------------------------------------------------------------
/Security Tasks Evaluation/BotnetAnalysis/peershark/GenerateFlows.py:
--------------------------------------------------------------------------------
1 | from P2P_CONSTANTS import *
2 | from Packet import *
3 | from Flow import *
4 | import multiprocessing as MP
5 | import socket
6 | import gc
7 | import time
8 |
9 | ## module to read all the files in the data folder of the
10 | ## project, build flow data and store it in a file
11 |
12 |
13 | def generateFlow(filename, flow_data_dir, timegap, sem):
14 | sem.acquire()
15 |
16 | inputfile = open(filename)
17 | data = [line.strip() for line in inputfile]
18 | inputfile.close()
19 |
20 | packetlist = []
21 | for eachline in data:
22 | fields = eachline.split(',')
23 | fields.pop(2)
24 | packetlist.append(Packet(fields))
25 |
26 | outflowlist = packetsToFlows(packetlist, timegap)
27 | #print 'flows in ' + filename + ' : ' + str(len(outflowlist))
28 |
29 | outfilename = flow_data_dir + (filename.split('/')[-1])
30 | writeFlowsToFile(outflowlist, outfilename)
31 |
32 | #print 'done writing to : ' + outfilename
33 | #start_collect = time.time()
34 | #collected = gc.collect()
35 | #end_collect = time.time()
36 | #print "Time wasted on GC - GenerateFlows: %ss, collected %s objects"%(end_collect-start_collect, collected)
37 | sem.release()
38 |
39 | def runGenerateFlows(quantized_pcap_data_dir, flow_data_dir, n_processes, timegap):
40 | #create a semaphore so as not to exceed n_processes process limit
41 | sem = MP.Semaphore(n_processes)
42 |
43 | csvfiles = getCSVFiles(quantized_pcap_data_dir)
44 |
45 | tasklist = []
46 | #generate flowdata from each input packet file(not pcap) in parallel and store it in a file
47 | #so we get as many output files as number of input files
48 | for filename in csvfiles:
49 | task = MP.Process(target = generateFlow, args = (filename, flow_data_dir, timegap, sem))
50 | tasklist.append(task)
51 |
52 | print "Tasklist size = %s"%(len(tasklist))
53 |
54 | # #execute commands in parallel
55 | for i in range(0, len(tasklist), n_processes):
56 | for k,task in enumerate(tasklist[i:i+n_processes]):
57 | tasklist[i+k].start()
58 | for k, task in enumerate(tasklist[i:i+n_processes]):
59 | tasklist[i+k].join()
60 | #print "Joined task number %s"%(i+k)
--------------------------------------------------------------------------------
/Security Tasks Evaluation/WFAnalysis/AllWebsiteAnalysis/generateFigures.py:
--------------------------------------------------------------------------------
1 | import os
2 | from decimal import Decimal
3 | import numpy as np
4 | import csv
5 |
6 | import matplotlib
7 | if os.environ.get('DISPLAY','') == '':
8 | print('no display found. Using non-interactive Agg backend')
9 | matplotlib.use('Agg')
10 | import matplotlib.pyplot as plt
11 |
12 |
13 | colors = ["0.8", "0.6", "0.2", "0.0"]
14 | colors = ["salmon", "lightsteelblue", "darkseagreen", "thistle", "wheat", "khaki", "skyblue"]
15 |
16 | """
17 | Attach a text label above each bar displaying its height
18 | """
19 | def autolabel(rects, ax):
20 | for rect in rects:
21 | height = rect.get_height()
22 | ax.text(rect.get_x() + rect.get_width()/2., 1.005*height, # original height was 1.005*height
23 | "{0:.2f}".format(float(height)), fontsize=7, ha='center', va='bottom')
24 |
25 |
26 | def PlotNormalAccuracy():
27 | print "Plotting accuracy for no-sketch"
28 | #Gather results for full distribution
29 | profile_data_full = open("classificationResults/AllVsAll.csv", 'rb')
30 | csv_reader_full = csv.reader(profile_data_full, delimiter=',')
31 |
32 | binWidth_full = []
33 | acc_full = []
34 |
35 | for n, row in enumerate(csv_reader_full):
36 | if(n == 0):
37 | continue
38 | binWidth_full.append(row[0])
39 | acc_full.append(round(Decimal(float(row[1])), 4))
40 |
41 |
42 | #Generate plot
43 | fig = plt.figure()
44 | ax1 = fig.add_subplot(111)
45 |
46 | print "Current feature set: "+ str(binWidth_full)
47 | print "ACC-Full: " + str(acc_full)
48 |
49 | ind = np.arange(len(binWidth_full)) # the x locations for the groups
50 | width = 0.4
51 |
52 | rects1 = ax1.bar(ind - width/2, acc_full, width, color=colors[0], label='Accuracy')
53 | autolabel(rects1,ax1)
54 |
55 | ax1.yaxis.grid(color='black', linestyle='dotted')
56 | ax1.set_title('Quantization effect on accuracy - WF Multiclass', fontsize = 10)
57 |
58 | ax1.set_xticks(ind)
59 | labels = ["K = " + x + "\nBins = " + str(3000/int(x)) for n, x in enumerate(binWidth_full)]
60 | ax1.set_xticklabels(labels)
61 | ax1.legend()
62 |
63 | ax1.set_ylabel('Accuracy')
64 | ax1.set_xlabel('Quantization')
65 |
66 | plt.xticks(fontsize=7)
67 | plt.tight_layout()
68 | plt.ylim(0, 1)
69 | fig.savefig('Figures/AllVsAll.pdf') # save the figure to file
70 | fig.savefig('Figures/AllVsAll.png') # save the figure to file
71 | plt.close(fig)
72 | profile_data_full.close()
73 |
74 |
75 |
76 | def GenerateFigures():
77 | if not os.path.exists("Figures"):
78 | os.makedirs("Figures")
79 |
80 | PlotNormalAccuracy()
--------------------------------------------------------------------------------
/Security Tasks Evaluation/BotnetAnalysis/peershark/FilterPacketsHelper.py:
--------------------------------------------------------------------------------
1 | from P2P_CONSTANTS import *
2 | import os
3 | #return a list of filenames of pcapfiles taken from InputFiles.txt
4 | #if a directory is found then all *.pcap files in the directory are
5 | #included(non-recursive)
6 |
7 | def getPCapFileNames():
8 | pcapInputFile = open(PCAPFILES)
9 | lines = [eachline.strip() for eachline in pcapInputFile]
10 | pcapInputFile.close()
11 |
12 | pcapfilenames = []
13 | for eachline in lines:
14 | if eachline.endswith('.pcap'):
15 | if os.path.exists(eachline):
16 | pcapfilenames.append(eachline)
17 | else:
18 | print eachline + ' does not exist'
19 | exit()
20 | else:
21 | if os.path.isdir(eachline):
22 | for eachfile in os.listdir(eachline):
23 | if eachfile.endswith('.pcap'):
24 | pcapfilenames.append(eachline.rstrip('/') + '/' + eachfile)
25 | else:
26 | print eachline + ' is not a directory'
27 | exit()
28 | return pcapfilenames
29 |
30 | #return a list of options to be used with tshark
31 | def getTsharkOptions():
32 | optionsFile = open(TSHARKOPTIONSFILE)
33 | options = [line.strip() for line in optionsFile]
34 | optionsFile.close()
35 | return options
36 |
37 | #return a tuple (x,y) where
38 | #x = complete tshark command
39 | #y = output csv filename
40 | def contructTsharkCommand(filename,tsharkOptions):
41 | command = 'tshark -r ' + filename + ' '
42 | for eachstring in tsharkOptions:
43 | command = command + eachstring + ' '
44 |
45 | #construct output filename
46 | outfilename = filename.split('/')
47 | outfilename = PCAPDATADIR + outfilename[len(outfilename)-1] + '.csv'
48 |
49 | command += '>'+outfilename
50 | return (command,outfilename)
51 |
52 | #remove missing tcp and udp payload lengths and subtract
53 | #8 bytes from udp payload to account for udp header
54 | #returns a list of strings to be printed
55 | def preprocess(data):
56 | outputdata = []
57 | for eachline in data:
58 | fields = eachline.split(',')
59 |
60 | #sanity check for 6 fields. Has to be changed if tshark options are changed
61 | if len(fields) != 6:
62 | continue
63 |
64 | tcppayload = fields[4].strip()
65 | udppayload = fields[5].strip()
66 |
67 | #subtract udp header length
68 | if udppayload != '':
69 | fields[5] = str(int(udppayload) - UDP_HEADERLENGTH)
70 | if fields[5] == '0':
71 | continue
72 | #ignore packet if both tcp and udp payload lengths are null
73 | elif tcppayload == '' or tcppayload == '0':
74 | continue
75 |
76 | #add all valid fields to output list
77 | for eachfield in fields:
78 | if eachfield.strip() != '':
79 | outputdata.append(eachfield)
80 | outputdata.append(',')
81 | outputdata.pop()
82 | outputdata.append('\n')
83 | return outputdata
--------------------------------------------------------------------------------
/Security Tasks Evaluation/BotnetAnalysis/peershark/README.md:
--------------------------------------------------------------------------------
1 | PeerShark
2 | ============================
3 | Peer-to-Peer botnet detection by tracking conversations
4 |
5 | ### Contributors
6 | * Pratik Narang
7 | * Subhajit Ray
8 | * Chittaranjan Hota
9 |
10 | ### Research papers
11 | * Narang, P., Ray, S., Hota, C., & Venkatakrishnan, V. (2014, May). Peershark: detecting peer-to-peer botnets by tracking conversations. In Security and Privacy Workshops (SPW), 2014 IEEE (pp. 108-115). IEEE.
12 | * Narang, P., Hota, C., & Venkatakrishnan, V. N. (2014). PeerShark: flow-clustering and conversation-generation for malicious peer-to-peer traffic identification. EURASIP Journal on Information Security, 2014(1), 1-12.
13 |
14 | PeerShark requires Python 2.7 and tshark to be installed, and has been tested only on Linux.
15 |
16 | Modules to be used in the following order:
17 |
18 | 1. FilterPackets.py : Takes an input directory or input files from PCAPFILES.
19 | The module runs tshark on each file in the input directory and extracts the
20 | fields mentioned in TsharkOptions.txt, such as src-ip, dst-ip,
21 | protocol, and payload length. One new file is created per pcap file
22 | which contains only the fields we want for future analysis. The
23 | new files are stored in PCAPDATADIR.
24 |
25 | usage : python FilterPackets.py
26 |
27 | 2. GenerateFlows.py : Take each file from PCAPDATADIR -> generate
28 | flow information -> store processed data for each file in
29 | FLOWDATADIR.
30 |
31 | usage : python GenerateFlows.py
32 |
33 | 3. generateSuperFlows.py : Take each file from FLOWDATADIR -> merge
34 | flows into superflows based on input parameters -> store in
35 | SUPERFLOWDATADIR.
36 |
37 | usage: python generateSuperFlows.py start(in hrs) increment(in hrs) end(in hrs)
38 |
39 | Number of files generated = (end - start)/increment
40 |
41 | One file is generated for each value of timegap ranging from start to end.
42 |
43 | #### OPTIONAL:
44 |
45 |
46 | 4. createTrainingData.py: use this module to create a labelled training data set.
47 | It reads the *folders* (not files) residing in SUPERFLOWDATADIR, and creates *one*
48 | labelled file (WEKA style, minus the header) per folder, with the required attributes only
49 | (no. of packets, no. of bytes, median IPT, duration, label) and the folder name appended as the last column.
50 |
51 | After generating a labelled 'training dataset', supervised machine learning algorithms
52 | can be used to generate models for P2P botnet detection.
53 |
54 |
55 | #### Flow structure
56 |
57 | `IP1, IP2, #Packets1, #Bytes1, tFlowStart1, tFlowEnd1, MedianIPT1, #Packets2, #Bytes2, tFlowStart2, tFlowEnd2, MedianIPT2,`
58 |
59 | **Example**
60 | `4.79.17.248,192.168.58.137,3,126,1234920043.252418,1234920049.917001,4.326552,450,18900,1234920045.127448,1234920069.383826,0.000068`
61 |
62 |
63 | #### Superflow structure
64 |
65 | `IP1, IP2, #Packets, #Bytes, MedianIPT, tFlowStart, tFlowEnd, tDuration`
66 |
67 | **Example**
68 | `4.68.25.2, 192.168.58.150, 2, 86, 0.000000, 1234978436.632683, 1234978436.632683, 0.000000`
69 |
70 | #### Training data structure
71 |
72 | `#Packets, #Bytes, MedianIPT, tDuration, label`
73 |
74 | **Example**
75 | `16,672,0.000051,2.108578,Waledac`
76 |
--------------------------------------------------------------------------------
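A small sketch that parses one superflow record into the fields documented above (the record is the example from this README; the dictionary keys are just illustrative names):

```python
# Sketch: parse one superflow line (IP1, IP2, #Packets, #Bytes, MedianIPT,
# tFlowStart, tFlowEnd, tDuration) into named fields.
line = "4.68.25.2, 192.168.58.150, 2, 86, 0.000000, 1234978436.632683, 1234978436.632683, 0.000000"
fields = [f.strip() for f in line.split(',')]
superflow = {
    'ip1': fields[0],
    'ip2': fields[1],
    'n_packets': int(fields[2]),
    'n_bytes': int(fields[3]),
    'median_ipt': float(fields[4]),
    't_start': float(fields[5]),
    't_end': float(fields[6]),
    'duration': float(fields[7]),
}
print(superflow)
```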
/Security Tasks Evaluation/BotnetAnalysis/quantize.py:
--------------------------------------------------------------------------------
1 | import os
2 | import csv
3 | import multiprocessing as MP
4 | import socket
5 | import gc
6 | import time
7 |
8 | def RoundToNearest(n, m):
9 | r = n % m
10 | return n + m - r if r + r >= m else n - r
11 |
12 | STORM_IPS = [
13 | "66.154.80.101",
14 | "66.154.80.105",
15 | "66.154.80.111",
16 | "66.154.80.125",
17 | "66.154.83.107",
18 | "66.154.83.113",
19 | "66.154.83.138",
20 | "66.154.83.80",
21 | "66.154.87.39",
22 | "66.154.87.41",
23 | "66.154.87.57",
24 | "66.154.87.58",
25 | "66.154.87.61"
26 | ]
27 |
28 | WALEDAC_IPS = [
29 | "192.168.58.136",
30 | "192.168.58.137",
31 | "192.168.58.150"
32 | ]
33 |
34 |
35 | def runQuantization(dataset, traffic_capture, binWidth, ipt_bin_width, sem):
36 | sem.acquire()
37 |
38 | cap_file = open(dataset + "/" + traffic_capture, 'rb')
39 | csv_reader = csv.reader(cap_file, delimiter=',')
40 |
41 | quantized_csv = open('FeatureSets/' + os.path.basename(dataset) + "/" + traffic_capture[:-4] + "_" + str(binWidth) + "_" + str(ipt_bin_width) + ".csv", "w")
42 |
43 | malicious_ips = []
44 | if(os.path.basename(dataset) == "Storm"):
45 | malicious_ips = STORM_IPS
46 | elif(os.path.basename(dataset) == "Waledac"):
47 | malicious_ips = WALEDAC_IPS
48 |
49 | #print "Malicious IPs = %s"%(malicious_ips)
50 | #print os.path.basename(dataset)
51 |
52 | to_write = []
53 | #Write modified packets
54 | for row in csv_reader:
55 | #Filter out non-malicious flows from Storm and Waledac datasets
56 | if(("Storm" in os.path.basename(dataset) or "Waledac" in os.path.basename(dataset)) and (row[0] not in malicious_ips and row[1] not in malicious_ips)):
57 | #print "Row not in malicious: %s - %s"%(row[0], row[1])
58 | continue
59 | else:
60 | new_row = row
61 |
62 | #Quantize packet size
63 | new_row[4] = str(RoundToNearest(int(new_row[4]), binWidth))
64 |
65 | #Quantize Timestamp
66 | if(ipt_bin_width > 0):
67 | new_row[3] = str(RoundToNearest(int(float(new_row[3])), ipt_bin_width))
68 | to_write.append(",".join(new_row))
69 |
70 | quantized_csv.write("\n".join(to_write))
71 |
72 | cap_file.close()
73 | quantized_csv.close()
74 |
75 | #start_collect = time.time()
76 | #collected = gc.collect()
77 | #end_collect = time.time()
78 | #print "Time wasted on GC - Quantize: %ss, collected %s objects"%(end_collect-start_collect, collected)
79 | sem.release()
80 |
81 |
82 | def QuantizeDataset(dataset, binWidth, ipt_bin_width, n_processes):
83 | sem = MP.Semaphore(n_processes)
84 | traffic_captures = os.listdir(dataset)
85 |
86 | tasklist = []
87 |
88 | for traffic_capture in traffic_captures:
89 | task = MP.Process(target = runQuantization, args = (dataset, traffic_capture, binWidth, ipt_bin_width, sem))
90 | tasklist.append(task)
91 |
92 | print "Tasklist size = %s"%(len(tasklist))
93 |
94 | # #execute commands in parallel
95 | for i in range(0, len(tasklist), n_processes):
96 | for k,task in enumerate(tasklist[i:i+n_processes]):
97 | tasklist[i+k].start()
98 | for k, task in enumerate(tasklist[i:i+n_processes]):
99 | tasklist[i+k].join()
100 | #print "Joined task number %s"%(i+k)
101 |
--------------------------------------------------------------------------------
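For reference, `RoundToNearest` above maps a value to the nearest multiple of the bin width, with exact halfway points rounding up. A short worked example (assumes it is run from the `BotnetAnalysis/` folder so that `quantize.py` is importable):

```python
# Worked example of the quantization rounding used by quantize.py.
from quantize import RoundToNearest

print(RoundToNearest(1337, 64))  # -> 1344 (1337 is 57 above 1280 and 7 below 1344)
print(RoundToNearest(1312, 64))  # -> 1344 (halfway between 1280 and 1344 rounds up)
print(RoundToNearest(1337, 1))   # -> 1337 (a bin width of 1 leaves values unchanged)
```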
/Security Tasks Evaluation/WFAnalysis/AllWebsiteAnalysis/ParsingUtilities/CSVParseToSimulateHerrman.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from datetime import datetime, timedelta
4 | from collections import defaultdict, OrderedDict
5 |
6 | def RoundToNearest(n, m):
7 | if (m == 1):
8 | return n
9 | if (n > 0):
10 | r = n % m
11 | return n + m - r if r + r >= m else n - r
12 | else:
13 | if (n < 0):
14 | return RoundToNearest(abs(n), m) * -1
15 | return 0
16 |
17 | def main(argv):
18 | name = str(argv[0])
19 | BASE_DIR = os.path.dirname(name)
20 | file = open(name,'r')
21 | binWidth = int(argv[1])
22 |
23 | trainSet = open(BASE_DIR+"/TrainSet_" + str(binWidth) + ".csv", 'w')
24 | testSet = open(BASE_DIR+"/TestSet_" + str(binWidth) + ".csv", 'w')
25 |
26 | minBucket = RoundToNearest(-1500, binWidth)
27 | maxBucket = RoundToNearest(1500, binWidth) + 1
28 | for size in range(minBucket, maxBucket, binWidth):
29 | trainSet.write("packetLengthBin_" + str(size) + ", ")
30 | testSet.write("packetLengthBin_" + str(size) + ", ")
31 | trainSet.write("class\n")
32 | testSet.write("class\n")
33 |
34 | i = 0
35 |
36 | TFlineToWrite = []
37 | CNlineToWrite = []
38 |
39 |
40 | lineToWrite = OrderedDict()
41 |
42 | l = file.readline()
43 | l = file.readline()
44 | l = file.readline()
45 | l.rstrip('\n')
46 |
47 | lineNumber = 0
48 | while l:
49 | lineSplit = l.split(" ")
50 | if (lineNumber % 2 == 0):
51 | timestamp = lineSplit[2]
52 | else:
53 | website = lineSplit[0][:-1]
54 | lineToWrite[website+"|"+timestamp] = {}
55 | lineToWrite[website+"|"+timestamp] = defaultdict(lambda:0, lineToWrite[website+"|"+timestamp])
56 | t = lineToWrite[website+"|"+timestamp]
57 | for x in lineSplit[1:]:
58 | try:
59 | t[str(RoundToNearest(int(x), binWidth))] += 1
60 | except:
61 | continue
62 | lineToWrite[website+"|"+timestamp] = t
63 | lineNumber += 1
64 | l = file.readline()
65 | l.rstrip('\n')
66 |
67 | max = 4
68 | max2 = max + 4
69 | counter = 0
70 | currentWebSite = ""
71 | for j in lineToWrite:
72 | if (currentWebSite != j.split("|")[0]):
73 | counter = 0
74 |
75 | currentWebSite = j.split("|")[0]
76 |
77 | if (counter < max):
78 | for s in range(minBucket, maxBucket, binWidth):
79 | trainSet.write(str(lineToWrite[j][str(s)]) + ", ")
80 | trainSet.write(currentWebSite + "\n")
81 | if (counter == 0):
82 | firstTimeStamp = datetime.strptime(j.split("|")[1], "%Y-%m-%d#%H:%M:%S")
83 | secondTimeStamp = firstTimeStamp + timedelta(days=8)
84 | counter += 1
85 | else:
86 | if (datetime.strptime(j.split("|")[1], "%Y-%m-%d#%H:%M:%S") < secondTimeStamp):
87 | lineToWrite[j] = {}
88 | continue
89 | if (counter < max2):
90 | for s in range(minBucket, maxBucket, binWidth):
91 | testSet.write(str(lineToWrite[j][str(s)]) + ", ")
92 | testSet.write(currentWebSite + "\n")
93 | counter += 1
94 |
95 | lineToWrite[j] = {}
96 |
97 | if __name__ == "__main__":
98 | main(sys.argv[1:])
99 |
100 |
--------------------------------------------------------------------------------
/Security Tasks Evaluation/WFAnalysis/AllWebsiteAnalysis/ParsingUtilities/CSVParseWebsiteUnbalanced.py:
--------------------------------------------------------------------------------
1 | import os
2 | import csv
3 | import sys
4 | from datetime import datetime, timedelta
5 | import collections
6 | import math
7 | from collections import defaultdict, OrderedDict
8 | import numpy as np
9 |
10 | def RoundToNearest(n, m):
11 | if (m == 1):
12 | return n
13 | if (n > 0):
14 | r = n % m
15 | return n + m - r if r + r >= m else n - r
16 | else:
17 | if (n < 0):
18 | return RoundToNearest(abs(n), m) * -1
19 | return 0
20 |
21 | def extractDistributionWithoutTruncation(argv):
22 | BASE_DIR = os.path.dirname(argv[0])
23 | file = open(argv[0],'r')
24 |
25 | binWidth = int(argv[1])
26 | websiteToClassify = argv[2]
27 |
28 | if not os.path.exists(BASE_DIR + "/" + websiteToClassify):
29 | os.makedirs(BASE_DIR + "/" + websiteToClassify)
30 |
31 | trainSet = open(BASE_DIR + "/" + websiteToClassify + "/TrainSet_" + str(binWidth) + ".csv", 'w')
32 | testSet = open(BASE_DIR + "/" + websiteToClassify + "/TestSet_" + str(binWidth) + ".csv", 'w')
33 |
34 |
35 | #Set for all possible quantized buckets
36 | binsUsedByWebsite = set()
37 | minBucket = RoundToNearest(-1500, binWidth)
38 | maxBucket = RoundToNearest(1500, binWidth) + 1
39 | for size in range(minBucket, maxBucket, binWidth):
40 | binsUsedByWebsite.add(RoundToNearest(size, binWidth))
41 |
42 |
43 | websiteTrainInstances = int(argv[3])
44 | websiteTestInstances = int(argv[4])
45 |
46 | ################################################
47 | #Build csv with quantized bins
48 | ################################################
49 |
50 | # Write CSV datasets header (with bins used by the target website)
51 | for size in range(minBucket, maxBucket, binWidth):
52 | if (size in binsUsedByWebsite):
53 | trainSet.write("packetLengthBin_" + str(size) + ", ")
54 | testSet.write("packetLengthBin_" + str(size) + ", ")
55 | trainSet.write("class\n")
56 | testSet.write("class\n")
57 |
58 |
59 | file = open(argv[0],'r')
60 | l = file.readline() #Take out dataset header
61 | l = file.readline() #Take out dataset header
62 | trainCounter = 0
63 | testCounter = 0
64 | currWebsite = ""
65 | trainData = []
66 | testData =[]
67 |
68 | for lineNumber, l in enumerate(file.readlines()):
69 | lineSplit = l.rstrip('\n').split(" ")
70 | if (lineNumber % 2 == 1): #Gather website data
71 | website = lineSplit[0][:-1]
72 | if(website != currWebsite):
73 | currWebsite = website
74 | trainCounter = 0
75 | testCounter = 0
76 |
77 | #Build container for sample distribution
78 | website_bin_distribution = OrderedDict()
79 | for i in sorted(binsUsedByWebsite):
80 | website_bin_distribution[i] = 0
81 |
82 | #Add useful bins to the sample distribution
83 | for packet_size in lineSplit[1:-1]:
84 | packet_size_binned = RoundToNearest(int(packet_size), binWidth)
85 | if(packet_size_binned in binsUsedByWebsite):
86 | website_bin_distribution[packet_size_binned] += 1
87 |
88 |
89 | if(trainCounter < websiteTrainInstances):
90 | bin_list = []
91 | for i in website_bin_distribution:
92 | bin_list.append(str(website_bin_distribution[i]))
93 | trainData.append(",".join(bin_list) + ", " + currWebsite + "\n")
94 | trainCounter += 1
95 | elif(testCounter < websiteTestInstances):
96 | bin_list = []
97 | for i in website_bin_distribution:
98 | bin_list.append(str(website_bin_distribution[i]))
99 | testData.append(",".join(bin_list) + ", " + currWebsite + "\n")
100 | #Account for processed sample
101 | testCounter += 1
102 |
103 | trainSet.write("".join(trainData))
104 | testSet.write("".join(testData))
105 | trainSet.close()
106 | testSet.close()
107 |
108 | if __name__ == "__main__":
109 | extractDistributionWithoutTruncation(sys.argv[1:])
110 |
--------------------------------------------------------------------------------
/Security Tasks Evaluation/WFAnalysis/AllWebsiteAnalysis/runExperiment.py:
--------------------------------------------------------------------------------
1 | import os
2 | import math
3 | import subprocess as sub
4 | import shutil
5 | import time
6 | import weka.core.jvm as jvm
7 | import weka.core.converters as converters
8 | from weka.core.converters import Loader
9 | from weka.classifiers import Classifier
10 | from weka.classifiers import Evaluation
11 |
12 |
13 | dataset_location = "Data/openssh.data"
14 |
15 | #export JAVA_HOME=/Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home/
16 | def ClassifyParam(mode, binWidths):
17 | if not os.path.exists("classificationResults"):
18 | os.makedirs("classificationResults")
19 |
20 | if("normal" in mode):
21 | file = open("classificationResults/AllVsAll.csv","w")
22 |
23 | file.write("BinWidth, Accuracy\n")
24 |
25 | for binWidth in binWidths:
26 |
27 | train_set = "Data/arff/TrainSet_%s.arff"%(binWidth)
28 | test_set = "Data/arff/TestSet_%s.arff"%(binWidth)
29 | print "Loading Datasets..."
30 |
31 | train_data = converters.load_any_file(train_set)
32 | test_data = converters.load_any_file(test_set)
33 | #Set class attribute
34 | train_data.class_is_last()
35 | test_data.class_is_last()
36 | print "Dataset Loaded!"
37 |
38 |
39 | classifier_name = "weka.classifiers.meta.FilteredClassifier"
40 |
41 | classifier = Classifier(classname=classifier_name, options=[
42 | "-F", "weka.filters.unsupervised.attribute.StringToWordVector -R first-last -W 1000 -C -T -N 1 -stemmer weka.core.stemmers.NullStemmer -M 1 -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r\\\\n\\\\t.,;:\\\\\\\'\\\\\\\"()?!\\\"\"",
43 | "-W", "weka.classifiers.bayes.NaiveBayesMultinomial"])
44 |
45 |
46 | start_train = time.time()
47 | classifier.build_classifier(train_data)
48 | end_train = time.time()
49 | print "Train\t%s\t%s"%(binWidth, end_train-start_train)
50 |
51 | for index, inst in enumerate(test_data):
52 | if(index == 0):
53 | start_sample = time.time()
54 | classifier.classify_instance(inst)
55 | end_sample = time.time()
56 | print "Sample\t%s\t%s"%(binWidth, end_sample-start_sample)
57 |
58 | print "Evaluating w/ Multinomial Naive Bayes classifier. BinWidth = %s"%(binWidth)
59 | evaluation = Evaluation(test_data)
60 | start_batch = time.time()
61 | evaluation.test_model(classifier, test_data)
62 | end_batch = time.time()
63 | print "Batch\t%s\t%s"%(binWidth,end_batch-start_batch)
64 |
65 |
66 | print evaluation.summary()
67 | acc = evaluation.percent_correct/100.0
68 | print "Percent correct: " + str(acc)
69 |
70 | file.write("%s, %s\n"%(binWidth, acc))
71 | file.close()
72 |
73 |
74 |
75 | def QuantizeAndCreateTrainTestDataset(binWidths):
76 | #2/3 train, 1/3 test (150 total, 100 -50)
77 | # Currently 50-50
78 | target_train_instances = 75
79 | target_test_instances = 75
80 |
81 | #Placeholder website for parsing script to work (compatibility issues)
82 | website = "www.flickr.com"
83 |
84 | for binWidth in binWidths:
85 | simArgs = "python ParsingUtilities/CSVParseWebsiteUnbalanced.py %s %s %s %s %s"%(dataset_location, binWidth, website, target_train_instances, target_test_instances)
86 | print "Quantizing dataset. binWidth = %s"%(binWidth)
87 | sub.call(simArgs, shell = True)
88 |
89 | print "Moving files to Data directory root"
90 | src_folder = "Data/www.flickr.com/"
91 | files = os.listdir(src_folder)
92 | for f in files:
93 | shutil.move(src_folder+f, "Data/")
94 | os.rmdir(src_folder)
95 |
96 |
97 | def BuildQuantizedArffDatasets(mode, binWidths):
98 | if not os.path.exists("Data/arff"):
99 | os.makedirs("Data/arff")
100 |
101 | if("normal" in mode):
102 | train_set = "TrainSet"
103 | test_set = "TestSet"
104 |
105 | for binWidth in binWidths:
106 | simArgs = "python ParsingUtilities/CSVParseToWeka.py Data/%s_%s.csv Data/arff/%s_%s.arff"%(train_set, binWidth, train_set, binWidth)
107 | print "Generating train dataset. binWidth = %s"%(binWidth)
108 | sub.call(simArgs, shell = True)
109 |
110 | simArgs = "python ParsingUtilities/CSVParseToWeka.py Data/%s_%s.csv Data/arff/%s_%s.arff"%(test_set, binWidth, test_set, binWidth)
111 | print "Generating test dataset. binWidth = %s"%(binWidth)
112 | sub.call(simArgs, shell = True)
113 |
114 |
115 |
116 |
117 | if __name__ == "__main__":
118 |
119 | #Quantization
120 | BIN_WIDTH = [1, 4, 8, 16, 32, 64, 128, 256]
121 |
122 | QuantizeAndCreateTrainTestDataset(BIN_WIDTH)
123 |
124 |
125 | BuildQuantizedArffDatasets("normal", BIN_WIDTH)
126 |
127 | #Classify
128 | #Start WEKA execution
129 | jvm.start(max_heap_size="4096m")
130 |
131 | #Classify
132 | ClassifyParam("normal", BIN_WIDTH)
133 |
134 | #stop weka execution
135 | jvm.stop()
136 |
137 |
--------------------------------------------------------------------------------
/Security Tasks Evaluation/MPTAnalysis/FacetAnalysis/runExperiment.py:
--------------------------------------------------------------------------------
1 | import os
2 | import math
3 | import subprocess as sub
4 |
5 | from generateFeatures import CompressFeatures, SplitDataset, CompressFeaturesBasedOnTrainData, MergeTestData, ExtractPacketSample
6 | from classifier import GenerateFeatureImportanceBasedOnTrainData, ClassifyTestDataBasedOnModel, BuildModelBasedOnTrainData
7 | from generateFigures import GenerateFigures
8 | from online_sketching import CreateBinaryVectorRepresentation
9 | from compressive_ta import CreateCompressiveRepresentation
10 |
11 |
12 | def Experiment(BIN_WIDTH, TOPK, DELTAS, MEMORY_FACTORS, DATASET_SPLIT, COVERT_FLOWS_PERC, N_FLOWS, ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO):
13 | """
14 | Phase 1a)
15 | Use full information and generate the best buckets.
16 | Datasets are split into half.
17 |
18 | We use the first half to train/test a classifier with a balanced dataset in HoldOut 90/10
19 | """
20 | CompressFeatures(BIN_WIDTH, [TOPK[-1]])
21 | SplitDataset(DATASET_SPLIT, N_FLOWS, 1)
22 | GenerateFeatureImportanceBasedOnTrainData("normal", BIN_WIDTH, [TOPK[-1]])
23 |
24 | """
25 | Phase 1b)
26 | Quantize, truncate and classify according to the best buckets found
27 | The first half of each dataset is again used for train/test the classifier with a balanced dataset in HoldOut 90/10
28 | However, only the top-K bins are used for performing classification
29 |
30 | The built model is saved to use in Phase 2.
31 | """
32 | CompressFeaturesBasedOnTrainData(BIN_WIDTH, TOPK[:-1])
33 | SplitDataset(DATASET_SPLIT, N_FLOWS, COVERT_FLOWS_PERC)
34 | BuildModelBasedOnTrainData("normal", BIN_WIDTH, TOPK)
35 |
36 |
37 | """
38 | Phase 2
39 | Classify new flows using quantized/truncated distributions using the previously built model
40 | The second half of each dataset is used for train/test the classifier with an unbalanced dataset
41 | """
42 | #Quantization + Truncation without sketches
43 | ClassifyTestDataBasedOnModel("normal", BIN_WIDTH, TOPK, N_FLOWS)
44 |
45 | #Generate figures
46 | GenerateFigures(BIN_WIDTH, TOPK, DELTAS, MEMORY_FACTORS, N_FLOWS)
47 |
48 | """
49 | Online Sketching - Coskun et al.
50 | """
51 | CreateBinaryVectorRepresentation(BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE)
52 | BuildModelBasedOnTrainData("online", BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE)
53 | ClassifyTestDataBasedOnModel("online", BIN_WIDTH, [TOPK[-1]], N_FLOWS, ONLINE_SKETCH_SIZE)
54 |
55 | """
56 | Compressive TA adjusted to packet distribution
57 | """
58 | CreateCompressiveRepresentation("compressive_gaussian", BIN_WIDTH, [TOPK[-1]], SIGMA_PARAM, COMPRESSIVE_RATIO)
59 | BuildModelBasedOnTrainData("compressive_gaussian", BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO)
60 | ClassifyTestDataBasedOnModel("compressive_gaussian", BIN_WIDTH, [TOPK[-1]], N_FLOWS, ONLINE_SKETCH_SIZE, DELTAS, MEMORY_FACTORS, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO)
61 |
62 | CreateCompressiveRepresentation("compressive_bernoulli", BIN_WIDTH, [TOPK[-1]], SIGMA_PARAM, COMPRESSIVE_RATIO)
63 | BuildModelBasedOnTrainData("compressive_bernoulli", BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO)
64 | ClassifyTestDataBasedOnModel("compressive_bernoulli", BIN_WIDTH, [TOPK[-1]], N_FLOWS, ONLINE_SKETCH_SIZE, DELTAS, MEMORY_FACTORS, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO)
65 |
66 |
67 | if __name__ == "__main__":
68 |
69 | #Quantization
70 | BIN_WIDTH = [1, 4, 8, 16, 32, 64, 128, 256]
71 |
72 | #Truncation Top-K features
73 | TOPK = [5, 10, 20, 30, 40, 50, 1500]
74 |
75 | #Online Sketch
76 | ONLINE_SKETCH_SIZE = [64, 128, 256, 512, 1024, 2048]
77 |
78 | #Proportion of regular flows to input in sketch
79 | COVERT_FLOWS_PERC = 1
80 |
81 | #Proportion to split training phase (1) and testing phase (2)
82 | DATASET_SPLIT = 0.5
83 |
84 | #Total amount of flows per dataset
85 | N_FLOWS = 1000
86 |
87 | #Standard deviation of Gaussian distribution (compressive TA)
88 | SIGMA_PARAM = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
89 |
90 | #Number of packets to compute compressive TA representation
91 | NUMBER_OF_PACKETS = [1000, 2000, 4000]
92 |
93 | #Compression Ratio for Compressive TA
94 | COMPRESSIVE_RATIO = [4, 8, 16, 32, 64, 128, 256]
95 |
96 | #Deprecated
97 | DELTAS = [0.95]
98 | MEMORY_FACTORS = [8, 4, 2, 1]
99 |
100 | #Run Experiment:
101 | Experiment(BIN_WIDTH, TOPK, DELTAS, MEMORY_FACTORS, DATASET_SPLIT, COVERT_FLOWS_PERC, N_FLOWS, ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO)
102 |
103 |
104 |
105 |
106 |
--------------------------------------------------------------------------------
/Security Tasks Evaluation/MPTAnalysis/DeltaShaperAnalysis/runExperiment.py:
--------------------------------------------------------------------------------
1 | import os
2 | import math
3 | import subprocess as sub
4 |
5 | from generateFeatures import CompressFeatures, SplitDataset, CompressFeaturesBasedOnTrainData, MergeTestData, ExtractPacketSample
6 | from classifier import GenerateFeatureImportanceBasedOnTrainData, ClassifyTestDataBasedOnModel, BuildModelBasedOnTrainData
7 | from generateFigures import GenerateFigures, GenerateFiguresLines
8 | from online_sketching import CreateBinaryVectorRepresentation
9 | from compressive_ta import CreateCompressiveRepresentation
10 |
11 | def Experiment(BIN_WIDTH, TOPK, DELTAS, MEMORY_FACTORS, DATASET_SPLIT, COVERT_FLOWS_PERC, N_FLOWS, ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO):
12 | """
13 | Phase 1a)
14 | Use full information and generate the best buckets.
15 | Datasets are split into half.
16 |
17 | We use the first half to train/test a classifier with a balanced dataset in HoldOut 90/10
18 | """
19 | CompressFeatures(BIN_WIDTH, [TOPK[-1]])
20 | SplitDataset(DATASET_SPLIT, N_FLOWS, 1)
21 | GenerateFeatureImportanceBasedOnTrainData("normal", BIN_WIDTH, [TOPK[-1]])
22 |
23 | """
24 | Phase 1b)
25 | Quantize, truncate and classify according to the best buckets found
26 | The first half of each dataset is again used for train/test the classifier with a balanced dataset in HoldOut 90/10
27 | However, only the top-K bins are used for performing classification
28 |
29 | The built model is saved to use in Phase 2.
30 | """
31 | CompressFeaturesBasedOnTrainData(BIN_WIDTH, TOPK[:-1])
32 | SplitDataset(DATASET_SPLIT, N_FLOWS, COVERT_FLOWS_PERC)
33 | BuildModelBasedOnTrainData("normal", BIN_WIDTH, TOPK)
34 |
35 | """
36 | Phase 2
37 | Classify new flows using quantized/truncated distributions using the previously built model
38 | The second half of each dataset is used for train/test the classifier with an unbalanced dataset
39 | """
40 | #Quantization + Truncation
41 | ClassifyTestDataBasedOnModel("normal", BIN_WIDTH, TOPK, N_FLOWS, ONLINE_SKETCH_SIZE)
42 |
43 | #Generate figures
44 | GenerateFiguresLines(BIN_WIDTH, TOPK, DELTAS, MEMORY_FACTORS, N_FLOWS)
45 |
46 |
47 | """
48 | Online Sketching - Coskun et al.
49 | """
50 | CreateBinaryVectorRepresentation(BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE)
51 | BuildModelBasedOnTrainData("online", BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE)
52 | ClassifyTestDataBasedOnModel("online", BIN_WIDTH, [TOPK[-1]], N_FLOWS, ONLINE_SKETCH_SIZE)
53 |
54 |
55 | """
56 | Compressive TA adjusted to packet distribution
57 | """
58 | CreateCompressiveRepresentation("compressive_gaussian", BIN_WIDTH, [TOPK[-1]], SIGMA_PARAM, COMPRESSIVE_RATIO)
59 | BuildModelBasedOnTrainData("compressive_gaussian", BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO)
60 | ClassifyTestDataBasedOnModel("compressive_gaussian", BIN_WIDTH, [TOPK[-1]], N_FLOWS, ONLINE_SKETCH_SIZE, DELTAS, MEMORY_FACTORS, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO)
61 |
62 | CreateCompressiveRepresentation("compressive_bernoulli", BIN_WIDTH, [TOPK[-1]], SIGMA_PARAM, COMPRESSIVE_RATIO)
63 | BuildModelBasedOnTrainData("compressive_bernoulli", BIN_WIDTH, [TOPK[-1]], ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO)
64 | ClassifyTestDataBasedOnModel("compressive_bernoulli", BIN_WIDTH, [TOPK[-1]], N_FLOWS, ONLINE_SKETCH_SIZE, DELTAS, MEMORY_FACTORS, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO)
65 |
66 |
67 | if __name__ == "__main__":
68 |
69 | #Quantization
70 | BIN_WIDTH = [1, 4, 8, 16, 32, 64, 128, 256]
71 |
72 | #Truncation Top-K features
73 | TOPK = [5, 10, 20, 30, 40, 50, 1500]
74 |
75 | #Online Sketch
76 | ONLINE_SKETCH_SIZE = [64, 128, 256, 512, 1024, 2048]
77 |
78 | #Proportion of regular flows to input in sketch
79 | COVERT_FLOWS_PERC = 1
80 |
81 | #Proportion to split training phase (1) and testing phase (2)
82 | DATASET_SPLIT = 0.5
83 |
84 | #Total amount of flows per dataset
85 | N_FLOWS = 300
86 |
87 | #Standard deviation of Gaussian distribution (compressive TA)
88 | SIGMA_PARAM = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
89 |
90 | #Number of packets to compute compressive TA representation
91 | NUMBER_OF_PACKETS = [1000, 2000, 4000]
92 |
93 | #Compression Ratio for Compressive TA
94 | COMPRESSIVE_RATIO = [4, 8, 16, 32, 64, 128, 256]
95 |
96 | #Deprecated
97 | DELTAS = [0.95]
98 | MEMORY_FACTORS = [8, 4, 2, 1]
99 |
100 | #Run Experiment:
101 | Experiment(BIN_WIDTH, TOPK, DELTAS, MEMORY_FACTORS, DATASET_SPLIT, COVERT_FLOWS_PERC, N_FLOWS, ONLINE_SKETCH_SIZE, SIGMA_PARAM, NUMBER_OF_PACKETS, COMPRESSIVE_RATIO)
102 |
103 |
104 |
105 |
106 |
107 |
108 |
--------------------------------------------------------------------------------
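
The quantization factor and the truncation level above determine how much per-flow state the flow marker accumulator needs: roughly 1500/K bins of 4-byte counters per flow, which is the arithmetic the figure scripts use for their memory labels. Below is a minimal sketch of that arithmetic under the same assumptions (1500-byte maximum packet length, 4-byte counters); the helper name is illustrative and not part of the repository.

def flow_memory_kb(bin_width, topk, n_flows, counter_bytes=4):
    # Number of counters kept per flow: all 1500/bin_width bins, or only the top-K after truncation.
    bins = 1500 // bin_width if topk == 1500 else topk
    per_flow_bytes = bins * counter_bytes
    return n_flows * per_flow_bytes / 1024.0

# With the defaults above (N_FLOWS = 300): bin width 32, no truncation ->
# 46 bins, 184 B per flow, ~53.9 KB of counter memory in total.
print(flow_memory_kb(32, 1500, 300))
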
/Security Tasks Evaluation/WFAnalysis/SingleWebsiteAnalysis/runExperiment.py:
--------------------------------------------------------------------------------
1 | import os
2 | import math
3 | import subprocess as sub
4 | import time
5 | import weka.core.jvm as jvm
6 | import weka.core.converters as converters
7 | from weka.core.converters import Loader
8 | from weka.classifiers import Classifier
9 | from weka.classifiers import Evaluation
10 |
11 | from generateFigures import GenerateFigures
12 |
13 | dataset_location = "Data/openssh.data"
14 |
15 | #export JAVA_HOME=/Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home/
16 |
17 | def ClassifyParam(website, mode, binWidths, truncation_modes=["full", "truncated"]):
18 | if not os.path.exists("classificationResults"):
19 | os.makedirs("classificationResults")
20 |
21 |
22 | if("normal" in mode):
23 | for truncation in truncation_modes:
24 | file = open("classificationResults/SingleWebsite_%s_%s.csv"%(truncation, website),"w")
25 | file.write("BinWidth, Accuracy, FalsePositiveRate, FalseNegativeRate\n")
26 |
27 | for binWidth in binWidths:
28 |
29 | train_set_file = "TrainSet_%s_%s.arff"%(truncation, binWidth)
30 | train_set = "Data/%s/arff/%s"%(website, train_set_file)
31 | test_set = "Data/%s/arff/%s"%(website, train_set_file.replace("TrainSet", "TestSet"))
32 |
33 | print "Loading Datasets..."
34 | print "Train: " + train_set
35 | train_data = converters.load_any_file(train_set)
36 | print "Test: " + test_set
37 | test_data = converters.load_any_file(test_set)
38 |
39 | #Set class attribute
40 | train_data.class_is_last()
41 | test_data.class_is_last()
42 | print "Dataset Loaded!"
43 |
44 |
45 | classifier_name = "weka.classifiers.meta.FilteredClassifier"
46 |
47 | classifier = Classifier(classname=classifier_name, options=[
48 | "-F", "weka.filters.unsupervised.attribute.StringToWordVector -R first-last -W 1000 -C -T -N 1 -stemmer weka.core.stemmers.NullStemmer -M 1 -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r\\\\n\\\\t.,;:\\\\\\\'\\\\\\\"()?!\\\"\"",
49 | "-W", "weka.classifiers.bayes.NaiveBayesMultinomial"])
50 |
51 | start_train = time.time()
52 | classifier.build_classifier(train_data)
53 | end_train = time.time()
54 | print "Train\t%s\t%s"%(binWidth, end_train-start_train)
55 |
56 | for index, inst in enumerate(test_data):
57 | if(index == 0):
58 | start_sample = time.time()
59 | classifier.classify_instance(inst)
60 | end_sample = time.time()
61 | print "Sample\t%s\t%s"%(binWidth, end_sample-start_sample)
62 |
63 | print "Evaluating w/ Multinomial Naive Bayes classifier. BinWidth = %s"%(binWidth)
64 | evaluation = Evaluation(test_data)
65 | start_batch = time.time()
66 | evaluation.test_model(classifier, test_data)
67 | end_batch = time.time()
68 | print "Batch\t%s\t%s"%(binWidth,end_batch-start_batch)
69 |
70 |
71 | print evaluation.summary()
72 | print evaluation.matrix()
73 | #Just as an example, we're measuring the fpr and fnr of the website indexed as class 1
74 |
75 | tp = evaluation.num_true_positives(1)
76 | tn = evaluation.num_true_negatives(1)
77 | fp = evaluation.num_false_positives(1)
78 | fn = evaluation.num_false_negatives(1)
79 |
80 | acc = (tp+tn)/float(tp+tn+fp+fn)
81 | fpr = evaluation.false_positive_rate(1)
82 | fnr = evaluation.false_negative_rate(1)
83 |
84 | print "Accuracy: %s"%(acc)
85 | print "False Positive Rate: %s"%(fpr)
86 | print "False Negative Rate: %s"%(fnr)
87 |
88 | file.write("%s, %s, %s, %s\n"%(binWidth, acc, fpr, fnr))
89 | file.close()
90 |
91 |
92 |
93 | def QuantizeAndCreateUnbalancedTrainTestDataset(truncate, website, binWidths):
94 | #75 train / 75 test instances per website (even split)
95 | target_train_instances = 75
96 | target_test_instances = 75
97 |
98 | if(truncate):
99 | truncation = 0
100 |
101 | #Init bookeeping of truncated bins
102 | if not os.path.exists("truncationInfo"):
103 | os.makedirs("truncationInfo")
104 | file = open("truncationInfo/" + website + ".csv", "w")
105 | file.write("BinWidth, TruncatedBins\n")
106 | file.close()
107 | else:
108 | truncation = 1
109 |
110 | for binWidth in binWidths:
111 | simArgs = "python ParsingUtilities/CSVParseWebsiteUnbalanced.py %s %s %s %s %s %s"%(dataset_location, binWidth, website, target_train_instances, target_test_instances, truncation)
112 | print "Quantizing dataset. binWidth = %s"%(binWidth) + ", truncation = " + str(truncate) + ", website = " + website
113 | sub.call(simArgs, shell = True)
114 |
115 |
116 |
117 | def BuildQuantizedArffDatasets(website, mode):
118 | if not os.path.exists("Data/%s/arff"%(website)):
119 | os.makedirs("Data/%s/arff"%(website))
120 |
121 | if("normal" in mode):
122 | for f in os.listdir("Data/%s"%(website)):
123 | if(".csv" in f and not f.startswith("CountMin")):
124 |
125 | simArgs = "python ParsingUtilities/CSVParseToWeka.py Data/%s/%s Data/%s/arff/%s %s"%(website, f, website, f[:-3] + "arff", website)
126 | print "Generating dataset. File = " + f[:-3] + "arff"
127 | sub.call(simArgs, shell = True)
128 |
129 |
130 | if __name__ == "__main__":
131 | modes = ["normal", "sketch"]
132 |
133 | TRUNCATION_MODES = [True, False]
134 |
135 | #Quantization
136 | BIN_WIDTH = [1, 4, 8, 16, 32, 64, 128, 256]
137 |
138 | WEBSITES = [
139 | "www.citibank.de",
140 | "mail.google.com",
141 | "www.youtube.com",
142 | "www.amazon.com",
143 | "www.imdb.com",
144 | "www.flickr.com"
145 | ]
146 |
147 | jvm.start(max_heap_size="4096m")
148 | for website in WEBSITES:
149 | for truncate in TRUNCATION_MODES:
150 | # Generates the train and test dataset
151 | #Proportion should be set inside this function
152 | QuantizeAndCreateUnbalancedTrainTestDataset(truncate, website, BIN_WIDTH)
153 |
154 | BuildQuantizedArffDatasets(website, "normal")
155 |
156 | """#Delete raw datasets
157 | for file in os.listdir("Data/" + website):
158 | if(file.endswith(".csv")):
159 | os.remove("Data/" + website + "/" + file)"""
160 |
161 | #Classify
162 | ClassifyParam(website, "normal", BIN_WIDTH)
163 |
164 | """#Delete arff datasets
165 | for file in os.listdir("Data/"):
166 | if(file.endswith(".arff")):
167 | os.remove("Data/" + file)"""
168 |
169 | #Generate figures
170 | GenerateFigures()
171 | jvm.stop()
172 |
--------------------------------------------------------------------------------
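
For reference, the experiment loop above boils down to a short python-weka-wrapper train/evaluate sequence. The sketch below keeps only calls that already appear in this script; the ARFF paths are placeholders and the FilteredClassifier filter options are omitted for brevity, so it is an illustration rather than a drop-in replacement.

import weka.core.jvm as jvm
import weka.core.converters as converters
from weka.classifiers import Classifier, Evaluation

jvm.start(max_heap_size="4096m")

# Placeholder ARFF paths; the real files are produced by BuildQuantizedArffDatasets().
train_data = converters.load_any_file("Data/example/arff/TrainSet_full_64.arff")
test_data = converters.load_any_file("Data/example/arff/TestSet_full_64.arff")
train_data.class_is_last()
test_data.class_is_last()

classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesMultinomial")
classifier.build_classifier(train_data)

evaluation = Evaluation(test_data)
evaluation.test_model(classifier, test_data)
print(evaluation.summary())
print("FPR (class 1): %s" % evaluation.false_positive_rate(1))

jvm.stop()
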
/Security Tasks Evaluation/BotnetAnalysis/peershark/Flow.py:
--------------------------------------------------------------------------------
1 | from Packet import *
2 |
3 | #input: list of packets, timegap - real number
4 | #return val: list of flows
5 | #
6 | #merges collection of packets(objects) into collection of flows(many-to-one)
7 | #Working: group packets with same ip-pair(direction irrelevant) and merge all packets for
8 | #which |packet1.time - packet2.time| < threshold(timegap)
9 | def packetsToFlows(packets,timegap):
10 | #sanity check for 0 packets
11 | if len(packets) == 0:
12 | return None
13 |
14 | outputflows = []
15 |
16 | #perform a radix-sort to group together packets
17 | #with same ip-pairs(packet.key represents an ip-pair)
18 | #and sort these packets according to timestamp
19 | packets.sort(key = lambda packet:packet.timestamp)
20 | packets.sort(key = lambda packet:packet.key)
21 |
22 | nextflow = Flow(None)
23 | for nextpacket in packets:
24 | #if ip-pairs dont match or time-difference of prev and current packet greater
25 | #than timegap, create a new flow
26 | if (nextflow.key != nextpacket.key) or ((nextpacket.timestamp - nextflow.getEnd()) > timegap):
27 | nextflow = Flow(nextpacket)
28 | outputflows.append(nextflow)
29 | #if not then add packet to previous flow
30 | else:
31 | nextflow.addPacket(nextpacket)
32 |
33 | return outputflows
34 |
35 | #same as function packetsToFlow but merges flows instead of packets
36 | def combineFlows(flows, flowgap):
37 | if len(flows) == 0:
38 | return None
39 |
40 | outputflows = []
41 |
42 | flows.sort(key = lambda flow:flow.getStart())
43 | flows.sort(key = lambda flow:flow.key)
44 |
45 | nextoutflow = Flow(None)
46 | for nextflow in flows:
47 | if (nextoutflow.key != nextflow.key) or ((nextflow.getStart() - nextoutflow.getEnd()) > flowgap):
48 | nextoutflow = nextflow
49 | outputflows.append(nextoutflow)
50 | else:
51 | nextoutflow.addFlow(nextflow)
52 |
53 | return outputflows
54 |
55 | def getCustomWeightedAvg(n1, w1, n2, w2):
56 | num = 0
57 | den = 0
58 | if w1 > 0:
59 | num += w1 * n1
60 | den += w1
61 | if w2 > 0:
62 | num += w2 * n2
63 | den += w2
64 | if den <= 0:
65 | den = 1
66 | return num / den
67 |
68 |
69 | #write list of flows into file in desired format
70 | def writeFlowsToFile(flowlist, filename):
71 | outfile = open(filename, 'w')
72 |
73 | to_write = []
74 | for flow in flowlist:
75 | to_write.append(
76 | socket.inet_ntoa(flow.ip1) + ',' +
77 | socket.inet_ntoa(flow.ip2) + ',' +
78 | str(flow.n_packet1) + ',' +
79 | str(flow.n_byte1) + ',' +
80 | '%.6f'%flow.t_start1 + ',' +
81 | '%.6f'%flow.t_end1 + ',' +
82 | '%.6f'%flow.getInterArrivaltime1() + ',' +
83 | str(flow.n_packet2) + ',' +
84 | str(flow.n_byte2) + ',' +
85 | '%.6f'%flow.t_start2 + ',' +
86 | '%.6f'%flow.t_end2 + ',' +
87 | '%.6f'%flow.getInterArrivaltime2())
88 |
89 | outfile.write("\n".join(to_write))
90 | outfile.close()
91 |
92 | #class which defines the structure of flows
93 | class Flow:
94 | #constructor of default flow
95 | def __init__(self,firstpacket):
96 | if firstpacket == None:
97 | self.ip1 = None
98 | self.ip2 = None
99 | self.key = None
100 | self.n_packet1 = 0
101 | self.n_byte1 = 0
102 | self.t_start1 = 0
103 | self.t_end1 = 0
104 | self.t_interarrival1 = []
105 | self.n_packet2 = 0
106 | self.n_byte2 = 0
107 | self.t_start2 = 0
108 | self.t_end2 = 0
109 | self.t_interarrival2 = []
110 | else:
111 | if firstpacket.source < firstpacket.dest:
112 | self.ip1 = firstpacket.source
113 | self.ip2 = firstpacket.dest
114 | self.n_packet1 = 1
115 | self.n_byte1 = firstpacket.size
116 | self.t_start1 = firstpacket.timestamp
117 | self.t_end1 = firstpacket.timestamp
118 | self.t_interarrival1 = []
119 | self.n_packet2 = 0
120 | self.n_byte2 = 0
121 | self.t_start2 = 0
122 | self.t_end2 = 0
123 | self.t_interarrival2 = []
124 | else:
125 | self.ip1 = firstpacket.dest
126 | self.ip2 = firstpacket.source
127 | self.n_packet1 = 0
128 | self.n_byte1 = 0
129 | self.t_start1 = 0
130 | self.t_end1 = 0
131 | self.t_interarrival1 = []
132 | self.n_packet2 = 1
133 | self.n_byte2 = firstpacket.size
134 | self.t_start2 = firstpacket.timestamp
135 | self.t_end2 = firstpacket.timestamp
136 | self.t_interarrival2 = []
137 | self.key = firstpacket.key
138 |
139 | #add a flow to the current flow (by changing volume and duration)
140 | def addFlow(self,flow):
141 | self.t_interarrival1 += flow.t_interarrival1
142 | self.t_interarrival2 += flow.t_interarrival2
143 | self.n_packet1 += flow.n_packet1
144 | self.n_packet2 += flow.n_packet2
145 | self.n_byte1 += flow.n_byte1
146 | self.n_byte2 += flow.n_byte2
147 |
148 | temp = min(self.t_start1,flow.t_start1)
149 | if temp == 0:
150 | self.t_start1 = self.t_start1 + flow.t_start1
151 | else:
152 | self.t_start1 = temp
153 |
154 | temp = min(self.t_start2,flow.t_start2)
155 | if temp == 0:
156 | self.t_start2 = self.t_start2 + flow.t_start2
157 | else:
158 | self.t_start2 = temp
159 |
160 | if(self.t_end1 < flow.t_end1):
161 | self.t_end1 = flow.t_end1
162 | if(self.t_end2 < flow.t_end2):
163 | self.t_end2 = flow.t_end2
164 |
165 | #add a packet to the current flow (by changing volume and duration)
166 | def addPacket(self,packet):
167 | if packet.source == self.ip1 and packet.dest == self.ip2:
168 |
169 | #initialize flow if not initialized
170 | if self.n_packet1 == 0:
171 | self.t_start1 = packet.timestamp
172 | self.t_end1 = packet.timestamp
173 | self.n_packet1 += 1
174 | self.n_byte1 += packet.size
175 | return
176 |
177 | if self.t_end1 < packet.timestamp:
178 | self.t_interarrival1.append(packet.timestamp-self.t_end1)
179 | self.t_end1 = packet.timestamp
180 | elif self.t_start1 > packet.timestamp:
181 | self.t_interarrival1.append(self.t_start1-packet.timestamp)
182 | self.t_start1 = packet.timestamp
183 | self.n_packet1 += 1
184 | self.n_byte1 += packet.size
185 |
186 | elif packet.source == self.ip2 and packet.dest == self.ip1:
187 |
188 | #initialize flow if not initialized
189 | if self.n_packet2 == 0:
190 | self.t_start2 = packet.timestamp
191 | self.t_end2 = packet.timestamp
192 | self.n_packet2 += 1
193 | self.n_byte2 += packet.size
194 | return
195 |
196 | if self.t_end2 < packet.timestamp:
197 | self.t_interarrival2.append(packet.timestamp-self.t_end2)
198 | self.t_end2 = packet.timestamp
199 | elif self.t_start2 > packet.timestamp:
200 | self.t_interarrival2.append(self.t_start2-packet.timestamp)
201 | self.t_start2 = packet.timestamp
202 | self.n_packet2 += 1
203 | self.n_byte2 += packet.size
204 |
205 | else:
206 | raise Exception('packet does not belong to flow')
207 |
208 | def getDurationInSeconds(self):
209 | return self.getEnd() - self.getStart()
210 |
211 | def getInterArrivaltime(self):
212 | combined = sorted(self.t_interarrival1 + self.t_interarrival2)
213 | if len(combined) > 0:
214 | return combined[len(combined)/2]
215 | return 0
216 |
217 | def getInterArrivaltime1(self):
218 | self.t_interarrival1.sort()
219 | if len(self.t_interarrival1) > 0:
220 | return self.t_interarrival1[len(self.t_interarrival1)/2]
221 | return 0
222 |
223 | def getInterArrivaltime2(self):
224 | self.t_interarrival2.sort()
225 | if len(self.t_interarrival2) > 0:
226 | return self.t_interarrival2[len(self.t_interarrival2)/2]
227 | return 0
228 |
229 | def getNoOfBytes(self):
230 | return self.n_byte1 + self.n_byte2
231 |
232 | def getNoOfPackets(self):
233 | return self.n_packet1 + self.n_packet2
234 |
235 | def getStart(self):
236 | temp = min(self.t_start1, self.t_start2)
237 | if temp == 0:
238 | return self.t_start1 + self.t_start2
239 | else:
240 | return temp
241 |
242 | def getEnd(self):
243 | return max(self.t_end1, self.t_end2)
244 |
--------------------------------------------------------------------------------
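
To see how packetsToFlows groups packets into flows, the toy example below hand-builds a few packet-like objects. The real Packet class lives in peershark/Packet.py; the stand-in only models the attributes Flow.py reads (source, dest, key, timestamp, size), and the ip-pair key is assumed to be the direction-independent concatenation of the two packed addresses, which is an assumption made for illustration rather than a description of Packet.py.

# Assumes packetsToFlows from Flow.py is in scope (e.g. run inside peershark/).
import socket
from collections import namedtuple

FakePacket = namedtuple("FakePacket", ["source", "dest", "key", "timestamp", "size"])

def make_packet(src, dst, ts, size):
    a, b = socket.inet_aton(src), socket.inet_aton(dst)
    ip1, ip2 = min(a, b), max(a, b)          # direction-independent ip-pair key (assumed)
    return FakePacket(a, b, ip1 + ip2, ts, size)

packets = [
    make_packet("10.0.0.1", "10.0.0.2", 0.0, 100),
    make_packet("10.0.0.2", "10.0.0.1", 0.5, 200),   # same pair, within timegap -> same flow
    make_packet("10.0.0.1", "10.0.0.2", 50.0, 60),   # same pair, gap > timegap -> new flow
]

flows = packetsToFlows(packets, timegap=10)
print(len(flows))                  # expected: 2 flows for this toy input
print(flows[0].getNoOfPackets())   # expected: 2 packets merged into the first flow
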
/Security Tasks Evaluation/BotnetAnalysis/runExperiment.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import math
4 | import subprocess as sub
5 | import shutil
6 | import csv
7 | import numpy as np
8 | import multiprocessing as MP
9 | import time
10 |
11 | import gc
12 |
13 | import warnings
14 | warnings.filterwarnings("ignore", category=FutureWarning)
15 |
16 | from sklearn.metrics import accuracy_score, confusion_matrix
17 | from sklearn.model_selection import train_test_split, StratifiedKFold
18 | from sklearn.ensemble import RandomForestClassifier
19 | from joblib import dump, load
20 |
21 | from peershark.GenerateFlows import runGenerateFlows
22 | from peershark.generateSuperFlows import runGenerateSuperFlows
23 | from peershark.createTrainingData import runTrainingDataGenerator
24 | from quantize import QuantizeDataset
25 |
26 | data_location = "Data/"
27 |
28 |
29 | def Classify(binWidth, ipt_bin_width):
30 | dataset_path = 'TrainingData/Datasets/Dataset_%s_%s.csv'%(binWidth, ipt_bin_width)
31 | with open(dataset_path, 'rb') as dataset_file:
32 | print "Loading Dataset: %s ..."%(dataset_path)
33 |
34 | attributes = []
35 | labels = []
36 | csv_reader = csv.reader(dataset_file)
37 | for n, row in enumerate(csv_reader):
38 | if(n == 0):
39 | continue
40 | else:
41 | attributes.append(row[:-1])
42 | labels.append(row[-1])
43 |
44 | #Split data into 67% train, 33% test
45 | train_x, test_x, train_y, test_y = train_test_split(attributes, labels, test_size=0.33, random_state=42, stratify=labels)
46 |
47 | #Define classifier
48 | classifier = RandomForestClassifier(random_state=42)
49 |
50 | #Train classifier
51 | #start_train = time.time()
52 | model = classifier.fit(np.asarray(train_x), np.asarray(train_y))
53 | #end_train = time.time()
54 | #print "Model trained in %ss"%(end_train-start_train)
55 |
56 | #for sample in test_x:
57 | # start_sample = time.time()
58 | # model.predict(np.asarray(sample).reshape((1,-1)))
59 | # end_sample = time.time()
60 | # print "Sample predicted in %ss"%(end_sample-start_sample)
61 |
62 | #Perform predictions
63 | print "Predicting %s samples"%(len(test_x))
64 | #start_batch = time.time()
65 | predictions = model.predict(np.asarray(test_x))
66 | #end_batch = time.time()
67 | #print "Batch predicted in %ss"%(end_batch-start_batch)
68 |
69 | #Generate metrics (benign)
70 | TN, FP, FN, TP = confusion_matrix(np.asarray(test_y), predictions, labels=["malicious","benign"]).ravel()
71 | FPR_BENIGN = float(FP)/(float(FP)+float(TN))
72 | RECALL_BENIGN = float(TP)/(float(TP) + float(FN))
73 | PRECISION_BENIGN = float(TP)/(float(TP) + float(FP))
74 |
75 | print "Model Precision (benign): " + "{0:.3f}".format(PRECISION_BENIGN)
76 | print "Model Recall (benign): " + "{0:.3f}".format(RECALL_BENIGN)
77 | print "Model FPR (benign): " + "{0:.3f}".format(FPR_BENIGN)
78 |
79 |
80 | #Generate metrics (malicious)
81 | TN, FP, FN, TP = confusion_matrix(np.asarray(test_y), predictions, labels=["benign","malicious"]).ravel()
82 | FPR_MALICIOUS = float(FP)/(float(FP)+float(TN))
83 | RECALL_MALICIOUS = float(TP)/(float(TP) + float(FN))
84 | PRECISION_MALICIOUS = float(TP)/(float(TP) + float(FP))
85 |
86 | print "Model Precision (malicious): " + "{0:.3f}".format(PRECISION_MALICIOUS)
87 | print "Model Recall (malicious): " + "{0:.3f}".format(RECALL_MALICIOUS)
88 | print "Model FPR (malicious): " + "{0:.3f}".format(FPR_MALICIOUS)
89 |
90 | results_file = open("classificationResults/results.csv","a")
91 | results_file.write("%s, %s, %s, %s, %s, %s, %s, %s\n"%(binWidth, ipt_bin_width, "{0:.3f}".format(PRECISION_BENIGN), "{0:.3f}".format(RECALL_BENIGN), "{0:.3f}".format(FPR_BENIGN), "{0:.3f}".format(PRECISION_MALICIOUS), "{0:.3f}".format(RECALL_MALICIOUS), "{0:.3f}".format(FPR_MALICIOUS)))
92 | results_file.flush()
93 | results_file.close()
94 | print ""
95 |
96 |
97 | def GenerateDataset(datasets, binWidth, ipt_bin_width):
98 | if not os.path.exists('TrainingData/Datasets'):
99 | os.makedirs('TrainingData/Datasets')
100 |
101 | datasets_to_merge = []
102 | for dataset in datasets:
103 | dataset = os.path.basename(dataset)
104 | datasets_to_merge.append('TrainingData/%s/trainingdata_%s_%s.csv'%(dataset, binWidth, ipt_bin_width))
105 |
106 | #Merge datasets in a single file
107 | with open('TrainingData/Datasets/Dataset_%s_%s.csv'%(binWidth, ipt_bin_width), "w") as out_dataset:
108 | out_dataset.write("NumberOfPackets,TotalBytesTransmitted,MedianIPT,ConversationDuration,class\n")
109 | for fname in datasets_to_merge:
110 | with open(fname, 'rb') as infile:
111 | csv_reader = csv.reader(infile)
112 | for row in csv_reader:
113 | new_row = row
114 | if(row[4] == "P2PTraffic"):
115 | new_row[4] = "benign"
116 | else:
117 | new_row[4] = "malicious"
118 | out_dataset.write(",".join(new_row) + "\n")
119 |
120 |
121 | def RunPeerShark(quantized_pcap_data_dir, flow_data_dir, super_flow_data_dir, training_data_dir, bin_width, ipt_bin_width):
122 | #create a semaphore so as not to exceed threadlimit
123 | n_processes = 4
124 |
125 | #Set TIMEGAP
126 | timegap = 2000
127 |
128 | print "Generating Flows with TIMEGAP = %s"%(timegap)
129 | runGenerateFlows(quantized_pcap_data_dir, flow_data_dir, n_processes, timegap)
130 |
131 | #Set FLOWGAP in seconds
132 | flowgap = 3600
133 |
134 | print "Generating SuperFlows with FLOWGAP = %s"%(flowgap)
135 | runGenerateSuperFlows(flow_data_dir, super_flow_data_dir, flowgap)
136 |
137 | print "Generating Training Data..."
138 | runTrainingDataGenerator(super_flow_data_dir, training_data_dir, bin_width, ipt_bin_width)
139 |
140 |
141 | def Experiment(datasets, bin_width, ipt_bin_width):
142 |
143 | if not os.path.exists('FeatureSets'):
144 | os.makedirs('FeatureSets')
145 |
146 | #Quantize datasets according to bin width
147 | #Generate training sets for quantization
148 | for dataset in datasets:
149 | quantized_pcap_data_dir = 'FeatureSets/' + os.path.basename(dataset) + "/"
150 | flow_data_dir = 'FlowData/' + os.path.basename(dataset) + "/"
151 | superflow_data_dir = 'SuperFlowData/' + os.path.basename(dataset) + "/"
152 | training_data_dir = 'TrainingData/' + os.path.basename(dataset) + "/"
153 |
154 | if not os.path.exists('FeatureSets/' + os.path.basename(dataset)):
155 | os.makedirs('FeatureSets/' + os.path.basename(dataset))
156 |
157 | if not os.path.exists('FlowData/' + os.path.basename(dataset)):
158 | os.makedirs('FlowData/' + os.path.basename(dataset))
159 |
160 | if not os.path.exists('SuperFlowData/' + os.path.basename(dataset)):
161 | os.makedirs('SuperFlowData/' + os.path.basename(dataset))
162 |
163 | if not os.path.exists('TrainingData/' + os.path.basename(dataset)):
164 | os.makedirs('TrainingData/' + os.path.basename(dataset))
165 |
166 |
167 | print "Quantizing %s with BinWidth = %s and IPT_BinWidth = %s"%(dataset, bin_width, ipt_bin_width)
168 | n_processes = 4
169 | QuantizeDataset(dataset, bin_width, ipt_bin_width, n_processes)
170 | RunPeerShark(quantized_pcap_data_dir, flow_data_dir, superflow_data_dir, training_data_dir, bin_width, ipt_bin_width)
171 |
172 | print "Building Dataset..."
173 | GenerateDataset(datasets, bin_width, ipt_bin_width)
174 |
175 | print "Performing Classification..."
176 | Classify(bin_width, ipt_bin_width)
177 |
178 | start_collect = time.time()
179 | collected = gc.collect()
180 | end_collect = time.time()
181 | print "Time wasted on GC - Classification: %ss, collected %s objects"%(end_collect-start_collect, collected)
182 |
183 | shutil.rmtree('FeatureSets')
184 | shutil.rmtree('FlowData')
185 | shutil.rmtree('SuperFlowData')
186 | shutil.rmtree('TrainingData')
187 |
188 |
189 |
190 | if __name__ == "__main__":
191 |
192 | DATASETS = [
193 | data_location + "Waledac",
194 | data_location + "Storm",
195 | data_location + "P2PTraffic"
196 | ]
197 |
198 | ###
199 | #The following parameters are now fed by the fullRun.sh shell script
200 | # Please run fullRun.sh instead of this file directly
201 | ###
202 |
203 | #Quantization (packet size)
204 | #BIN_WIDTH = [1, 16, 32, 64, 128, 256]
205 |
206 | #Quantization (IPT in seconds)
207 | #TIMEGAP IS 2000s, FLOWGAP IS 3600s
208 | #IPT_BIN_WIDTH = [0, 1, 10, 60, 300, 900]
209 |
210 | if not os.path.exists("classificationResults"):
211 | os.makedirs("classificationResults")
212 | results_file = open("classificationResults/results.csv","a+")
213 | results_file.write("BinWidth, IPT_BinWidth, Precision_Benign, Recall_Benign, FalsePositiveRate_Benign, Precision_Malicious, Recall_Malicious, FalsePositiveRate_Malicious\n")
214 | results_file.flush()
215 | results_file.close()
216 |
217 |
218 | binWidth = int(sys.argv[1])
219 | ipt_bin_width = int(sys.argv[2])
220 |
221 | print "Starting experiment with Bin width %s and IPT Bin Width %s"%(binWidth, ipt_bin_width)
222 | start_time = time.time()
223 | Experiment(DATASETS, binWidth, ipt_bin_width)
224 | end_time = time.time()
225 | time_elapsed_seconds = end_time - start_time
226 | print "Experiment finished in %sh\n"%("{0:.2f}".format(time_elapsed_seconds/60.0/60.0))
227 |
228 |
--------------------------------------------------------------------------------
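
The metric block in Classify() depends on the label ordering passed to confusion_matrix: with labels=["benign", "malicious"], ravel() yields (TN, FP, FN, TP) with "malicious" as the positive class. Below is a small self-contained check of that convention on made-up predictions (the numbers are illustrative only).

from sklearn.metrics import confusion_matrix

y_true = ["malicious", "malicious", "benign", "benign", "benign"]
y_pred = ["malicious", "benign",    "benign", "malicious", "benign"]

# With labels=["benign", "malicious"], ravel() returns (TN, FP, FN, TP)
# for the "malicious" class treated as positive.
TN, FP, FN, TP = confusion_matrix(y_true, y_pred, labels=["benign", "malicious"]).ravel()
print("TN=%d FP=%d FN=%d TP=%d" % (TN, FP, FN, TP))   # TN=2 FP=1 FN=1 TP=1
print("Precision: %.3f" % (float(TP) / (TP + FP)))    # 0.500
print("Recall:    %.3f" % (float(TP) / (TP + FN)))    # 0.500
print("FPR:       %.3f" % (float(FP) / (FP + TN)))    # 0.333
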
/Security Tasks Evaluation/WFAnalysis/SingleWebsiteAnalysis/ParsingUtilities/CSVParseWebsiteUnbalanced.py:
--------------------------------------------------------------------------------
1 | import os
2 | import csv
3 | import sys
4 | from datetime import datetime, timedelta
5 | import collections
6 | import math
7 | from collections import defaultdict, OrderedDict
8 | import numpy as np
9 |
10 | def RoundToNearest(n, m):
11 | if (m == 1):
12 | return n
13 | if (n > 0):
14 | r = n % m
15 | return n + m - r if r + r >= m else n - r
16 | else:
17 | if (n < 0):
18 | return RoundToNearest(abs(n), m) * -1
19 | return 0
20 |
21 | def extractDistributionWithoutTruncation(argv):
22 | BASE_DIR = os.path.dirname(argv[0])
23 | file = open(argv[0],'r')
24 |
25 | binWidth = int(argv[1])
26 | websiteToClassify = argv[2]
27 |
28 | if not os.path.exists(BASE_DIR + "/" + websiteToClassify):
29 | os.makedirs(BASE_DIR + "/" + websiteToClassify)
30 |
31 | trainSet = open(BASE_DIR + "/" + websiteToClassify + "/TrainSet_full_" + str(binWidth) + ".csv", 'w')
32 | testSet = open(BASE_DIR + "/" + websiteToClassify + "/TestSet_full_" + str(binWidth) + ".csv", 'w')
33 |
34 |
35 | #Set for all possible quantized buckets
36 | binsUsedByWebsite = set()
37 | minBucket = RoundToNearest(-1500, binWidth)
38 | maxBucket = RoundToNearest(1500, binWidth) + 1
39 | for size in range(minBucket, maxBucket, binWidth):
40 | binsUsedByWebsite.add(RoundToNearest(size, binWidth))
41 |
42 |
43 | websiteTrainInstances = int(argv[3])
44 | websiteTestInstances = int(argv[4])
45 |
46 | ################################################
47 | #Build csv with quantized bins
48 | ################################################
49 |
50 | # Write CSV datasets header (with bins used by the target website)
51 | for size in range(minBucket, maxBucket, binWidth):
52 | if (size in binsUsedByWebsite):
53 | trainSet.write("packetLengthBin_" + str(size) + ", ")
54 | testSet.write("packetLengthBin_" + str(size) + ", ")
55 | trainSet.write("class\n")
56 | testSet.write("class\n")
57 |
58 |
59 | file = open(argv[0],'r')
60 | l = file.readline() #Take out dataset header
61 | l = file.readline() #Take out dataset header
62 | trainCounter = 0
63 | testCounter = 0
64 | currWebsite = ""
65 | trainData = []
66 | testData =[]
67 |
68 | for lineNumber, l in enumerate(file.readlines()):
69 | lineSplit = l.rstrip('\n').split(" ")
70 | if (lineNumber % 2 == 1): #Gather website data
71 | website = lineSplit[0][:-1]
72 | if(website != currWebsite):
73 | currWebsite = website
74 | trainCounter = 0
75 | testCounter = 0
76 |
77 | #Build container for sample distribution
78 | website_bin_distribution = OrderedDict()
79 | for i in sorted(binsUsedByWebsite):
80 | website_bin_distribution[i] = 0
81 |
82 | #Add useful bins to the sample distribution
83 | for packet_size in lineSplit[1:-1]:
84 | packet_size_binned = RoundToNearest(int(packet_size), binWidth)
85 | if(packet_size_binned in binsUsedByWebsite):
86 | website_bin_distribution[packet_size_binned] += 1
87 |
88 |
89 | if(trainCounter < websiteTrainInstances):
90 | bin_list = []
91 | for i in website_bin_distribution:
92 | bin_list.append(str(website_bin_distribution[i]))
93 | trainData.append(",".join(bin_list) + ", " + currWebsite + "\n")
94 | trainCounter += 1
95 | elif(testCounter < websiteTestInstances):
96 | bin_list = []
97 | for i in website_bin_distribution:
98 | bin_list.append(str(website_bin_distribution[i]))
99 | testData.append(",".join(bin_list) + ", " + currWebsite + "\n")
100 | #Account for processed sample
101 | testCounter += 1
102 |
103 | trainSet.write("".join(trainData))
104 | testSet.write("".join(testData))
105 | trainSet.close()
106 | testSet.close()
107 |
108 |
109 | def extractDistributionWithTruncation(argv):
110 | BASE_DIR = os.path.dirname(argv[0])
111 | file = open(argv[0],'r')
112 |
113 | binWidth = int(argv[1])
114 | websiteToClassify = argv[2]
115 |
116 | if not os.path.exists(BASE_DIR + "/" + websiteToClassify):
117 | os.makedirs(BASE_DIR + "/" + websiteToClassify)
118 |
119 | trainSet = open(BASE_DIR + "/" + websiteToClassify + "/TrainSet_truncated_" + str(binWidth) + ".csv", 'w')
120 | testSet = open(BASE_DIR + "/" + websiteToClassify + "/TestSet_truncated_" + str(binWidth) + ".csv", 'w')
121 |
122 |
123 | websiteTrainInstances = int(argv[3])
124 | websiteTestInstances = int(argv[4])
125 |
126 | trainInstancesCounter = 0
127 | binsUsedByWebsite = set()
128 | minBucket = RoundToNearest(-1500, binWidth)
129 | maxBucket = RoundToNearest(1500, binWidth) + 1
130 |
131 | ################################################
132 | #Gather list of quantized buckets used by the target website in the training set (1st pass)
133 | ################################################
134 |
135 | l = file.readline() #Take out dataset header
136 | l = file.readline() #Take out dataset header
137 | for lineNumber, l in enumerate(file.readlines()):
138 | lineSplit = l.rstrip('\n').split(" ")
139 | if (lineNumber % 2 == 1): #Gather website data
140 | website = lineSplit[0][:-1]
141 | if (website == websiteToClassify):
142 | if(trainInstancesCounter < websiteTrainInstances):
143 | for packet_size in lineSplit[1:-1]:
144 | binsUsedByWebsite.add(RoundToNearest(int(packet_size), binWidth))
145 | trainInstancesCounter += 1
146 | else:
147 | break #We've analysed all training websiteToClassify samples
148 |
149 |
150 | #Get to know the amount of buckets used for measuring a given website
151 | print "Total number of buckets: " + str(int(math.floor(3000.0/binWidth)))
152 | print "Number of buckets after truncation: " + str(len(binsUsedByWebsite))
153 | #Write these stats to a file
154 | file = open("truncationInfo/" + websiteToClassify + ".csv", "a")
155 | file.write("%s, %s\n"%(binWidth, len(binsUsedByWebsite)))
156 | file.close()
157 |
158 | ################################################
159 | #Build csv with truncated bins (2nd pass)
160 | ################################################
161 |
162 | # Write CSV datasets header (with bins used by the target website)
163 | for size in range(minBucket, maxBucket, binWidth):
164 | if (size in binsUsedByWebsite):
165 | trainSet.write("packetLengthBin_" + str(size) + ", ")
166 | testSet.write("packetLengthBin_" + str(size) + ", ")
167 | trainSet.write("class\n")
168 | testSet.write("class\n")
169 |
170 |
171 | file = open(argv[0],'r')
172 | l = file.readline() #Take out dataset header
173 | l = file.readline() #Take out dataset header
174 | trainCounter = 0
175 | testCounter = 0
176 | currWebsite = ""
177 | trainData = []
178 | testData =[]
179 |
180 | for lineNumber, l in enumerate(file.readlines()):
181 | lineSplit = l.rstrip('\n').split(" ")
182 | if (lineNumber % 2 == 1): #Gather website data
183 | website = lineSplit[0][:-1]
184 | if(website != currWebsite):
185 | currWebsite = website
186 | trainCounter = 0
187 | testCounter = 0
188 |
189 | #Build container for sample distribution
190 | website_bin_distribution = OrderedDict()
191 | for i in sorted(binsUsedByWebsite):
192 | website_bin_distribution[i] = 0
193 |
194 | #Add useful bins to the sample distribution
195 | for packet_size in lineSplit[1:-1]:
196 | packet_size_binned = RoundToNearest(int(packet_size), binWidth)
197 | if(packet_size_binned in binsUsedByWebsite):
198 | website_bin_distribution[packet_size_binned] += 1
199 |
200 |
201 | if(trainCounter < websiteTrainInstances):
202 | bin_list = []
203 | for i in website_bin_distribution:
204 | bin_list.append(str(website_bin_distribution[i]))
205 | trainData.append(",".join(bin_list) + ", " + currWebsite + "\n")
206 | trainCounter += 1
207 | elif(testCounter < websiteTestInstances):
208 | bin_list = []
209 | for i in website_bin_distribution:
210 | bin_list.append(str(website_bin_distribution[i]))
211 | testData.append(",".join(bin_list) + ", " + currWebsite + "\n")
212 | #Account for processed sample
213 | testCounter += 1
214 |
215 | trainSet.write("".join(trainData))
216 | testSet.write("".join(testData))
217 | trainSet.close()
218 | testSet.close()
219 |
220 | if __name__ == "__main__":
221 | if (int(sys.argv[-1]) == 1):
222 | extractDistributionWithoutTruncation(sys.argv[1:-1])
223 | else:
224 | extractDistributionWithTruncation(sys.argv[1:-1])
--------------------------------------------------------------------------------
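
RoundToNearest is the quantization primitive shared by the parsing utilities: it rounds a packet length to the nearest multiple of the bin width, mirrors negative lengths, and leaves values unchanged when the bin width is 1. A few illustrative calls, assuming the function above is in scope:

print(RoundToNearest(1460, 64))   # 1460 % 64 = 52, 52*2 >= 64 -> rounds up to 1472
print(RoundToNearest(30, 64))     # 30 % 64 = 30, 30*2 < 64  -> rounds down to 0
print(RoundToNearest(-40, 64))    # negative sizes mirror the positive case -> -64
print(RoundToNearest(90, 1))      # bin width 1 leaves the value unchanged -> 90
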
/Security Tasks Evaluation/MPTAnalysis/FacetAnalysis/online_sketching.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import numpy as np
3 |
4 |
5 | def CreateBinaryVectorRepresentation(BIN_WIDTH, TOPK, SKETCH_SIZE):
6 |
7 | for sketch_size in SKETCH_SIZE:
8 | for binWidth in BIN_WIDTH:
9 | for topk in TOPK:
10 |
11 | """
12 | Generate random base vectors
13 | """
14 |
15 | if(topk != 1500):
16 | real_bucket_number = topk
17 | else:
18 | real_bucket_number = 1500/binWidth
19 |
20 | random_base_vectors = []
21 | for i in range(0, sketch_size):
22 | random_base_vector = (2*np.random.randint(0,2,size=(real_bucket_number))-1)
23 | random_base_vectors.append(random_base_vector)
24 |
25 | n_bits = range(0, sketch_size)
26 |
27 | """
28 | Process Phase 1 Data
29 | """
30 |
31 | feature_set = 'PL_60_' + str(binWidth) + '_' + str(topk)
32 | data_folder = 'FeatureSets/' + feature_set + '/'
33 |
34 | #Regular Traffic
35 | print "Online_Sketch: Phase 1, Regular - " + feature_set + "/Online_regularTraffic_phase1_" + str(sketch_size) + "_dataset.csv"
36 | output = open(data_folder + "Online_regularTraffic_phase1_" + str(sketch_size) + "_dataset.csv", "w")
37 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r')
38 | reader = csv.reader(f, delimiter=',')
39 |
40 | #Process data row
41 | for n, row in enumerate(reader):
42 | if(n == 0):
43 | output.write(",".join(str(x) for x in n_bits) + "," + row[-1] + "\n")
44 | else:
45 | #Gather the packet vector array (v_f)
46 | packet_count_vector = []
47 | for i in row[:-1]:
48 | packet_count_vector.append(int(i))
49 |
50 | #Compute the integer array (c_f)
51 | integer_array = []
52 | for i in range(0, sketch_size):
53 | c_f_i = 0
54 | for j in range(0, real_bucket_number):
55 | #print "Random_base_vector: " + str(random_base_vectors[i])
56 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j]
57 | integer_array.append(c_f_i)
58 |
59 | #Compute the binary array (s_f)
60 | binary_array = []
61 | for i in integer_array:
62 | if(i <= 0):
63 | binary_array.append(0)
64 | else:
65 | binary_array.append(1)
66 |
67 | #print "Binary array: " + str(binary_array)
68 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n")
69 | output.close()
70 |
71 |
72 | #Facet Traffic
73 | print "Online_Sketch: Phase 1, Facet - " + feature_set + "/Online_facetTraffic_phase1_" + str(sketch_size) + "_dataset.csv"
74 | output = open(data_folder + "Online_facetTraffic_phase1_" + str(sketch_size) + "_dataset.csv", "w")
75 | f = open(data_folder + "FacetTraffic_50_phase1_dataset.csv", 'r')
76 | reader = csv.reader(f, delimiter=',')
77 |
78 | #Process data row
79 | for n, row in enumerate(reader):
80 | if(n == 0):
81 | output.write(",".join(str(x) for x in n_bits) + "," + row[-1] + "\n")
82 | else:
83 | #Gather the packet vector array (v_f)
84 | packet_count_vector = []
85 | for i in row[:-1]:
86 | packet_count_vector.append(int(i))
87 |
88 | #Compute the integer array (c_f)
89 | integer_array = []
90 | for i in range(0, sketch_size):
91 | c_f_i = 0
92 | for j in range(0, real_bucket_number):
93 | #print "Random_base_vector: " + str(random_base_vectors[i])
94 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j]
95 | integer_array.append(c_f_i)
96 |
97 | #Compute the binary array (s_f)
98 | binary_array = []
99 | for i in integer_array:
100 | if(i <= 0):
101 | binary_array.append(0)
102 | else:
103 | binary_array.append(1)
104 |
105 | #print "Binary array: " + str(binary_array)
106 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n")
107 | output.close()
108 |
109 | ########################################################################################
110 | ########################################################################################
111 | ########################################################################################
112 |
113 |
114 | """
115 | Process Phase 2 Data
116 | """
117 |
118 | feature_set = 'PL_60_' + str(binWidth) + '_' + str(topk)
119 | data_folder = 'FeatureSets/' + feature_set + '/'
120 |
121 | #Regular Traffic
122 | print "Online_Sketch: Phase 2, Regular - " + feature_set + "/Online_regularTraffic_phase2_" + str(sketch_size) + "_dataset.csv"
123 | output = open(data_folder + "Online_regularTraffic_phase2_" + str(sketch_size) + "_dataset.csv", "w")
124 | f = open(data_folder + "RegularTraffic_phase2_dataset.csv", 'r')
125 | reader = csv.reader(f, delimiter=',')
126 |
127 | #Process data row
128 | for n, row in enumerate(reader):
129 | if(n == 0):
130 | output.write(",".join(str(x) for x in n_bits) + "," + row[-1] + "\n")
131 | else:
132 | #Gather the packet vector array (v_f)
133 | packet_count_vector = []
134 | for i in row[:-1]:
135 | packet_count_vector.append(int(i))
136 |
137 | #Compute the integer array (c_f)
138 | integer_array = []
139 | for i in range(0, sketch_size):
140 | c_f_i = 0
141 | for j in range(0, real_bucket_number):
142 | #print "Random_base_vector: " + str(random_base_vectors[i])
143 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j]
144 | integer_array.append(c_f_i)
145 |
146 | #Compute the binary array (s_f)
147 | binary_array = []
148 | for i in integer_array:
149 | if(i <= 0):
150 | binary_array.append(0)
151 | else:
152 | binary_array.append(1)
153 |
154 | #print "Binary array: " + str(binary_array)
155 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n")
156 | output.close()
157 |
158 |
159 | #Facet Traffic
160 | print "Online_Sketch: Phase 2, Facet - " + feature_set + "/Online_facetTraffic_phase2_" + str(sketch_size) + "_dataset.csv"
161 | output = open(data_folder + "Online_facetTraffic_phase2_" + str(sketch_size) + "_dataset.csv", "w")
162 | f = open(data_folder + "FacetTraffic_50_phase2_dataset.csv", 'r')
163 | reader = csv.reader(f, delimiter=',')
164 |
165 | #Process data row
166 | for n, row in enumerate(reader):
167 | if(n == 0):
168 | output.write(",".join(str(x) for x in n_bits) + "," + row[-1] + "\n")
169 | else:
170 | #Gather the packet vector array (v_f)
171 | packet_count_vector = []
172 | for i in row[:-1]:
173 | packet_count_vector.append(int(i))
174 |
175 | #Compute the integer array (c_f)
176 | integer_array = []
177 | for i in range(0, sketch_size):
178 | c_f_i = 0
179 | for j in range(0, real_bucket_number):
180 | #print "Random_base_vector: " + str(random_base_vectors[i])
181 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j]
182 | integer_array.append(c_f_i)
183 |
184 | #Compute the binary array (s_f)
185 | binary_array = []
186 | for i in integer_array:
187 | if(i <= 0):
188 | binary_array.append(0)
189 | else:
190 | binary_array.append(1)
191 |
192 | #print "Binary array: " + str(binary_array)
193 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n")
194 | output.close()
195 |
196 |
--------------------------------------------------------------------------------
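
Each per-row loop above computes one random projection of the packet-count vector followed by a sign test; in other words, the Coskun et al. sketch is s_f = sign(R . v_f) for a random +/-1 matrix R. Below is a vectorised NumPy sketch of the same computation on made-up sizes (illustrative only; the script itself keeps the explicit loops).

import numpy as np

sketch_size, n_buckets = 8, 24            # illustrative sizes
rng = np.random.RandomState(0)
R = 2 * rng.randint(0, 2, size=(sketch_size, n_buckets)) - 1   # random +/-1 base vectors
v_f = rng.randint(0, 50, size=n_buckets)                       # packet-count vector of one flow

c_f = R.dot(v_f)                 # integer array: one projection per base vector
s_f = (c_f > 0).astype(int)      # binary sketch: 1 where the projection is positive
print(s_f)
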
/Security Tasks Evaluation/MPTAnalysis/DeltaShaperAnalysis/online_sketching.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import numpy as np
3 |
4 |
5 | def CreateBinaryVectorRepresentation(BIN_WIDTH, TOPK, SKETCH_SIZE):
6 |
7 | for sketch_size in SKETCH_SIZE:
8 | for binWidth in BIN_WIDTH:
9 | for topk in TOPK:
10 |
11 | """
12 | Generate random base vectors
13 | """
14 |
15 | if(topk != 1500):
16 | real_bucket_number = topk
17 | else:
18 | real_bucket_number = 1500/binWidth
19 |
20 | random_base_vectors = []
21 | for i in range(0, sketch_size):
22 | random_base_vector = (2*np.random.randint(0,2,size=(real_bucket_number))-1)
23 | random_base_vectors.append(random_base_vector)
24 |
25 | n_bits = range(0, sketch_size)
26 |
27 | """
28 | Process Phase 1 Data
29 | """
30 |
31 | feature_set = 'PL_60_' + str(binWidth) + '_' + str(topk)
32 | data_folder = 'FeatureSets/' + feature_set + '/'
33 |
34 | #Regular Traffic
35 | print "Online_Sketch: Phase 1, Regular - " + feature_set + "/Online_regularTraffic_phase1_" + str(sketch_size) + "_dataset.csv"
36 | output = open(data_folder + "Online_regularTraffic_phase1_" + str(sketch_size) + "_dataset.csv", "w")
37 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r')
38 | reader = csv.reader(f, delimiter=',')
39 |
40 | #Process data row
41 | for n, row in enumerate(reader):
42 | if(n == 0):
43 | output.write(",".join(str(x) for x in n_bits) + "," + row[-1] + "\n")
44 | else:
45 | #Gather the packet vector array (v_f)
46 | packet_count_vector = []
47 | for i in row[:-1]:
48 | packet_count_vector.append(int(i))
49 |
50 | #Compute the integer array (c_f)
51 | integer_array = []
52 | for i in range(0, sketch_size):
53 | c_f_i = 0
54 | for j in range(0, real_bucket_number):
55 | #print "Random_base_vector: " + str(random_base_vectors[i])
56 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j]
57 | integer_array.append(c_f_i)
58 |
59 | #Compute the binary array (s_f)
60 | binary_array = []
61 | for i in integer_array:
62 | if(i <= 0):
63 | binary_array.append(0)
64 | else:
65 | binary_array.append(1)
66 |
67 | #print "Binary array: " + str(binary_array)
68 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n")
69 | output.close()
70 |
71 |
72 | #DeltaShaper Traffic
73 | print "Online_Sketch: Phase 1, DeltaShaper - " + feature_set + "/Online_deltashaperTraffic_phase1_" + str(sketch_size) + "_dataset.csv"
74 | output = open(data_folder + "Online_deltashaperTraffic_phase1_" + str(sketch_size) + "_dataset.csv", "w")
75 | f = open(data_folder + "DeltaShaperTraffic_320_phase1_dataset.csv", 'r')
76 | reader = csv.reader(f, delimiter=',')
77 |
78 | #Process data row
79 | for n, row in enumerate(reader):
80 | if(n == 0):
81 | output.write(",".join(str(x) for x in n_bits) + "," + row[-1] + "\n")
82 | else:
83 | #Gather the packet vector array (v_f)
84 | packet_count_vector = []
85 | for i in row[:-1]:
86 | packet_count_vector.append(int(i))
87 |
88 | #Compute the integer array (c_f)
89 | integer_array = []
90 | for i in range(0, sketch_size):
91 | c_f_i = 0
92 | for j in range(0, real_bucket_number):
93 | #print "Random_base_vector: " + str(random_base_vectors[i])
94 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j]
95 | integer_array.append(c_f_i)
96 |
97 | #Compute the binary array (s_f)
98 | binary_array = []
99 | for i in integer_array:
100 | if(i <= 0):
101 | binary_array.append(0)
102 | else:
103 | binary_array.append(1)
104 |
105 | #print "Binary array: " + str(binary_array)
106 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n")
107 | output.close()
108 |
109 | ########################################################################################
110 | ########################################################################################
111 | ########################################################################################
112 |
113 |
114 | """
115 | Process Phase 2 Data
116 | """
117 |
118 | feature_set = 'PL_60_' + str(binWidth) + '_' + str(topk)
119 | data_folder = 'FeatureSets/' + feature_set + '/'
120 |
121 | #Regular Traffic
122 | print "Online_Sketch: Phase 2, Regular - " + feature_set + "/Online_regularTraffic_phase2_" + str(sketch_size) + "_dataset.csv"
123 | output = open(data_folder + "Online_regularTraffic_phase2_" + str(sketch_size) + "_dataset.csv", "w")
124 | f = open(data_folder + "RegularTraffic_phase2_dataset.csv", 'r')
125 | reader = csv.reader(f, delimiter=',')
126 |
127 | #Process data row
128 | for n, row in enumerate(reader):
129 | if(n == 0):
130 | output.write(",".join(str(x) for x in n_bits) + "," + row[-1] + "\n")
131 | else:
132 | #Gather the packet vector array (v_f)
133 | packet_count_vector = []
134 | for i in row[:-1]:
135 | packet_count_vector.append(int(i))
136 |
137 | #Compute the integer array (c_f)
138 | integer_array = []
139 | for i in range(0, sketch_size):
140 | c_f_i = 0
141 | for j in range(0, real_bucket_number):
142 | #print "Random_base_vector: " + str(random_base_vectors[i])
143 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j]
144 | integer_array.append(c_f_i)
145 |
146 | #Compute the binary array (s_f)
147 | binary_array = []
148 | for i in integer_array:
149 | if(i <= 0):
150 | binary_array.append(0)
151 | else:
152 | binary_array.append(1)
153 |
154 | #print "Binary array: " + str(binary_array)
155 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n")
156 | output.close()
157 |
158 |
159 | #DeltaShaper Traffic
160 | print "Online_Sketch: Phase 2, DeltaShaper - " + feature_set + "/Online_deltashaperTraffic_phase2_" + str(sketch_size) + "_dataset.csv"
161 | output = open(data_folder + "Online_deltashaperTraffic_phase2_" + str(sketch_size) + "_dataset.csv", "w")
162 | f = open(data_folder + "DeltaShaperTraffic_320_phase2_dataset.csv", 'r')
163 | reader = csv.reader(f, delimiter=',')
164 |
165 | #Process data row
166 | for n, row in enumerate(reader):
167 | if(n == 0):
168 | output.write(",".join(str(x) for x in n_bits) + "," + row[-1] + "\n")
169 | else:
170 | #Gather the packet vector array (v_f)
171 | packet_count_vector = []
172 | for i in row[:-1]:
173 | packet_count_vector.append(int(i))
174 |
175 | #Compute the integer array (c_f)
176 | integer_array = []
177 | for i in range(0, sketch_size):
178 | c_f_i = 0
179 | for j in range(0, real_bucket_number):
180 | #print "Random_base_vector: " + str(random_base_vectors[i])
181 | c_f_i += random_base_vectors[i][j] * packet_count_vector[j]
182 | integer_array.append(c_f_i)
183 |
184 | #Compute the binary array (s_f)
185 | binary_array = []
186 | for i in integer_array:
187 | if(i <= 0):
188 | binary_array.append(0)
189 | else:
190 | binary_array.append(1)
191 |
192 | #print "Binary array: " + str(binary_array)
193 | output.write(",".join(str(x) for x in binary_array) + "," + row[-1] + "\n")
194 | output.close()
195 |
196 |
--------------------------------------------------------------------------------
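
The Facet and DeltaShaper variants of this file repeat the same read-project-write block four times, differing only in input and output paths. One possible consolidation is sketched below; the helper name, signature and paths are illustrative and not part of the repository, and the output format (one column per sketch bit plus the class column) is kept unchanged.

import csv
import numpy as np

def sketch_csv(in_path, out_path, R):
    # R is the (sketch_size x n_buckets) matrix of random +/-1 base vectors.
    sketch_size = R.shape[0]
    with open(in_path, "r") as f_in, open(out_path, "w") as f_out:
        reader = csv.reader(f_in, delimiter=",")
        for n, row in enumerate(reader):
            if n == 0:
                # Header: one column per sketch bit, plus the class column.
                f_out.write(",".join(str(b) for b in range(sketch_size)) + "," + row[-1] + "\n")
                continue
            v_f = np.array([int(x) for x in row[:-1]])      # packet-count vector
            s_f = (R.dot(v_f) > 0).astype(int)              # binary sketch
            f_out.write(",".join(str(b) for b in s_f) + "," + row[-1] + "\n")

# Usage sketch (paths and sizes are placeholders):
# R = 2 * np.random.randint(0, 2, size=(256, 1500 // 32)) - 1
# sketch_csv("FeatureSets/PL_60_32_1500/RegularTraffic_phase1_dataset.csv",
#            "FeatureSets/PL_60_32_1500/Online_regularTraffic_phase1_256_dataset.csv", R)
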
/Security Tasks Evaluation/WFAnalysis/SingleWebsiteAnalysis/generateFigures.py:
--------------------------------------------------------------------------------
1 | import os
2 | from decimal import Decimal
3 | import numpy as np
4 | import csv
5 |
6 | import matplotlib
7 | if os.environ.get('DISPLAY','') == '':
8 | print('no display found. Using non-interactive Agg backend')
9 | matplotlib.use('Agg')
10 | import matplotlib.pyplot as plt
11 |
12 |
13 | colors = ["0.8", "0.6", "0.2", "0.0"]
14 | colors = ["salmon", "lightsteelblue", "darkseagreen", "thistle", "wheat", "khaki", "skyblue"]
15 |
16 | """
17 | Attach a text label above each bar displaying its height
18 | """
19 | def autolabel(rects, ax):
20 | for rect in rects:
21 | height = rect.get_height()
22 | ax.text(rect.get_x() + rect.get_width()/2., 1.005*height, # original height was 1.005*height
23 | "{0:.2f}".format(float(height)), fontsize=7, ha='center', va='bottom')
24 |
25 |
26 | def PlotSingleWebsiteStats():
27 |
28 | for profile in os.listdir("classificationResults/"):
29 | if(".DS_Store" in profile):
30 | continue
31 |
32 | profile_data = open("classificationResults/" + profile, 'rb')
33 | csv_reader = csv.reader(profile_data, delimiter=',')
34 |
35 | binWidth = []
36 | acc = []
37 | fpr = []
38 | fnr = []
39 |
40 | for n, row in enumerate(csv_reader):
41 | if(n == 0):
42 | continue
43 | binWidth.append(row[0])
44 | acc.append(float(row[1]))
45 | fpr.append(float(row[2]))
46 | fnr.append(float(row[3]))
47 |
48 |
49 | fig = plt.figure()
50 | ax1 = fig.add_subplot(111)
51 |
52 | print "Current feature set: "+ str(binWidth)
53 |
54 |
55 | ind = np.arange(len(binWidth)) # the x locations for the groups
56 | width = 0.20
57 |
58 |         rects0 = ax1.bar(ind - width, acc, width, color=colors[0], label='Acc')
59 |         rects1 = ax1.bar(ind, fpr, width, color=colors[1], label='FPR')
60 |         rects2 = ax1.bar(ind + width, fnr, width, color=colors[2], label='FNR')
61 |
62 |
63 | ax1.yaxis.grid(color='black', linestyle='dotted')
64 | ax1.set_title('Scores for Quantization')
65 | ax1.set_yscale("log")
66 | ax1.set_xticks(ind)
67 | labels = binWidth
68 | ax1.set_xticklabels(labels)
69 | ax1.legend()
70 |
71 |
72 | plt.tight_layout()
73 | #plt.ylim(0, 1)
74 |
75 | fig.savefig('WF_%s.pdf'%(profile[:-4])) # save the figure to file
76 | fig.savefig('WF_%s.png'%(profile[:-4])) # save the figure to file
77 | plt.close(fig)
78 | profile_data.close()
79 |
80 |
81 | def PlotNormalFPRComparison():
82 | websites = set()
83 |
84 | #Compute the set of websites to compare
85 | for profile in os.listdir("classificationResults/"):
86 | if(".DS_Store" in profile):
87 | continue
88 | website = profile.split("_")[2]
89 | website = website[:-4]
90 | websites.add(website)
91 |
92 |
93 | for website in websites:
94 | if not os.path.exists("Figures/%s"%(website)):
95 | os.makedirs("Figures/%s"%(website))
96 |
97 | #Gather results for full distribution
98 | profile_data_full = open("classificationResults/SingleWebsite_full_" + website + ".csv", 'rb')
99 | csv_reader_full = csv.reader(profile_data_full, delimiter=',')
100 |
101 | binWidth_full = []
102 | acc_full = []
103 | fpr_full = []
104 | fnr_full = []
105 |
106 | for n, row in enumerate(csv_reader_full):
107 | if(n == 0):
108 | continue
109 | binWidth_full.append(row[0])
110 | acc_full.append(round(Decimal(float(row[1])), 4))
111 | fpr_full.append(round(Decimal(float(row[2])), 9))
112 | fnr_full.append(round(Decimal(float(row[3])), 4))
113 |
114 |
115 | #Gather results for truncated distribution
116 | profile_data_truncated = open("classificationResults/SingleWebsite_truncated_" + website + ".csv", 'rb')
117 | csv_reader_truncated = csv.reader(profile_data_truncated, delimiter=',')
118 |
119 | binWidth_truncated = []
120 | acc_truncated = []
121 | fpr_truncated = []
122 | fnr_truncated = []
123 |
124 | for n, row in enumerate(csv_reader_truncated):
125 | if(n == 0):
126 | continue
127 | binWidth_truncated.append(row[0])
128 | acc_truncated.append(round(Decimal(float(row[1])), 4))
129 | fpr_truncated.append(round(Decimal(float(row[2])), 9))
130 | fnr_truncated.append(round(Decimal(float(row[3])), 4))
131 |
132 | #Gather number of bins used in the truncation
133 | truncated_info_file = open("truncationInfo/" + website + ".csv", 'r')
134 | truncation_info = csv.reader(truncated_info_file, delimiter=',')
135 | truncated_bins = []
136 |
137 | for n, row in enumerate(truncation_info):
138 | if(n == 0):
139 | continue
140 | truncated_bins.append(row[1])
141 |
142 | #Generate plot
143 | fig = plt.figure()
144 | ax1 = fig.add_subplot(111)
145 |
146 | print "Current feature set: "+ str(binWidth_full)
147 | print "FPR-Full: " + str(fpr_full)
148 | print "FPR-Truncated: " + str(fpr_truncated)
149 |
150 | ind = np.arange(len(binWidth_full)) # the x locations for the groups
151 | width = 0.40
152 |
153 | rects1 = ax1.bar(ind - width, fpr_full, width, color=colors[0], label='FPR-Full')
154 | #autolabel(rects1,ax1)
155 | rects2 = ax1.bar(ind, fpr_truncated, width, color=colors[1], label='FPR-Truncated')
156 | #autolabel(rects2,ax1)
157 |
158 |
159 | ax1.yaxis.grid(color='black', linestyle='dotted')
160 | ax1.set_title('Truncation effect on FPR - %s'%(website), fontsize = 10)
161 |
162 | ax1.set_xticks(ind)
163 | labels = ["K = " + x + "\nBins = " + str(truncated_bins[n]) for n, x in enumerate(binWidth_full)]
164 | ax1.set_xticklabels(labels)
165 | ax1.legend()
166 |
167 | plt.xticks(fontsize=7)
168 | plt.tight_layout()
169 | #plt.ylim(0, 1)
170 | fig.savefig('Figures/%s/WF_FPR_normal_%s.pdf'%(website, website)) # save the figure to file
171 | fig.savefig('Figures/%s/WF_FPR_normal_%s.png'%(website, website)) # save the figure to file
172 | plt.close(fig)
173 | profile_data_full.close()
174 | profile_data_truncated.close()
175 |
176 |
177 | def PlotNormalFNRComparison():
178 | websites = set()
179 |
180 | #Compute the set of websites to compare
181 | for profile in os.listdir("classificationResults/"):
182 | if(".DS_Store" in profile):
183 | continue
184 | website = profile.split("_")[2]
185 | website = website[:-4]
186 | websites.add(website)
187 |
188 |
189 | for website in websites:
190 | if not os.path.exists("Figures/%s"%(website)):
191 | os.makedirs("Figures/%s"%(website))
192 |
193 | #Gather results for full distribution
194 | profile_data_full = open("classificationResults/SingleWebsite_full_" + website + ".csv", 'rb')
195 | csv_reader_full = csv.reader(profile_data_full, delimiter=',')
196 |
197 | binWidth_full = []
198 | acc_full = []
199 | fpr_full = []
200 | fnr_full = []
201 |
202 | for n, row in enumerate(csv_reader_full):
203 | if(n == 0):
204 | continue
205 | binWidth_full.append(row[0])
206 | acc_full.append(round(Decimal(float(row[1])), 4))
207 | fpr_full.append(round(Decimal(float(row[2])), 4))
208 | fnr_full.append(round(Decimal(float(row[3])), 4))
209 |
210 |
211 | #Gather results for truncated distribution
212 | profile_data_truncated = open("classificationResults/SingleWebsite_truncated_" + website + ".csv", 'rb')
213 | csv_reader_truncated = csv.reader(profile_data_truncated, delimiter=',')
214 |
215 | binWidth_truncated = []
216 | acc_truncated = []
217 | fpr_truncated = []
218 | fnr_truncated = []
219 |
220 | for n, row in enumerate(csv_reader_truncated):
221 | if(n == 0):
222 | continue
223 | binWidth_truncated.append(row[0])
224 | acc_truncated.append(round(Decimal(float(row[1])), 4))
225 | fpr_truncated.append(round(Decimal(float(row[2])), 4))
226 | fnr_truncated.append(round(Decimal(float(row[3])), 4))
227 |
228 |
229 | #Gather number of bins used in the truncation
230 | truncated_info_file = open("truncationInfo/" + website + ".csv", 'r')
231 | truncation_info = csv.reader(truncated_info_file, delimiter=',')
232 | truncated_bins = []
233 |
234 | for n, row in enumerate(truncation_info):
235 | if(n == 0):
236 | continue
237 | truncated_bins.append(row[1])
238 |
239 |
240 | #Generate plot
241 | fig = plt.figure()
242 | ax1 = fig.add_subplot(111)
243 |
244 | print "Current feature set: "+ str(binWidth_full)
245 | print "FNR-Full: " + str(fnr_full)
246 | print "FNR-Truncated: " + str(fnr_truncated)
247 |
248 | ind = np.arange(len(binWidth_full)) # the x locations for the groups
249 | width = 0.40
250 |
251 | rects1 = ax1.bar(ind - width, fnr_full, width, color=colors[0], label='FNR-Full')
252 | autolabel(rects1,ax1)
253 | rects2 = ax1.bar(ind, fnr_truncated, width, color=colors[1], label='FNR-Truncated')
254 | autolabel(rects2,ax1)
255 |
256 |
257 | ax1.yaxis.grid(color='black', linestyle='dotted')
258 | ax1.set_title('Truncation effect on FNR - %s'%(website), fontsize = 10)
259 |
260 | ax1.set_xticks(ind)
261 | labels = ["K = " + x + "\nBins = " + str(truncated_bins[n]) for n, x in enumerate(binWidth_full)]
262 | ax1.set_xticklabels(labels)
263 | ax1.legend()
264 |
265 | plt.xticks(fontsize=7)
266 | plt.tight_layout()
267 | plt.ylim(0, 1)
268 | fig.savefig('Figures/%s/WF_FNR_normal_%s.pdf'%(website, website)) # save the figure to file
269 | fig.savefig('Figures/%s/WF_FNR_normal_%s.png'%(website, website)) # save the figure to file
270 | plt.close(fig)
271 | profile_data_full.close()
272 | profile_data_truncated.close()
273 |
274 |
275 |
276 | def GenerateFigures():
277 | if not os.path.exists("Figures"):
278 | os.makedirs("Figures")
279 |
280 | PlotNormalFNRComparison()
281 | PlotNormalFPRComparison()
--------------------------------------------------------------------------------
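
The comparison plots above follow a single pattern: one group of bars per bin width, offset around np.arange positions, with autolabel printing each bar's height. Below is a minimal self-contained version of that pattern on made-up numbers, using the same Agg-backend guard so it runs headless (illustrative only).

import os
import numpy as np
import matplotlib
if os.environ.get('DISPLAY', '') == '':
    matplotlib.use('Agg')   # headless rendering, as in the scripts above
import matplotlib.pyplot as plt

bin_widths = ["1", "16", "64", "256"]           # made-up x categories
fpr_full = [0.02, 0.03, 0.05, 0.12]             # made-up values
fpr_trunc = [0.03, 0.04, 0.06, 0.15]

ind = np.arange(len(bin_widths))                # one group per bin width
width = 0.40
fig, ax = plt.subplots()
rects1 = ax.bar(ind - width/2, fpr_full, width, color="salmon", label="FPR-Full")
rects2 = ax.bar(ind + width/2, fpr_trunc, width, color="lightsteelblue", label="FPR-Truncated")
for rect in list(rects1) + list(rects2):        # same idea as autolabel()
    ax.text(rect.get_x() + rect.get_width()/2., 1.005*rect.get_height(),
            "{0:.2f}".format(rect.get_height()), fontsize=7, ha='center', va='bottom')
ax.set_xticks(ind)
ax.set_xticklabels(bin_widths)
ax.legend()
fig.savefig("grouped_bars_example.png")
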
/Security Tasks Evaluation/MPTAnalysis/FacetAnalysis/generateFigures.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import math
4 |
5 | import matplotlib
6 | if os.environ.get('DISPLAY','') == '':
7 | print('no display found. Using non-interactive Agg backend')
8 | matplotlib.use('Agg')
9 | import matplotlib.pyplot as plt
10 |
11 |
12 | colors = ["0.8", "0.6", "0.2", "0.0"]
13 | colors = ["salmon", "lightsteelblue", "darkseagreen", "thistle", "wheat", "khaki", "skyblue"]
14 |
15 | """
16 | Attach a text label above each bar displaying its height
17 | """
18 | def autolabel(rects, ax):
19 | for rect in rects:
20 | height = rect.get_height()
21 | ax.text(rect.get_x() + rect.get_width()/2., 1.005*height, # original height was 1.005*height
22 | "{0:.2f}".format(float(height)), fontsize=7, ha='center', va='bottom')
23 |
24 |
25 | def PlotQuantization(binWidths, n_flows):
26 | print "PlotQuantization"
27 | feature_sets = []
28 | set_acc = []
29 | set_fpr =[]
30 | set_fnr = []
31 |
32 | for binWidth in binWidths:
33 |
34 | feature_folder = 'PL_60_' + str(binWidth) + '_1500'
35 | #print feature_folder
36 |
37 | #Load configuration results
38 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy"
39 | results = np.load(data_folder)
40 | set_acc.append(results[0])
41 | set_fpr.append(results[1])
42 | set_fnr.append(results[2])
43 | feature_sets.append(feature_folder)
44 |
45 |
46 | max_acc = 0
47 | max_fset = ""
48 | for i, f_set in enumerate(feature_sets):
49 | if set_acc[i] > max_acc:
50 | max_acc = set_acc[i]
51 | max_fset = f_set
52 | print "Max acc: %s, Best quantization set: %s"%(max_acc, max_fset)
53 |
54 | fig = plt.figure(figsize=(10,4))
55 | ax1 = fig.add_subplot(111)
56 |
57 | curr_fset = feature_sets
58 | curr_acc = set_acc
59 | curr_fpr = set_fpr
60 | curr_fnr = set_fnr
61 | #print "Current feature set: "+ str(curr_fset)
62 |
63 | ind = np.arange(len(curr_fset)) # the x locations for the groups
64 | width = 0.20
65 |
66 | rects0 = ax1.bar(ind - width - width/2, curr_acc, width, color=colors[0], label='Acc')
67 | autolabel(rects0,ax1)
68 | rects1 = ax1.bar(ind - width/2 , curr_fpr, width, color=colors[1], label='FPR')
69 | autolabel(rects1,ax1)
70 | rects2 = ax1.bar(ind + width - width/2, curr_fnr, width, color=colors[2], label='FNR')
71 | autolabel(rects2,ax1)
72 |
73 |
74 | ax1.yaxis.grid(color='black', linestyle='dotted')
75 | ax1.set_title('Scores for Quantization')
76 | ax1.set_xticks(ind)
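    # Each tick label reports the quantization factor K, the resulting bin count (1500/K),
    # the per-flow marker size (bins x 4 B), and the collector-side memory for n_flows flows (KB).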
77 | labels = ["K = " + str(int(x.split('_')[2])) + " -> " + str(1500/int(x.split('_')[2])) + " bins" + "\n(PerFlow = " + str(int(1500/int(x.split('_')[2]))*4) + " B)" + "\n(CGMem = " + str((n_flows * int(1500/int(x.split('_')[2]))*4)/1024) + " KB)" for x in feature_sets]
78 | ax1.set_xticklabels(labels)
79 | plt.xticks(fontsize=7)
80 | ax1.legend()
81 |
82 | plt.ylim(top=1)
83 | plt.legend(loc='upper right', fontsize=8)
84 | plt.tight_layout()
85 | fig.savefig('Figures/Facet_bin_NoSketch.pdf') # save the figure to file
86 | fig.savefig('Figures/Facet_bin_NoSketch.png') # save the figure to file
87 | plt.close(fig)
88 |
89 |
90 | def PlotQuantizationLines(binWidths, n_flows):
91 | print "PlotQuantizationLines"
92 | feature_sets = []
93 | set_acc = []
94 |
95 | for binWidth in binWidths:
96 |
97 | feature_folder = 'PL_60_' + str(binWidth) + '_1500'
98 | #print feature_folder
99 |
100 | #Load configuration results
101 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy"
102 | results = np.load(data_folder)
103 | set_acc.append(results[3])
104 | feature_sets.append(feature_folder)
105 |
106 |
107 |
108 | fig = plt.figure(figsize=(10,4))
109 | ax1 = fig.add_subplot(111)
110 |
111 | curr_fset = feature_sets
112 | curr_acc = set_acc
113 |
114 | ind = np.arange(len(curr_fset)) # the x locations for the groups
115 | print curr_acc
116 | ax1.plot(curr_acc, color=colors[0], marker=".", markersize=12, lw=3, label='AUC')
117 | ax1.hlines(0.99, 0, len(ind)-1, lw=2, label='Baseline, AUC = 0.99')
118 |
119 | for i,j in zip(ind,curr_acc):
120 | ax1.annotate("{0:.2f}".format(j),xy=(i-0.15,j-0.08))
121 |
122 |
123 | ax1.yaxis.grid(color='black', linestyle='dotted')
124 | plt.yticks(fontsize=14)
125 | plt.ylim(bottom=0,top=1)
126 | plt.ylabel("AUC Score", fontsize=14)
127 |
128 |
129 | plt.xlim(-0.3, len(ind)-1+0.3)
130 | ax1.set_xticks(ind)
131 | labels = [str(int(x.split('_')[2])) for x in feature_sets]
132 | #labels = ["K = " + str(int(x.split('_')[2])) + " -> " + str(1500/int(x.split('_')[2])) + " bins" + "\n(PerFlow = " + str(int(1500/int(x.split('_')[2]))*4) + " B)" + "\n(CGMem = " + str((n_flows * int(1500/int(x.split('_')[2]))*4)/1024) + " KB)" for x in feature_sets]
133 | #labels = ["K = " + str(int(x.split('_')[2])) + "\nPF = " + str(int(1500/int(x.split('_')[2]))*4) + " B" + "\nTM = " + str((n_flows * int(1500/int(x.split('_')[2]))*4)/1024) + " KB" for x in feature_sets]
134 | ax1.set_xticklabels(labels)
135 | plt.xticks(fontsize=11)
136 | plt.xlabel("Quantization Factor", fontsize=14)
137 | ax1.legend()
138 |
139 |
140 | plt.legend(loc='lower right', fontsize=12)
141 | plt.tight_layout()
142 | fig.savefig('Figures/Facet_bin_NoSketch_Lines.pdf') # save the figure to file
143 | fig.savefig('Figures/Facet_bin_NoSketch_Lines.png') # save the figure to file
144 | plt.close(fig)
145 |
146 |
147 | def PlotKQuantizationAndTruncation(binWidths, topk_features, n_flows):
148 | print "PlotKQuantizationAndTruncation"
149 | if not os.path.exists('Figures/Truncation_comparison'):
150 | os.makedirs('Figures/Truncation_comparison')
151 |
152 | for binWidth in binWidths:
153 | feature_sets = []
154 | set_acc = []
155 | set_fpr =[]
156 | set_fnr = []
157 |
158 | for topk in topk_features:
159 |
160 | feature_folder = 'PL_60_' + str(binWidth) + '_' + str(topk)
161 | #print feature_folder
162 |
163 | if(topk != 1500 and topk > 1500/binWidth):
164 | #print "Skipping sample, invalid configuration. TopK = " + str(topk) + " Total Features = " + str(1500/binWidth)
165 | set_acc.append(0)
166 | set_fpr.append(0)
167 | set_fnr.append(0)
168 | feature_sets.append(feature_folder)
169 | continue
170 |
171 | #Load configuration results
172 | #if(topk == 1500):
173 | # data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase1_NoSketch.npy"
174 | #else:
175 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy"
176 | results = np.load(data_folder)
177 | set_acc.append(results[0])
178 | set_fpr.append(results[1])
179 | set_fnr.append(results[2])
180 | feature_sets.append(feature_folder)
181 |
182 |
183 | #Check best truncation value
184 | max_acc = 0
185 | max_fset = ""
186 | for i, f_set in enumerate(feature_sets[:-1]):
187 | if set_acc[i] > max_acc:
188 | max_acc = set_acc[i]
189 | max_fset = f_set
190 | print "K = " + str(binWidth) + ", Max acc: %s, Best Truncation: %s"%(max_acc, max_fset)
191 |
192 |
193 | #Plot figures
194 | fig = plt.figure(figsize=(10,4))
195 | ax1 = fig.add_subplot(111)
196 |
197 | curr_fset = feature_sets
198 | curr_acc = set_acc
199 | curr_fpr = set_fpr
200 | curr_fnr = set_fnr
201 | #print "Current feature set: "+ str(curr_fset)
202 |
203 | ind = np.arange(len(curr_fset)) # the x locations for the groups
204 | width = 0.20
205 |
206 | rects0 = ax1.bar(ind - width - width/2, curr_acc, width, color=colors[0], label='Acc')
207 | autolabel(rects0,ax1)
208 | rects1 = ax1.bar(ind - width/2 , curr_fpr, width, color=colors[1], label='FPR')
209 | autolabel(rects1,ax1)
210 | rects2 = ax1.bar(ind + width - width/2, curr_fnr, width, color=colors[2], label='FNR')
211 | autolabel(rects2,ax1)
212 |
213 | ax1.yaxis.grid(color='black', linestyle='dotted')
214 | ax1.set_title('Truncation Scores for K ='+str(binWidth))
215 | ax1.set_xticks(ind)
216 | print feature_sets
217 | labels = ["Top-k= " + str(int(x.split('_')[3])) + "\n(PerFlow = " + str(int(x.split('_')[3])*4) + " B)" + "\n(CGMem = " + str((n_flows * int(x.split('_')[3]) * 4)/1024) + " KB)" for x in feature_sets]
218 | labels[len(topk_features)-1] = str(int(1500/binWidth)) + " features\n(PerFlow = " + str(int(1500/binWidth)*4) + " B)" + "\n(CGMem = " + str(int((n_flows * int(1500/binWidth) * 4)/1024)) + " KB)"
219 | ax1.set_xticklabels(labels)
220 | plt.xticks(fontsize=9)
221 | ax1.legend()
222 |
223 | plt.ylim(top=1)
224 | plt.legend(loc='upper right', fontsize=10)
225 | plt.tight_layout()
226 | fig.savefig('Figures/Truncation_comparison/Facet_bin' + str(binWidth) + '_topk_NoSketch.pdf') # save the figure to file
227 | fig.savefig('Figures/Truncation_comparison/Facet_bin' + str(binWidth) + '_topk_NoSketch.png') # save the figure to file
228 | plt.close(fig)
229 |
230 | def PlotKQuantizationAndTruncationLines(binWidths, topk_features, n_flows):
231 | print "PlotKQuantizationAndTruncation"
232 | if not os.path.exists('Figures/Truncation_comparison'):
233 | os.makedirs('Figures/Truncation_comparison')
234 |
235 | for binWidth in binWidths:
236 | feature_sets = []
237 | set_acc = []
238 |
239 | for topk in topk_features:
240 |
241 | feature_folder = 'PL_60_' + str(binWidth) + '_' + str(topk)
242 | #print feature_folder
243 |
244 | if(topk != 1500 and topk > 1500/binWidth):
245 | #print "Skipping sample, invalid configuration. TopK = " + str(topk) + " Total Features = " + str(1500/binWidth)
246 | set_acc.append(0)
247 | feature_sets.append(feature_folder)
248 | continue
249 |
250 | #Load configuration results
251 | #if(topk == 1500):
252 | # data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase1_NoSketch.npy"
253 | #else:
254 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy"
255 | results = np.load(data_folder)
256 | set_acc.append(results[3])
257 | feature_sets.append(feature_folder)
258 |
259 |
260 | #Plot figures
261 | fig = plt.figure(figsize=(10,4))
262 | ax1 = fig.add_subplot(111)
263 |
264 | curr_fset = feature_sets
265 | curr_acc = set_acc
266 |
267 | #print "Current feature set: "+ str(curr_fset)
268 |
269 | ind = np.arange(len(curr_fset)) # the x locations for the groups
270 |
271 |
272 | ax1.plot(curr_acc, color=colors[0], marker=".", markersize=12, lw=3, label='AUC')
273 | ax1.hlines(0.99, 0, len(ind)-1, lw=3, label='Baseline, AUC = 0.99')
274 |
275 | for i,j in zip(ind,curr_acc):
276 | ax1.annotate("{0:.2f}".format(j),xy=(i-0.1,j-0.08))
277 |
278 | plt.xlim(-0.3, len(ind)-1+0.3)
279 | ax1.yaxis.grid(color='black', linestyle='dotted')
280 |
281 | ax1.set_xticks(ind)
282 | print feature_sets
283 | labels = [str(int(x.split('_')[3])) for x in feature_sets]
284 | #labels = ["Top-n= " + str(int(x.split('_')[3])) + "\nPF = " + str(int(x.split('_')[3])*4) + " B" + "\nTM = " + str((n_flows * int(x.split('_')[3]) * 4)/1024) + " KB" for x in feature_sets]
285 | #labels[len(topk_features)-1] = str(int(1500/binWidth)) + " features\n(PF = " + str(int(1500/binWidth)*4) + " B)" + "\n(TMem = " + str(int((n_flows * int(1500/binWidth) * 4)/1024)) + " KB)"
286 | ax1.set_xticklabels(labels)
287 | plt.xticks(fontsize=9)
288 | plt.xlabel("Truncation Factor", fontsize=12)
289 | ax1.legend()
290 |
291 |
292 | plt.yticks(fontsize=12)
293 | plt.ylim(bottom=0,top=1)
294 | plt.ylabel("AUC Score", fontsize=12)
295 |
296 | plt.legend(loc='lower right', fontsize=12)
297 | plt.tight_layout()
298 | fig.savefig('Figures/Truncation_comparison/Facet_bin' + str(binWidth) + '_topk_NoSketch_Lines.pdf') # save the figure to file
299 | fig.savefig('Figures/Truncation_comparison/Facet_bin' + str(binWidth) + '_topk_NoSketch_Lines.png') # save the figure to file
300 | plt.close(fig)
301 |
302 |
303 | def GenerateFigures(binWidths, topk_features, nFlows):
304 | if not os.path.exists('Figures'):
305 | os.makedirs('Figures')
306 |
307 | PlotQuantization(binWidths, nFlows)
308 | PlotKQuantizationAndTruncation(binWidths, topk_features, nFlows)
309 |
310 |
311 |
312 | def GenerateFiguresLine(binWidths, topk_features, nFlows):
313 | if not os.path.exists('Figures'):
314 | os.makedirs('Figures')
315 |
316 | TOPK = [10, 20, 30, 40, 50]
317 | PlotQuantizationLines(binWidths, nFlows)
318 | PlotKQuantizationAndTruncationLines(binWidths, TOPK, nFlows)
319 |
320 |
321 |
322 | if __name__ == "__main__":
323 |
324 | #Quantization
325 | BIN_WIDTH = [1, 4, 8, 16, 32, 64, 128, 256]
326 |
327 | #Truncation Top-K features
328 | TOPK = [5, 10, 20, 30, 40, 50, 1500]
329 | TOPK = [10, 20, 30, 40, 50]
330 |
331 | #Total amount of flows per dataset
332 | N_FLOWS = 1000
333 |
334 | PlotQuantizationLines(BIN_WIDTH, N_FLOWS)
335 | PlotKQuantizationAndTruncationLines(BIN_WIDTH, TOPK, N_FLOWS)
--------------------------------------------------------------------------------
/Security Tasks Evaluation/MPTAnalysis/DeltaShaperAnalysis/generateFigures.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import math
4 |
5 | import matplotlib
6 | if os.environ.get('DISPLAY','') == '':
7 | print('no display found. Using non-interactive Agg backend')
8 | matplotlib.use('Agg')
9 | import matplotlib.pyplot as plt
10 |
11 |
12 | colors = ["0.8", "0.6", "0.2", "0.0"]
13 | colors = ["salmon", "lightsteelblue", "darkseagreen", "thistle", "wheat", "khaki", "skyblue"]
14 |
15 | """
16 | Attach a text label above each bar displaying its height
17 | """
18 | def autolabel(rects, ax):
19 | for rect in rects:
20 | height = rect.get_height()
21 |         ax.text(rect.get_x() + rect.get_width()/2., 1.005*height,
22 | "{0:.2f}".format(float(height)), fontsize=7, ha='center', va='bottom')
23 |
24 |
25 | def PlotQuantization(binWidths, n_flows):
26 | print "PlotQuantization"
27 | feature_sets = []
28 | set_acc = []
29 | set_fpr =[]
30 | set_fnr = []
31 |
32 | for binWidth in binWidths:
33 |
34 | feature_folder = 'PL_60_' + str(binWidth) + '_1500'
35 | #print feature_folder
36 |
37 | #Load configuration results
38 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy"
39 | results = np.load(data_folder)
40 | set_acc.append(results[0])
41 | set_fpr.append(results[1])
42 | set_fnr.append(results[2])
43 | feature_sets.append(feature_folder)
44 |
45 |
46 | max_acc = 0
47 | max_fset = ""
48 | for i, f_set in enumerate(feature_sets):
49 | if set_acc[i] > max_acc:
50 | max_acc = set_acc[i]
51 | max_fset = f_set
52 | print "Max acc: %s, Best quantization set: %s"%(max_acc, max_fset)
53 |
54 | fig = plt.figure(figsize=(10,4))
55 | ax1 = fig.add_subplot(111)
56 |
57 | curr_fset = feature_sets
58 | curr_acc = set_acc
59 | curr_fpr = set_fpr
60 | curr_fnr = set_fnr
61 | #print "Current feature set: "+ str(curr_fset)
62 |
63 | ind = np.arange(len(curr_fset)) # the x locations for the groups
64 | width = 0.20
65 |
66 | rects0 = ax1.bar(ind - width - width/2, curr_acc, width, color=colors[0], label='Acc')
67 | autolabel(rects0,ax1)
68 | rects1 = ax1.bar(ind - width/2 , curr_fpr, width, color=colors[1], label='FPR')
69 | autolabel(rects1,ax1)
70 | rects2 = ax1.bar(ind + width - width/2, curr_fnr, width, color=colors[2], label='FNR')
71 | autolabel(rects2,ax1)
72 |
73 |
74 | ax1.yaxis.grid(color='black', linestyle='dotted')
75 | ax1.set_title('Scores for Quantization')
76 | ax1.set_xticks(ind)
77 | labels = ["K = " + str(int(x.split('_')[2])) + " -> " + str(1500/int(x.split('_')[2])) + " bins" + "\n(PerFlow = " + str(int(1500/int(x.split('_')[2]))*4) + " B)" + "\n(CGMem = " + str((n_flows * int(1500/int(x.split('_')[2]))*4)/1024) + " KB)" for x in feature_sets]
78 | ax1.set_xticklabels(labels)
79 | plt.xticks(fontsize=7)
80 | ax1.legend()
81 |
82 | plt.ylim(top=1)
83 | plt.legend(loc='upper right', fontsize=8)
84 | plt.tight_layout()
85 | fig.savefig('Figures/DeltaShaper_bin_NoSketch.pdf') # save the figure to file
86 | fig.savefig('Figures/DeltaShaper_bin_NoSketch.png') # save the figure to file
87 | plt.close(fig)
88 |
89 |
90 | def PlotQuantizationLines(binWidths, n_flows):
91 | print "PlotQuantizationLines"
92 | feature_sets = []
93 | set_acc = []
94 |
95 | for binWidth in binWidths:
96 |
97 | feature_folder = 'PL_60_' + str(binWidth) + '_1500'
98 | #print feature_folder
99 |
100 | #Load configuration results
101 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy"
102 | results = np.load(data_folder)
103 | set_acc.append(results[3])
104 | feature_sets.append(feature_folder)
105 |
106 |
107 |
108 | fig = plt.figure(figsize=(10,4))
109 | ax1 = fig.add_subplot(111)
110 |
111 | curr_fset = feature_sets
112 | curr_acc = set_acc
113 |
114 | ind = np.arange(len(curr_fset)) # the x locations for the groups
115 | print curr_acc
116 | ax1.plot(curr_acc, color=colors[0], marker=".", markersize=12, lw=3, label='AUC')
117 | ax1.hlines(0.95, 0, len(ind)-1, lw=3, label='Baseline, AUC = 0.95')
118 |
119 | for i,j in zip(ind,curr_acc):
120 | ax1.annotate("{0:.2f}".format(j),xy=(i-0.15,j+0.03))
121 |
122 | ax1.yaxis.grid(color='black', linestyle='dotted')
123 | plt.yticks(fontsize=14)
124 | plt.ylim(bottom=0,top=1)
125 | plt.ylabel("AUC Score", fontsize=14)
126 |
127 |
128 | plt.xlim(-0.3, len(ind)-1+0.3)
129 | ax1.set_xticks(ind)
130 | #labels = ["K = " + str(int(x.split('_')[2])) + " -> " + str(1500/int(x.split('_')[2])) + " bins" + "\n(PerFlow = " + str(int(1500/int(x.split('_')[2]))*4) + " B)" + "\n(CGMem = " + str((n_flows * int(1500/int(x.split('_')[2]))*4)/1024) + " KB)" for x in feature_sets]
131 | #labels = ["K = " + str(int(x.split('_')[2])) + "\nPF = " + str(int(1500/int(x.split('_')[2]))*4) + " B" + "\nTM = " + str((n_flows * int(1500/int(x.split('_')[2]))*4)/1024) + " KB" for x in feature_sets]
132 | labels = [str(int(x.split('_')[2])) for x in feature_sets]
133 |
134 | ax1.set_xticklabels(labels)
135 | plt.xticks(fontsize=9)
136 | plt.xlabel("DeltaShaper Quantization Factor (K)", fontsize=12)
137 | ax1.legend()
138 |
139 |
140 | plt.legend(loc='lower right', fontsize=12)
141 | plt.tight_layout()
142 | fig.savefig('Figures/DeltaShaper_bin_NoSketch_Lines.pdf') # save the figure to file
143 | fig.savefig('Figures/DeltaShaper_bin_NoSketch_Lines.png') # save the figure to file
144 | plt.close(fig)
145 |
146 |
147 | def PlotKQuantizationAndTruncation(binWidths, topk_features, n_flows):
148 | print "PlotKQuantizationAndTruncation"
149 | if not os.path.exists('Figures/Truncation_comparison'):
150 | os.makedirs('Figures/Truncation_comparison')
151 |
152 | for binWidth in binWidths:
153 | feature_sets = []
154 | set_acc = []
155 | set_fpr =[]
156 | set_fnr = []
157 |
158 | for topk in topk_features:
159 |
160 | feature_folder = 'PL_60_' + str(binWidth) + '_' + str(topk)
161 | #print feature_folder
162 |
163 | if(topk != 1500 and topk > 1500/binWidth):
164 | #print "Skipping sample, invalid configuration. TopK = " + str(topk) + " Total Features = " + str(1500/binWidth)
165 | set_acc.append(0)
166 | set_fpr.append(0)
167 | set_fnr.append(0)
168 | feature_sets.append(feature_folder)
169 | continue
170 |
171 | #Load configuration results
172 | #if(topk == 1500):
173 | # data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase1_NoSketch.npy"
174 | #else:
175 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy"
176 | results = np.load(data_folder)
177 | set_acc.append(results[0])
178 | set_fpr.append(results[1])
179 | set_fnr.append(results[2])
180 | feature_sets.append(feature_folder)
181 |
182 |
183 | #Check best truncation value
184 | max_acc = 0
185 | max_fset = ""
186 | for i, f_set in enumerate(feature_sets[:-1]):
187 | if set_acc[i] > max_acc:
188 | max_acc = set_acc[i]
189 | max_fset = f_set
190 | print "K = " + str(binWidth) + ", Max acc: %s, Best Truncation: %s"%(max_acc, max_fset)
191 |
192 |
193 | #Plot figures
194 | fig = plt.figure(figsize=(10,4))
195 | ax1 = fig.add_subplot(111)
196 |
197 | curr_fset = feature_sets
198 | curr_acc = set_acc
199 | curr_fpr = set_fpr
200 | curr_fnr = set_fnr
201 | #print "Current feature set: "+ str(curr_fset)
202 |
203 | ind = np.arange(len(curr_fset)) # the x locations for the groups
204 | width = 0.20
205 |
206 | rects0 = ax1.bar(ind - width - width/2, curr_acc, width, color=colors[0], label='Acc')
207 | autolabel(rects0,ax1)
208 | rects1 = ax1.bar(ind - width/2 , curr_fpr, width, color=colors[1], label='FPR')
209 | autolabel(rects1,ax1)
210 | rects2 = ax1.bar(ind + width - width/2, curr_fnr, width, color=colors[2], label='FNR')
211 | autolabel(rects2,ax1)
212 |
213 | ax1.yaxis.grid(color='black', linestyle='dotted')
214 | ax1.set_title('Truncation Scores for K ='+str(binWidth))
215 | ax1.set_xticks(ind)
216 | print feature_sets
217 | labels = ["Top-k= " + str(int(x.split('_')[3])) + "\n(PerFlow = " + str(int(x.split('_')[3])*4) + " B)" + "\n(CGMem = " + str((n_flows * int(x.split('_')[3]) * 4)/1024) + " KB)" for x in feature_sets]
218 | labels[len(topk_features)-1] = str(int(1500/binWidth)) + " features\n(PerFlow = " + str(int(1500/binWidth)*4) + " B)" + "\n(CGMem = " + str(int((n_flows * int(1500/binWidth) * 4)/1024)) + " KB)"
219 | ax1.set_xticklabels(labels)
220 | plt.xticks(fontsize=9)
221 | ax1.legend()
222 |
223 | plt.ylim(top=1)
224 | plt.legend(loc='upper right', fontsize=10)
225 | plt.tight_layout()
226 | fig.savefig('Figures/Truncation_comparison/DeltaShaper_bin' + str(binWidth) + '_topk_NoSketch.pdf') # save the figure to file
227 | fig.savefig('Figures/Truncation_comparison/DeltaShaper_bin' + str(binWidth) + '_topk_NoSketch.png') # save the figure to file
228 | plt.close(fig)
229 |
230 | def PlotKQuantizationAndTruncationLines(binWidths, topk_features, n_flows):
231 | print "PlotKQuantizationAndTruncation"
232 | if not os.path.exists('Figures/Truncation_comparison'):
233 | os.makedirs('Figures/Truncation_comparison')
234 |
235 | for binWidth in binWidths:
236 | feature_sets = []
237 | set_acc = []
238 |
239 | for topk in topk_features:
240 |
241 | feature_folder = 'PL_60_' + str(binWidth) + '_' + str(topk)
242 | #print feature_folder
243 |
244 | if(topk != 1500 and topk > 1500/binWidth):
245 | #print "Skipping sample, invalid configuration. TopK = " + str(topk) + " Total Features = " + str(1500/binWidth)
246 | set_acc.append(0)
247 | feature_sets.append(feature_folder)
248 | continue
249 |
250 | #Load configuration results
251 | #if(topk == 1500):
252 | # data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase1_NoSketch.npy"
253 | #else:
254 | data_folder = 'classificationResults/' + feature_folder + '/' + "classificationResults_phase2_NoSketch.npy"
255 | results = np.load(data_folder)
256 | set_acc.append(results[3])
257 | feature_sets.append(feature_folder)
258 |
259 |
260 | #Plot figures
261 | fig = plt.figure(figsize=(10,4))
262 | ax1 = fig.add_subplot(111)
263 |
264 | curr_fset = feature_sets
265 | curr_acc = set_acc
266 |
267 | #print "Current feature set: "+ str(curr_fset)
268 |
269 | ind = np.arange(len(curr_fset)) # the x locations for the groups
270 |
271 |
272 | ax1.plot(curr_acc, color=colors[0], marker=".", markersize=12, lw=3, label='AUC')
273 | ax1.hlines(0.95, 0, len(ind)-1, lw=3, label='Baseline, AUC = 0.95')
274 |
275 | for i,j in zip(ind,curr_acc):
276 | ax1.annotate("{0:.2f}".format(j),xy=(i-0.1,j+0.03))
277 |
278 | plt.xlim(-0.3, len(ind)-1+0.3)
279 | ax1.yaxis.grid(color='black', linestyle='dotted')
280 |
281 | ax1.set_xticks(ind)
282 | print feature_sets
283 | labels = [str(int(x.split('_')[3])) for x in feature_sets]
284 | #labels = ["Top-n= " + str(int(x.split('_')[3])) + "\nPF = " + str(int(x.split('_')[3])*4) + " B" + "\nTM = " + str((n_flows * int(x.split('_')[3]) * 4)/1024) + " KB" for x in feature_sets]
285 | #labels[len(topk_features)-1] = str(int(1500/binWidth)) + " features\n(PF = " + str(int(1500/binWidth)*4) + " B)" + "\n(TMem = " + str(int((n_flows * int(1500/binWidth) * 4)/1024)) + " KB)"
286 | ax1.set_xticklabels(labels)
287 | plt.xticks(fontsize=9)
288 | plt.xlabel("Truncation Factor", fontsize=12)
289 | ax1.legend()
290 |
291 |
292 | plt.yticks(fontsize=12)
293 | plt.ylim(bottom=0,top=1)
294 | plt.ylabel("AUC Score", fontsize=12)
295 |
296 | plt.legend(loc='lower right', fontsize=12)
297 | plt.tight_layout()
298 | fig.savefig('Figures/Truncation_comparison/DeltaShaper_bin' + str(binWidth) + '_topk_NoSketch_Lines.pdf') # save the figure to file
299 | fig.savefig('Figures/Truncation_comparison/DeltaShaper_bin' + str(binWidth) + '_topk_NoSketch_Lines.png') # save the figure to file
300 | plt.close(fig)
301 |
302 |
303 |
304 | def GenerateFigures(binWidths, topk_features, nFlows):
305 | if not os.path.exists('Figures'):
306 | os.makedirs('Figures')
307 |
308 | PlotQuantization(binWidths, nFlows)
309 | PlotKQuantizationAndTruncation(binWidths, topk_features, nFlows)
310 |
311 |
312 | def GenerateFiguresLines(binWidths, topk_features, nFlows):
313 | if not os.path.exists('Figures'):
314 | os.makedirs('Figures')
315 |
316 | TOPK = [10, 20, 30, 40, 50]
317 | PlotQuantizationLines(binWidths, nFlows)
318 | PlotKQuantizationAndTruncationLines(binWidths, TOPK, nFlows)
319 |
320 |
321 |
322 | if __name__ == "__main__":
323 |
324 | #Quantization
325 | BIN_WIDTH = [1, 4, 8, 16, 32, 64, 128, 256]
326 |
327 | #Truncation Top-K features
328 | TOPK = [5, 10, 20, 30, 40, 50, 1500]
329 | TOPK = [10, 20, 30, 40, 50]
330 |
331 |
332 | #Total amount of flows per dataset
333 | N_FLOWS = 300
334 |
335 | PlotQuantizationLines(BIN_WIDTH, N_FLOWS)
336 | PlotKQuantizationAndTruncationLines(BIN_WIDTH, TOPK, N_FLOWS)
--------------------------------------------------------------------------------
/Security Tasks Evaluation/MPTAnalysis/FacetAnalysis/compressive_ta.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import numpy as np
3 | import os
4 | import math
5 |
6 |
7 | def CreateCompressiveRepresentation(MODE, BIN_WIDTH, TOPK, SIGMA_PARAM, COMPRESSIVE_RATIO):
8 |
9 | for compressive_ratio in COMPRESSIVE_RATIO:
10 | for binWidth in BIN_WIDTH:
11 | for topk in TOPK:
12 | feature_set = 'PL_60_' + str(binWidth) + '_' + str(topk)
13 | data_folder = 'FeatureSets/' + feature_set + '/'
14 |
15 | #Sensing matrix parameters
16 | N = 0
17 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r')
18 | reader = csv.reader(f, delimiter=',')
19 | for n, row in enumerate(reader):
20 | if(n == 0):
21 | N = len(row) -1 #Read number of bins from file
22 | f.close()
23 |
24 | M = N/compressive_ratio
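                # e.g., a feature file with N = 94 bins and compressive_ratio = 4 gives
                # M = 94/4 = 23 (Python 2 integer division), so each flow's 94-dimensional
                # histogram is mapped to 23 compressed measurements.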
25 |
26 | if(M < 1):
27 |                     print "Cannot compress further (features = %d, ratio = %d): fewer than 1 feature would remain"%(N, compressive_ratio)
28 | continue
29 |
30 | np.random.seed(1)
31 |
32 | print "Compressive Ratio: %d"%(compressive_ratio)
33 | print "M: %d"%(M)
34 | print "N: %d"%(N)
35 |
36 | ######################################
37 | # GAUSSIAN SENSING MATRIX
38 | ######################################
39 | if MODE == "compressive_gaussian":
40 | print "Start Compressive Gaussian Representation"
41 | for sigma_param in SIGMA_PARAM:
42 |
43 | """
44 | Generate sensing matrix
45 | """
46 |
47 | sensing_matrix = np.random.normal(0,1,(M,N))
48 |
49 | """
50 | Process Phase 1 Data
51 | """
52 |
53 | #Regular Traffic
54 | print "Compressive Gaussian: Phase 1, Regular - " + feature_set + "/CompressiveGaussian_regularTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv"
55 | output = open(data_folder + "CompressiveGaussian_regularTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w")
56 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r')
57 | reader = csv.reader(f, delimiter=',')
58 |
59 | #Process data row
60 | for n, row in enumerate(reader):
61 | if(n == 0):
62 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n")
63 | else:
64 | #Gather the first n packets array
65 | first_n_packets_vector = []
66 | for i in row[:-1]:
67 | first_n_packets_vector.append(int(i))
68 |
69 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector)
70 |
71 | #print "Compressed vector: " + str(compressed_vector)
72 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n")
73 | output.close()
74 |
75 |
76 | #Facet Traffic
77 | print "Compressive Gaussian: Phase 1, Facet - " + feature_set + "/CompressiveGaussian_facetTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv"
78 | output = open(data_folder + "CompressiveGaussian_facetTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w")
79 | f = open(data_folder + "FacetTraffic_50_phase1_dataset.csv", 'r')
80 | reader = csv.reader(f, delimiter=',')
81 |
82 | #Process data row
83 | for n, row in enumerate(reader):
84 | if(n == 0):
85 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n")
86 | else:
87 | #Gather the first n packets array
88 | first_n_packets_vector = []
89 | for i in row[:-1]:
90 | first_n_packets_vector.append(int(i))
91 |
92 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector)
93 |
94 | #print "Compressed vector: " + str(compressed_vector)
95 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n")
96 | output.close()
97 |
98 | ########################################################################################
99 | ########################################################################################
100 | ########################################################################################
101 |
102 |
103 | """
104 | Process Phase 2 Data
105 | """
106 |
107 | #Regular Traffic
108 | print "Compressive Gaussian: Phase 2, Regular - " + feature_set + "/CompressiveGaussian_regularTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv"
109 | output = open(data_folder + "CompressiveGaussian_regularTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w")
110 | f = open(data_folder + "RegularTraffic_phase2_dataset.csv", 'r')
111 | reader = csv.reader(f, delimiter=',')
112 |
113 | #Process data row
114 | for n, row in enumerate(reader):
115 | if(n == 0):
116 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n")
117 | else:
118 | #Gather the first n packets array
119 | first_n_packets_vector = []
120 | for i in row[:-1]:
121 | first_n_packets_vector.append(int(i))
122 |
123 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector)
124 |
125 | #print "Compressed vector: " + str(compressed_vector)
126 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n")
127 | output.close()
128 |
129 |
130 | #Facet Traffic
131 |                         print "Compressive Gaussian: Phase 2, Facet - " + feature_set + "/CompressiveGaussian_facetTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv"
132 | output = open(data_folder + "CompressiveGaussian_facetTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w")
133 | f = open(data_folder + "FacetTraffic_50_phase2_dataset.csv", 'r')
134 | reader = csv.reader(f, delimiter=',')
135 |
136 | #Process data row
137 | for n, row in enumerate(reader):
138 | if(n == 0):
139 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n")
140 | else:
141 | #Gather the first n packets array
142 | first_n_packets_vector = []
143 | for i in row[:-1]:
144 | first_n_packets_vector.append(int(i))
145 |
146 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector)
147 |
148 | #print "Compressed vector: " + str(compressed_vector)
149 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n")
150 | output.close()
151 |
152 | ######################################
153 | # BERNOULLI SENSING MATRIX
154 | ######################################
155 | elif MODE == "compressive_bernoulli":
156 | print "Start Compressive Bernoulli Representation"
157 |
158 | """
159 | Generate sensing matrix
160 | """
161 | values = [-1/float(math.sqrt(N)), 1/float(math.sqrt(N))]
162 | sensing_matrix = np.random.choice(values,(M,N), p=[0.5, 0.5])
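                        # The Bernoulli sensing matrix is a scaled Rademacher matrix: every entry is
                        # +1/sqrt(N) or -1/sqrt(N) with probability 1/2, so each row has unit Euclidean norm.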
163 |
164 |
165 | """
166 | Process Phase 1 Data
167 | """
168 |
169 | #Regular Traffic
170 | print "Compressive Bernoulli: Phase 1, Regular - " + feature_set + "/CompressiveBernoulli_regularTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv"
171 | output = open(data_folder + "CompressiveBernoulli_regularTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv", "w")
172 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r')
173 | reader = csv.reader(f, delimiter=',')
174 |
175 | #Process data row
176 | for n, row in enumerate(reader):
177 | if(n == 0):
178 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n")
179 | else:
180 | #Gather the first n packets array
181 | first_n_packets_vector = []
182 | for i in row[:-1]:
183 | first_n_packets_vector.append(int(i))
184 |
185 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector)
186 |
187 | #print "Compressed vector: " + str(compressed_vector)
188 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n")
189 | output.close()
190 |
191 |
192 | #Facet Traffic
193 | print "Compressive Bernoulli: Phase 1, Facet - " + feature_set + "/CompressiveBernoulli_facetTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv"
194 | output = open(data_folder + "CompressiveBernoulli_facetTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv", "w")
195 | f = open(data_folder + "FacetTraffic_50_phase1_dataset.csv", 'r')
196 | reader = csv.reader(f, delimiter=',')
197 |
198 | #Process data row
199 | for n, row in enumerate(reader):
200 | if(n == 0):
201 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n")
202 | else:
203 | #Gather the first n packets array
204 | first_n_packets_vector = []
205 | for i in row[:-1]:
206 | first_n_packets_vector.append(int(i))
207 |
208 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector)
209 |
210 | #print "Compressed vector: " + str(compressed_vector)
211 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n")
212 | output.close()
213 |
214 | ########################################################################################
215 | ########################################################################################
216 | ########################################################################################
217 |
218 |
219 | """
220 | Process Phase 2 Data
221 | """
222 |
223 | #Regular Traffic
224 | print "Compressive Bernoulli: Phase 2, Regular - " + feature_set + "/CompressiveBernoulli_regularTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv"
225 | output = open(data_folder + "CompressiveBernoulli_regularTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv", "w")
226 | f = open(data_folder + "RegularTraffic_phase2_dataset.csv", 'r')
227 | reader = csv.reader(f, delimiter=',')
228 |
229 | #Process data row
230 | for n, row in enumerate(reader):
231 | if(n == 0):
232 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n")
233 | else:
234 | #Gather the first n packets array
235 | first_n_packets_vector = []
236 | for i in row[:-1]:
237 | first_n_packets_vector.append(int(i))
238 |
239 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector)
240 |
241 | #print "Compressed vector: " + str(compressed_vector)
242 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n")
243 | output.close()
244 |
245 |
246 | #Facet Traffic
247 | print "Compressive Bernoulli Phase 2, Facet - " + feature_set + "/CompressiveBernoulli_facetTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv"
248 | output = open(data_folder + "CompressiveBernoulli_facetTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv", "w")
249 | f = open(data_folder + "FacetTraffic_50_phase2_dataset.csv", 'r')
250 | reader = csv.reader(f, delimiter=',')
251 |
252 | #Process data row
253 | for n, row in enumerate(reader):
254 | if(n == 0):
255 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n")
256 | else:
257 | #Gather the first n packets array
258 | first_n_packets_vector = []
259 | for i in row[:-1]:
260 | first_n_packets_vector.append(int(i))
261 |
262 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector)
263 |
264 | #print "Compressed vector: " + str(compressed_vector)
265 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n")
266 | output.close()
--------------------------------------------------------------------------------
/Security Tasks Evaluation/MPTAnalysis/DeltaShaperAnalysis/compressive_ta.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import numpy as np
3 | import os
4 | import math
5 |
6 |
7 | def CreateCompressiveRepresentation(MODE, BIN_WIDTH, TOPK, SIGMA_PARAM, COMPRESSIVE_RATIO):
8 |
9 | for compressive_ratio in COMPRESSIVE_RATIO:
10 | for binWidth in BIN_WIDTH:
11 | for topk in TOPK:
12 | feature_set = 'PL_60_' + str(binWidth) + '_' + str(topk)
13 | data_folder = 'FeatureSets/' + feature_set + '/'
14 |
15 | #Sensing matrix parameters
16 | N = 0
17 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r')
18 | reader = csv.reader(f, delimiter=',')
19 | for n, row in enumerate(reader):
20 | if(n == 0):
21 | N = len(row) -1 #Read number of bins from file
22 | f.close()
23 |
24 | M = N/compressive_ratio
25 |
26 | if(M < 1):
27 |                     print "Cannot compress further (features = %d, ratio = %d): fewer than 1 feature would remain"%(N, compressive_ratio)
28 | continue
29 |
30 | np.random.seed(1)
31 |
32 | print "Compressive Ratio: %d"%(compressive_ratio)
33 | print "M: %d"%(M)
34 | print "N: %d"%(N)
35 |
36 | ######################################
37 | # GAUSSIAN SENSING MATRIX
38 | ######################################
39 | if MODE == "compressive_gaussian":
40 | print "Start Compressive Gaussian Representation"
41 | for sigma_param in SIGMA_PARAM:
42 |
43 | """
44 | Generate sensing matrix
45 | """
46 |
47 | sensing_matrix = np.random.normal(0,1,(M,N))
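                            # Projects each N-bin packet-length histogram x into M measurements y = A.x,
                            # where A has i.i.d. standard-normal entries; note that sigma_param only
                            # appears in the output file names in this branch.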
48 |
49 | """
50 | Process Phase 1 Data
51 | """
52 |
53 | #Regular Traffic
54 | print "Compressive Gaussian: Phase 1, Regular - " + feature_set + "/CompressiveGaussian_regularTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv"
55 | output = open(data_folder + "CompressiveGaussian_regularTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w")
56 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r')
57 | reader = csv.reader(f, delimiter=',')
58 |
59 | #Process data row
60 | for n, row in enumerate(reader):
61 | if(n == 0):
62 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n")
63 | else:
64 | #Gather the first n packets array
65 | first_n_packets_vector = []
66 | for i in row[:-1]:
67 | first_n_packets_vector.append(int(i))
68 |
69 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector)
70 |
71 | #print "Compressed vector: " + str(compressed_vector)
72 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n")
73 | output.close()
74 |
75 |
76 | #DeltaShaper Traffic
77 | print "Compressive Gaussian: Phase 1, DeltaShaper - " + feature_set + "/CompressiveGaussian_deltashaperTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv"
78 | output = open(data_folder + "CompressiveGaussian_deltashaperTraffic_phase1_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w")
79 | f = open(data_folder + "DeltaShaperTraffic_320_phase1_dataset.csv", 'r')
80 | reader = csv.reader(f, delimiter=',')
81 |
82 | #Process data row
83 | for n, row in enumerate(reader):
84 | if(n == 0):
85 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n")
86 | else:
87 | #Gather the first n packets array
88 | first_n_packets_vector = []
89 | for i in row[:-1]:
90 | first_n_packets_vector.append(int(i))
91 |
92 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector)
93 |
94 | #print "Compressed vector: " + str(compressed_vector)
95 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n")
96 | output.close()
97 |
98 | ########################################################################################
99 | ########################################################################################
100 | ########################################################################################
101 |
102 |
103 | """
104 | Process Phase 2 Data
105 | """
106 |
107 | #Regular Traffic
108 | print "Compressive Gaussian: Phase 2, Regular - " + feature_set + "/CompressiveGaussian_regularTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv"
109 | output = open(data_folder + "CompressiveGaussian_regularTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w")
110 | f = open(data_folder + "RegularTraffic_phase2_dataset.csv", 'r')
111 | reader = csv.reader(f, delimiter=',')
112 |
113 | #Process data row
114 | for n, row in enumerate(reader):
115 | if(n == 0):
116 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n")
117 | else:
118 | #Gather the first n packets array
119 | first_n_packets_vector = []
120 | for i in row[:-1]:
121 | first_n_packets_vector.append(int(i))
122 |
123 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector)
124 |
125 | #print "Compressed vector: " + str(compressed_vector)
126 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n")
127 | output.close()
128 |
129 |
130 | #DeltaShaper Traffic
131 |                         print "Compressive Gaussian: Phase 2, DeltaShaper - " + feature_set + "/CompressiveGaussian_deltashaperTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv"
132 | output = open(data_folder + "CompressiveGaussian_deltashaperTraffic_phase2_" + str(sigma_param) + "_" + str(compressive_ratio) + "_dataset.csv", "w")
133 | f = open(data_folder + "DeltaShaperTraffic_320_phase2_dataset.csv", 'r')
134 | reader = csv.reader(f, delimiter=',')
135 |
136 | #Process data row
137 | for n, row in enumerate(reader):
138 | if(n == 0):
139 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n")
140 | else:
141 | #Gather the first n packets array
142 | first_n_packets_vector = []
143 | for i in row[:-1]:
144 | first_n_packets_vector.append(int(i))
145 |
146 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector)
147 |
148 | #print "Compressed vector: " + str(compressed_vector)
149 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n")
150 | output.close()
151 |
152 | ######################################
153 | # BERNOULLI SENSING MATRIX
154 | ######################################
155 | elif MODE == "compressive_bernoulli":
156 | print "Start Compressive Bernoulli Representation"
157 |
158 | """
159 | Generate sensing matrix
160 | """
161 | values = [-1/float(math.sqrt(N)), 1/float(math.sqrt(N))]
162 | sensing_matrix = np.random.choice(values,(M,N), p=[0.5, 0.5])
163 |
164 |
165 | """
166 | Process Phase 1 Data
167 | """
168 |
169 | #Regular Traffic
170 | print "Compressive Bernoulli: Phase 1, Regular - " + feature_set + "/CompressiveBernoulli_regularTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv"
171 | output = open(data_folder + "CompressiveBernoulli_regularTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv", "w")
172 | f = open(data_folder + "RegularTraffic_phase1_dataset.csv", 'r')
173 | reader = csv.reader(f, delimiter=',')
174 |
175 | #Process data row
176 | for n, row in enumerate(reader):
177 | if(n == 0):
178 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n")
179 | else:
180 | #Gather the first n packets array
181 | first_n_packets_vector = []
182 | for i in row[:-1]:
183 | first_n_packets_vector.append(int(i))
184 |
185 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector)
186 |
187 | #print "Compressed vector: " + str(compressed_vector)
188 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n")
189 | output.close()
190 |
191 |
192 | #DeltaShaper Traffic
193 | print "Compressive Bernoulli: Phase 1, DeltaShaper - " + feature_set + "/CompressiveBernoulli_deltashaperTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv"
194 | output = open(data_folder + "CompressiveBernoulli_deltashaperTraffic_phase1_" + str(compressive_ratio) + "_dataset.csv", "w")
195 | f = open(data_folder + "DeltaShaperTraffic_320_phase1_dataset.csv", 'r')
196 | reader = csv.reader(f, delimiter=',')
197 |
198 | #Process data row
199 | for n, row in enumerate(reader):
200 | if(n == 0):
201 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n")
202 | else:
203 | #Gather the first n packets array
204 | first_n_packets_vector = []
205 | for i in row[:-1]:
206 | first_n_packets_vector.append(int(i))
207 |
208 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector)
209 |
210 | #print "Compressed vector: " + str(compressed_vector)
211 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n")
212 | output.close()
213 |
214 | ########################################################################################
215 | ########################################################################################
216 | ########################################################################################
217 |
218 |
219 | """
220 | Process Phase 2 Data
221 | """
222 |
223 | #Regular Traffic
224 | print "Compressive Bernoulli: Phase 2, Regular - " + feature_set + "/CompressiveBernoulli_regularTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv"
225 | output = open(data_folder + "CompressiveBernoulli_regularTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv", "w")
226 | f = open(data_folder + "RegularTraffic_phase2_dataset.csv", 'r')
227 | reader = csv.reader(f, delimiter=',')
228 |
229 | #Process data row
230 | for n, row in enumerate(reader):
231 | if(n == 0):
232 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n")
233 | else:
234 | #Gather the first n packets array
235 | first_n_packets_vector = []
236 | for i in row[:-1]:
237 | first_n_packets_vector.append(int(i))
238 |
239 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector)
240 |
241 | #print "Compressed vector: " + str(compressed_vector)
242 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n")
243 | output.close()
244 |
245 |
246 | #DeltaShaper Traffic
247 | print "Compressive Bernoulli Phase 2, DeltaShaper - " + feature_set + "/CompressiveBernoulli_deltashaperTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv"
248 | output = open(data_folder + "CompressiveBernoulli_deltashaperTraffic_phase2_" + str(compressive_ratio) + "_dataset.csv", "w")
249 | f = open(data_folder + "DeltaShaperTraffic_320_phase2_dataset.csv", 'r')
250 | reader = csv.reader(f, delimiter=',')
251 |
252 | #Process data row
253 | for n, row in enumerate(reader):
254 | if(n == 0):
255 | output.write(",".join(str(x) for x in range(0,M)) + "," + row[-1] + "\n")
256 | else:
257 | #Gather the first n packets array
258 | first_n_packets_vector = []
259 | for i in row[:-1]:
260 | first_n_packets_vector.append(int(i))
261 |
262 | compressed_vector = np.matmul(sensing_matrix, first_n_packets_vector)
263 |
264 | #print "Compressed vector: " + str(compressed_vector)
265 | output.write(",".join(str(x) for x in compressed_vector) + "," + row[-1] + "\n")
266 | output.close()
--------------------------------------------------------------------------------
/Flow Marker Accumulator/flowlens-v1model.p4:
--------------------------------------------------------------------------------
1 | /* -*- P4_16 -*- */
2 | #include <core.p4>
3 | #include <v1model.p4>
4 |
5 | /*Set number of shifts according to the quantization level
6 | QL=2, 1
7 | QL=4, 2
8 | QL=8, 3
9 | QL=16, 4
10 | QL=32, 5
11 | QL=64, 6
12 | QL=128, 7
13 | QL=256, 8
14 | */
15 |
16 | /* In our running example, we will use QL=16 */
17 | const bit<8> BIN_WIDTH_SHIFT = 4;
18 |
19 | /* Number of counters for each flow */
20 | const bit<32> FLOW_BINS = 1500 >> BIN_WIDTH_SHIFT; // 1500 >> 4 = 93 bin counters per flow for QL=16
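/* Worked example for QL=16 (BIN_WIDTH_SHIFT = 4): a 900-byte packet falls into
   bin 900 >> 4 = 56, and a 1500-byte packet into bin 93. */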
21 |
22 | /* Number of flows in each partition */
23 | const bit<32> FLOWS_PER_PARTITION = 3000;
24 |
25 | const bit<32> PARTITION_SIZE = FLOWS_PER_PARTITION*FLOW_BINS;
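/* With QL=16, each partition therefore holds 3000 x 93 = 279,000 bit<16> counters,
   i.e. about 0.56 MB of counter state per reg_grid register. */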
26 |
27 | /* Number of packet sizes considered for truncation */
28 | const bit<32> NUM_PKT_SIZES = 1500;
29 |
30 | /* To flag bins not to be counted */
31 | const bit<1> NOBIN_FLAG = 0;
32 |
33 |
34 | typedef bit<9> egressSpec_t;
35 | typedef bit<48> macAddr_t;
36 | typedef bit<32> ip4Addr_t;
37 | const bit<16> TYPE_IPV4 = 0x800;
38 | typedef bit<8> ip_protocol_t;
39 | const ip_protocol_t IP_PROTOCOLS_TCP = 6;
40 | const ip_protocol_t IP_PROTOCOLS_UDP = 17;
41 |
42 | /*************************************************************************
43 | *********************** H E A D E R S ***********************************
44 | *************************************************************************/
45 |
46 | header ethernet_t {
47 | macAddr_t dstAddr;
48 | macAddr_t srcAddr;
49 | bit<16> etherType;
50 | }
51 |
52 | header ipv4_t {
53 | bit<4> version;
54 | bit<4> ihl;
55 | bit<8> diffserv;
56 | bit<16> totalLen;
57 | bit<16> identification;
58 | bit<3> flags;
59 | bit<13> fragOffset;
60 | bit<8> ttl;
61 | bit<8> protocol;
62 | bit<16> hdrChecksum;
63 | ip4Addr_t srcAddr;
64 | ip4Addr_t dstAddr;
65 | }
66 |
67 | header tcp_t {
68 | bit<16> srcPort;
69 | bit<16> dstPort;
70 | bit<32> seqNo;
71 | bit<32> ackNo;
72 | bit<4> dataOffset;
73 | bit<3> res;
74 | bit<3> ecn;
75 | bit<6> ctrl;
76 | bit<16> window;
77 | bit<16> checksum;
78 | bit<16> urgentPtr;
79 | }
80 |
81 | header udp_t {
82 | bit<16> srcPort;
83 | bit<16> dstPort;
84 | bit<16> length_;
85 | bit<16> checksum;
86 | }
87 |
88 | //User-defined metadata
89 | struct metadata {
90 | bit truncation_flag; // marks whether or not the current pkt has to be counted
91 | bit<32> rg_bin_offset; // this is computed by adding the binIndex_posTruncation to the flow_offset
92 | bit<32> binIndex_preTruncation;
93 | bit<32> binIndex_posTruncation;
94 | }
95 |
96 | struct headers {
97 | ethernet_t ethernet;
98 | ipv4_t ipv4;
99 | tcp_t tcp;
100 | udp_t udp;
101 | }
102 |
103 | /*************************************************************************
104 | *********************** P A R S E R ***********************************
105 | *************************************************************************/
106 |
107 | parser MyParser(packet_in packet,
108 | out headers hdr,
109 | inout metadata meta,
110 | inout standard_metadata_t standard_metadata) {
111 |
112 | // Initial state of the parser
113 | state start {
114 | transition parse_ethernet;
115 | }
116 |
117 | state parse_ethernet {
118 | packet.extract(hdr.ethernet);
119 | transition select(hdr.ethernet.etherType) {
120 | TYPE_IPV4: parse_ipv4;
121 | default: accept;
122 | }
123 | }
124 |
125 | state parse_ipv4 {
126 | packet.extract(hdr.ipv4);
127 | transition select(hdr.ipv4.protocol) {
128 |             IP_PROTOCOLS_TCP: parse_tcp;
129 |             IP_PROTOCOLS_UDP: parse_udp;
130 | default: accept;
131 | }
132 | }
133 |
134 | state parse_tcp {
135 | packet.extract(hdr.tcp);
136 | transition accept;
137 | }
138 |
139 | state parse_udp {
140 | packet.extract(hdr.udp);
141 | transition accept;
142 | }
143 | }
144 |
145 |
146 | /*************************************************************************
147 | ************ C H E C K S U M V E R I F I C A T I O N *************
148 | *************************************************************************/
149 |
150 | control MyVerifyChecksum(inout headers hdr, inout metadata meta) {
151 | apply { }
152 | }
153 |
154 | /*************************************************************************
155 | ************** I N G R E S S P R O C E S S I N G *******************
156 | *************************************************************************/
157 |
158 | control MyIngress(inout headers hdr,
159 | inout metadata meta,
160 | inout standard_metadata_t standard_metadata) {
161 |
162 | action drop() {
163 | mark_to_drop(standard_metadata);
164 | }
165 |
166 |
167 | ///////////////////////////////////////////////////////
168 | //Set ipv4 forwarding for packets traversing the switch
169 | ///////////////////////////////////////////////////////
170 | action ipv4_forward(macAddr_t dstAddr, egressSpec_t port) {
171 | standard_metadata.egress_spec = port; //Sets the egress port for the next hop.
172 |         hdr.ethernet.srcAddr = hdr.ethernet.dstAddr; //Updates the ethernet source address with the address of the switch.
173 |         hdr.ethernet.dstAddr = dstAddr; //Updates the ethernet destination address with the address of the next hop.
174 | hdr.ipv4.ttl = hdr.ipv4.ttl - 1; //Decrements time to live
175 | }
176 |
177 |
178 | table ipv4_lpm {
179 | key = {
180 | hdr.ipv4.dstAddr: exact;
181 | hdr.ipv4.srcAddr: exact;
182 | }
183 | actions = {
184 | ipv4_forward;
185 | drop;
186 | }
187 | size = 1024;
188 | default_action = drop();
189 | }
190 |
191 | apply {
192 |
193 | if (hdr.ipv4.isValid()) {
194 |
195 | ipv4_lpm.apply();
196 |
197 | }
198 | else {
199 | drop();
200 | }
201 | }
202 | }
203 |
204 | /*************************************************************************
205 | **************** E G R E S S P R O C E S S I N G *******************
206 | *************************************************************************/
207 |
208 | control MyEgress(inout headers hdr,
209 | inout metadata meta,
210 | inout standard_metadata_t standard_metadata) {
211 |
212 |     register<bit<16>>(PARTITION_SIZE) reg_grid0;
213 |     register<bit<16>>(PARTITION_SIZE) reg_grid1;
214 |     register<bit<16>>(PARTITION_SIZE) reg_grid2;
215 |     register<bit<16>>(PARTITION_SIZE) reg_grid3;
216 |     register<bit<16>>(PARTITION_SIZE) reg_grid4;
217 |     register<bit<16>>(PARTITION_SIZE) reg_grid5;
218 |     register<bit<16>>(PARTITION_SIZE) reg_grid6;
219 |     register<bit<16>>(PARTITION_SIZE) reg_grid7;
220 |     register<bit<16>>(PARTITION_SIZE) reg_grid8;
221 |
222 |
223 | //****************** Register Actions Definition************************
224 | action reg_grid0_action() {
225 | bit<16> value;
226 | reg_grid0.read(value, meta.rg_bin_offset);
227 | value = value+1;
228 | reg_grid0.write(meta.rg_bin_offset, value);
229 | }
230 |
231 | action reg_grid1_action() {
232 | bit<16> value;
233 | reg_grid1.read(value, meta.rg_bin_offset);
234 | value = value+1;
235 | reg_grid1.write(meta.rg_bin_offset, value);
236 | }
237 |
238 | action reg_grid2_action() {
239 | bit<16> value;
240 | reg_grid2.read(value, meta.rg_bin_offset);
241 | value = value+1;
242 | reg_grid2.write(meta.rg_bin_offset, value);
243 | }
244 |
245 | action reg_grid3_action() {
246 | bit<16> value;
247 | reg_grid3.read(value, meta.rg_bin_offset);
248 | value = value+1;
249 | reg_grid3.write(meta.rg_bin_offset, value);
250 | }
251 |
252 | action reg_grid4_action() {
253 | bit<16> value;
254 | reg_grid4.read(value, meta.rg_bin_offset);
255 | value = value+1;
256 | reg_grid4.write(meta.rg_bin_offset, value);
257 | }
258 |
259 | action reg_grid5_action() {
260 | bit<16> value;
261 | reg_grid5.read(value, meta.rg_bin_offset);
262 | value = value+1;
263 | reg_grid5.write(meta.rg_bin_offset, value);
264 | }
265 |
266 | action reg_grid6_action() {
267 | bit<16> value;
268 | reg_grid6.read(value, meta.rg_bin_offset);
269 | value = value+1;
270 | reg_grid6.write(meta.rg_bin_offset, value);
271 | }
272 |
273 | action reg_grid7_action() {
274 | bit<16> value;
275 | reg_grid7.read(value, meta.rg_bin_offset);
276 | value = value+1;
277 | reg_grid7.write(meta.rg_bin_offset, value);
278 | }
279 |
280 | action reg_grid8_action() {
281 | bit<16> value;
282 | reg_grid8.read(value, meta.rg_bin_offset);
283 | value = value+1;
284 | reg_grid8.write(meta.rg_bin_offset, value);
285 | }
286 |
287 | //******************End Register Actions Definition*********************
288 |
289 | //****************** Other Actions Definition************************
290 |
291 | // flow_offset: is used for indexing the flow within a bin of the reg_grid
292 | action set_flow_data(bit<32> flow_offset) {
293 | meta.rg_bin_offset = flow_offset + meta.binIndex_posTruncation;
294 | }
295 |
296 | action quantization_act(){
297 | meta.binIndex_preTruncation = (bit<32>) (standard_metadata.packet_length >> BIN_WIDTH_SHIFT);
298 | }
299 |
300 | action truncate_binIndex(bit<32> new_index, bit flag) {
301 | meta.binIndex_posTruncation = new_index;
302 | meta.truncation_flag = flag;
303 | }
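    /* For illustration: an entry (binIndex_preTruncation = 56) -> truncate_binIndex(3, 1)
       keeps quantized bin 56 as the 4th retained bin, while bins without an entry take the
       default (index 0, NOBIN_FLAG), so they miss the flow tables below whenever the
       flow entries are installed with truncation_flag = 1. */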
304 |
305 |
306 | //******************End Other Actions Definition*********************
307 |
308 | //******************Tables Definition**************************
309 |
310 | table flow_tbl0 {
311 | key = {
312 | hdr.ipv4.dstAddr: exact;
313 | hdr.ipv4.srcAddr: exact;
314 | meta.truncation_flag : exact;
315 | }
316 | actions = {
317 | set_flow_data;
318 | NoAction();
319 | }
320 | default_action = NoAction();
321 | size = FLOWS_PER_PARTITION;
322 | }
323 |
324 | table flow_tbl1 {
325 | key = {
326 | hdr.ipv4.dstAddr: exact;
327 | hdr.ipv4.srcAddr: exact;
328 | meta.truncation_flag : exact;
329 | }
330 | actions = {
331 | set_flow_data;
332 | NoAction();
333 | }
334 | default_action = NoAction();
335 | size = FLOWS_PER_PARTITION;
336 | }
337 |
338 | table flow_tbl2 {
339 | key = {
340 | hdr.ipv4.dstAddr: exact;
341 | hdr.ipv4.srcAddr: exact;
342 | meta.truncation_flag : exact;
343 | }
344 | actions = {
345 | set_flow_data;
346 | NoAction();
347 | }
348 | default_action = NoAction();
349 | size = FLOWS_PER_PARTITION;
350 | }
351 |
352 | table flow_tbl3 {
353 | key = {
354 | hdr.ipv4.dstAddr: exact;
355 | hdr.ipv4.srcAddr: exact;
356 | meta.truncation_flag : exact;
357 | }
358 | actions = {
359 | set_flow_data;
360 | NoAction();
361 | }
362 | default_action = NoAction();
363 | size = FLOWS_PER_PARTITION;
364 | }
365 |
366 | table flow_tbl4 {
367 | key = {
368 | hdr.ipv4.dstAddr: exact;
369 | hdr.ipv4.srcAddr: exact;
370 | meta.truncation_flag : exact;
371 | }
372 | actions = {
373 | set_flow_data;
374 | NoAction();
375 | }
376 | default_action = NoAction();
377 | size = FLOWS_PER_PARTITION;
378 | }
379 |
380 | table flow_tbl5 {
381 | key = {
382 | hdr.ipv4.dstAddr: exact;
383 | hdr.ipv4.srcAddr: exact;
384 | meta.truncation_flag : exact;
385 | }
386 | actions = {
387 | set_flow_data;
388 | NoAction();
389 | }
390 | default_action = NoAction();
391 | size = FLOWS_PER_PARTITION;
392 | }
393 |
394 | table flow_tbl6 {
395 | key = {
396 | hdr.ipv4.dstAddr: exact;
397 | hdr.ipv4.srcAddr: exact;
398 | meta.truncation_flag : exact;
399 | }
400 | actions = {
401 | set_flow_data;
402 | NoAction();
403 | }
404 | default_action = NoAction();
405 | size = FLOWS_PER_PARTITION;
406 | }
407 |
408 | table flow_tbl7 {
409 | key = {
410 | hdr.ipv4.dstAddr: exact;
411 | hdr.ipv4.srcAddr: exact;
412 | meta.truncation_flag : exact;
413 | }
414 | actions = {
415 | set_flow_data;
416 | NoAction();
417 | }
418 | default_action = NoAction();
419 | size = FLOWS_PER_PARTITION;
420 | }
421 |
422 | table flow_tbl8 {
423 | key = {
424 | hdr.ipv4.dstAddr: exact;
425 | hdr.ipv4.srcAddr: exact;
426 | meta.truncation_flag : exact;
427 | }
428 | actions = {
429 | set_flow_data;
430 | NoAction();
431 | }
432 | default_action = NoAction();
433 | size = FLOWS_PER_PARTITION;
434 | }
435 |
436 | table truncation_tbl {
437 | key = {
438 | meta.binIndex_preTruncation: exact;
439 | }
440 | actions = {
441 | truncate_binIndex();
442 | NoAction();
443 | }
444 | default_action = truncate_binIndex(0, NOBIN_FLAG);
445 | size = NUM_PKT_SIZES;
446 | }
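    // Illustrative control-plane entries (BMv2 simple_switch_CLI syntax; the fully
    // qualified names, addresses, offsets and flag value below are assumptions for
    // illustration, not taken from any runtime script in this repository):
    //   table_add MyIngress.truncation_tbl MyIngress.truncate_binIndex 56 => 3 1
    //   table_add MyIngress.flow_tbl0 MyIngress.set_flow_data 10.0.0.2 10.0.0.1 1 => 0
    // The first entry compacts pre-truncation bin 56 into bin 3 and marks the flag; the
    // second assigns the flow (dst 10.0.0.2, src 10.0.0.1) base offset 0 in reg_grid0.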
447 |
448 |
449 | //******************End Tables Definition***********************
450 |
451 |
452 | apply {
453 |
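        // Per-packet pipeline: quantize the packet length into a coarse bin, compact the
        // bin index via truncation_tbl, then probe the nine flow-table partitions in
        // sequence; the first partition that hits supplies the flow's base offset, and
        // the corresponding reg_grid counter for that (flow, bin) slot is incremented.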
454 | quantization_act();
455 |
456 | truncation_tbl.apply();
457 |
458 | if(flow_tbl0.apply().hit) {
459 | reg_grid0_action();
460 | }
461 | else {
462 | if(flow_tbl1.apply().hit) {
463 | reg_grid1_action();
464 | }
465 | else {
466 | if(flow_tbl2.apply().hit) {
467 | reg_grid2_action();
468 | }
469 | else {
470 | if(flow_tbl3.apply().hit) {
471 | reg_grid3_action();
472 | }
473 | else {
474 | if(flow_tbl4.apply().hit) {
475 | reg_grid4_action();
476 | }
477 | else {
478 | if(flow_tbl5.apply().hit) {
479 | reg_grid5_action();
480 | }
481 | else {
482 | if(flow_tbl6.apply().hit) {
483 | reg_grid6_action();
484 | }
485 | else {
486 | if(flow_tbl7.apply().hit) {
487 | reg_grid7_action();
488 | }
489 | else {
490 | if(flow_tbl8.apply().hit) {
491 | reg_grid8_action();
492 | }
493 | }
494 | }
495 | }
496 | }
497 | }
498 | }
499 | }
500 | }
501 |
502 | } // end of the apply block
503 |
504 | }
505 |
506 | /*************************************************************************
507 | ************* C H E C K S U M C O M P U T A T I O N **************
508 | *************************************************************************/
509 |
510 | control MyComputeChecksum(inout headers hdr, inout metadata meta) {
511 | apply {
512 | update_checksum(
513 | hdr.ipv4.isValid(),
514 | { hdr.ipv4.version,
515 | hdr.ipv4.ihl,
516 | hdr.ipv4.diffserv,
517 | hdr.ipv4.totalLen,
518 | hdr.ipv4.identification,
519 | hdr.ipv4.flags,
520 | hdr.ipv4.fragOffset,
521 | hdr.ipv4.ttl,
522 | hdr.ipv4.protocol,
523 | hdr.ipv4.srcAddr,
524 | hdr.ipv4.dstAddr },
525 | hdr.ipv4.hdrChecksum,
526 | HashAlgorithm.csum16);
527 | }
528 | }
529 |
530 |
531 | /*************************************************************************
532 | *********************** D E P A R S E R *******************************
533 | *************************************************************************/
534 |
535 | control MyDeparser(packet_out packet, in headers hdr) {
536 |
537 |     // Deparser that selects the order in which fields are inserted into the outgoing packet.
538 | apply {
539 | packet.emit(hdr.ethernet);
540 | packet.emit(hdr.ipv4);
541 | packet.emit(hdr.tcp);
542 | packet.emit(hdr.udp);
543 | }
544 | }
545 |
546 | /*************************************************************************
547 | *********************** S W I T C H *******************************
548 | *************************************************************************/
549 |
550 | V1Switch(
551 | MyParser(),
552 | MyVerifyChecksum(),
553 | MyIngress(),
554 | MyEgress(),
555 | MyComputeChecksum(),
556 | MyDeparser()
557 | ) main;
558 |
--------------------------------------------------------------------------------
/Security Tasks Evaluation/MPTAnalysis/FacetAnalysis/generateFeatures.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import collections
3 | import dpkt
4 | import subprocess
5 | import socket
6 | import os
7 | import sys
8 | import math
9 | import csv
10 | import numpy as np
11 | from itertools import product
12 | from scipy.stats import kurtosis, skew
13 | import time
14 | import glob
15 |
16 |
17 | DEST_IP = '172.31.0.2'
18 | SOURCE_IP = '172.31.0.19'
19 |
20 | def MergeDatasets(data_folder):
21 | if(os.path.exists(data_folder + '/full_dataset.csv')):
22 | os.remove(data_folder + '/full_dataset.csv')
23 |
24 | features_files = [data_folder + "facet_dataset.csv", data_folder + "RegularTraffic_dataset.csv"]
25 |
26 | print "Merging full dataset..."
27 | header_saved = False
28 | with open(data_folder + '/full_dataset.csv','wb') as fout:
29 | for filename in features_files:
30 | print "merging " + filename
31 | with open(filename) as fin:
32 | header = next(fin)
33 | if not header_saved:
34 | fout.write(header)
35 | header_saved = True
36 | for line in fin:
37 | fout.write(line)
38 | print "Dataset merged!"
39 |
40 |
41 | def CombinedMerging(data_folder):
42 | if(os.path.exists(data_folder + '/regular_50_dataset.csv')):
43 | os.remove(data_folder + '/regular_50_dataset.csv')
44 |
45 | features_files = [data_folder + "FacetTraffic_50_dataset.csv", data_folder + "RegularTraffic_dataset.csv"]
46 |
47 | print "Merging dataset..."
48 | header_saved = False
49 | with open(data_folder + '/regular_50_dataset.csv','wb') as fout:
50 | for filename in features_files:
51 | print "merging " + filename
52 | with open(filename) as fin:
53 | header = next(fin)
54 | if not header_saved:
55 | fout.write(header)
56 | header_saved = True
57 | for line in fin:
58 | fout.write(line)
59 | print "Dataset merged!"
60 |
61 |
62 | def MergeSamples(data_folder):
63 | #Generate training dataset
64 | facet_files = glob.glob(data_folder + "/FacetTraffic_*.csv")
65 |
66 | header_saved = False
67 | with open(data_folder + '/facet_dataset.csv','wb') as fout:
68 | for filename in facet_files:
69 | with open(filename) as fin:
70 | header = next(fin)
71 | if not header_saved:
72 | fout.write(header)
73 | header_saved = True
74 | for line in fin:
75 | fout.write(line)
76 |
77 |
78 | def GenerateDatasets(data_folder):
79 | MergeSamples(data_folder)
80 | CombinedMerging(data_folder)
81 | #MergeDatasets(data_folder)
82 |
83 |
84 | def RoundToNearest(n, m):
85 | r = n % m
86 | return n + m - r if r + r >= m else n - r
87 |
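# Worked examples for RoundToNearest (rounds n to the nearest multiple of m):
#   RoundToNearest(1434, 64) == 1408   (remainder 26 < 32, round down)
#   RoundToNearest(1450, 64) == 1472   (remainder 42 >= 32, round up)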
88 |
89 | def FeatureExtractionPLBenchmark(sampleFolder, binWidth, topk):
90 | #Bucket importance in decreasing order
91 | BUCKETS_TO_MEASURE = []
92 |
93 | #Measure interesting buckets
94 | if(topk != 1500):
95 | #Buckets in decreasing importance order
96 | f_imp = np.load('classificationResults/PL_60_' + str(binWidth) + '_1500/FeatureImportance_XGBoost_FacetTraffic_50.npy')
97 | #Print top k
98 | #for f in f_imp:
99 | # print str(f[1]) + " " + str(f[2])
100 |
101 | if(topk > len(f_imp)):
102 | 			print "Skipping, not enough features to accommodate. TopK = " + str(topk) + " Features = " + str(len(f_imp))
103 | return
104 | for i in range(0,topk):
105 | b = int(f_imp[i][2].split("_")[1])
106 | print "Top-" + str(i) + " = " + str(b)
107 | BUCKETS_TO_MEASURE.append(b)
108 |
109 | #Measure all buckets
110 | elif(topk == 1500):
111 | for i in range(0,1500,binWidth):
112 | BUCKETS_TO_MEASURE.append(i/binWidth)
113 |
114 |
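	# Example: with binWidth = 64 and topk == 1500, range(0, 1500, 64) produces the 24
	# quantized bucket indices 0..23; with topk < 1500 only the topk most important
	# buckets (per the saved XGBoost feature-importance ranking) are measured.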
115 | quantized_buckets_to_measure = sorted(BUCKETS_TO_MEASURE)
116 | print "Quantized buckets to measure = " + str(quantized_buckets_to_measure)
117 | print "Number of buckets to measure = " + str(len(quantized_buckets_to_measure))
118 |
119 |
120 | traceInterval = 60 #Amount of time in packet trace to consider for feature extraction
121 | feature_set_folder = 'FeatureSets/PL_' + str(traceInterval) + "_" + str(binWidth) + "_" + str(topk)
122 | print feature_set_folder
123 |
124 | if not os.path.exists(feature_set_folder):
125 | os.makedirs(feature_set_folder)
126 | arff_path = feature_set_folder + '/' + os.path.basename(sampleFolder) + '_dataset.csv'
127 | arff = open(arff_path, 'wb')
128 | written_header = False
129 |
130 |
131 | for sample in os.listdir(sampleFolder):
132 | f = open(sampleFolder + "/" + sample + "/" + sample + ".pcap")
133 | pcap = dpkt.pcap.Reader(f)
134 |
135 | 		#Analyse packets transmitted
136 | bin_dict = {}
137 |
138 |
139 | for i in quantized_buckets_to_measure:
140 | bin_dict[i] = 0
141 |
142 |
143 | firstTime = 0.0
144 | setFirst = False
145 | for ts, buf in pcap:
146 | if(not(setFirst)):
147 | firstTime = ts
148 | setFirst = True
149 |
150 | if(ts < (firstTime + traceInterval)):
151 |
152 | eth = dpkt.ethernet.Ethernet(buf)
153 | ip_hdr = eth.data
154 | try:
155 | src_ip_addr_str = socket.inet_ntoa(ip_hdr.src)
156 | #Target UDP communication between both cluster machines
157 | if (ip_hdr.p == 17 and src_ip_addr_str == SOURCE_IP):
158 | binned = RoundToNearest(len(buf),binWidth)
159 | if(binned/binWidth in quantized_buckets_to_measure):
160 | bin_dict[binned/binWidth]+=1
161 | except:
162 | pass
163 | f.close()
164 |
165 | od_dict = collections.OrderedDict(sorted(bin_dict.items(), key=lambda t: float(t[0])))
166 | bin_list = []
167 | for i in od_dict:
168 | bin_list.append(od_dict[i])
169 |
170 |
171 | label = os.path.basename(sampleFolder)
172 | if('Regular' in sampleFolder):
173 | label = 'Regular'
174 |
175 | #Write sample features to the csv file
176 | f_names = []
177 | f_values = []
178 |
179 | for i, b in enumerate(bin_list):
180 | f_names.append('packetLengthBin_' + str(quantized_buckets_to_measure[i]))
181 | f_values.append(b)
182 |
183 |
184 | #print len(f_names)
185 | f_names.append('Class')
186 | f_values.append(label)
187 |
188 | if(not written_header):
189 | arff.write(', '.join(f_names))
190 | arff.write('\n')
191 | print "Writing header"
192 | written_header = True
193 |
194 | l = []
195 | for v in f_values:
196 | l.append(str(v))
197 | arff.write(', '.join(l))
198 | arff.write('\n')
199 | arff.close()
200 | return feature_set_folder
201 |
202 |
203 | def CompressFeatures(BIN_WIDTH, TOPK):
204 | sampleFolders = [
205 | "TrafficCaptures/240Resolution/FacetTraffic_50",
206 | "TrafficCaptures/240Resolution/RegularTraffic",
207 | ]
208 |
209 |
210 | if not os.path.exists('FeatureSets'):
211 | os.makedirs('FeatureSets')
212 |
213 | for topk in TOPK:
214 | for binWidth in BIN_WIDTH:
215 | start = time.time()
216 | print "\n#####################################"
217 | print "Generating Dataset based on Binned Packet Length Features"
218 | for sampleFolder in sampleFolders:
219 | print "\n#############################"
220 | print "Parsing " + sampleFolder
221 | print "#############################"
222 | feature_set_folder = FeatureExtractionPLBenchmark(sampleFolder, binWidth, topk)
223 | if(feature_set_folder is not None):
224 | GenerateDatasets(feature_set_folder + '/')
225 | end = time.time()
226 | print "Optimize_compress_bin_%s_topk_%s_time_%s"%(binWidth, topk, end-start)
227 |
228 |
229 | def SplitDataset(DATASET_SPLIT, N_FLOWS, COVERT_FLOWS_PERC):
230 |
231 | 	print "Splitting datasets with DATASET_SPLIT = %s, N_FLOWS = %s, COVERT_FLOWS_PERC = %s"%(DATASET_SPLIT, N_FLOWS, COVERT_FLOWS_PERC)
232 | split_value = DATASET_SPLIT * N_FLOWS #samples
233 | covert_split_value = COVERT_FLOWS_PERC * split_value
234 |
235 | print "SPLIT_VALUE = %s"%(split_value)
236 | print "COVERT_SAMPLES_VALUE = %s"%(covert_split_value)
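	# Example (illustrative parameter values, not taken from the experiment scripts):
	# with DATASET_SPLIT = 0.5 and N_FLOWS = 1000, split_value = 500, so data rows with
	# index n < 500 go to the phase-1 (training) file and the rest to the phase-2 file.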
237 |
238 | for feature_folder in os.listdir("FeatureSets"):
239 | if(".DS_Store" not in feature_folder):
240 | start = time.time()
241 | print "Splitting %s"%("FeatureSets/" + feature_folder + "/RegularTraffic_dataset.csv")
242 | #Split RegularFlows
243 | RegularFile = open("FeatureSets/" + feature_folder + "/RegularTraffic_dataset.csv", 'rb')
244 | csv_reader = csv.reader(RegularFile, delimiter=',')
245 |
246 | PhaseOneRegularFile = open("FeatureSets/" + feature_folder + "/RegularTraffic_phase1_dataset.csv", 'w')
247 | PhaseTwoRegularFile = open("FeatureSets/" + feature_folder + "/RegularTraffic_phase2_dataset.csv", 'w')
248 |
249 | for n, row in enumerate(csv_reader):
250 | if(n == 0):
251 | row_string = ",".join(row) + "\n"
252 | PhaseOneRegularFile.write(row_string)
253 | PhaseTwoRegularFile.write(row_string)
254 | elif(n < split_value):
255 | row_string = ",".join(row) + "\n"
256 | PhaseOneRegularFile.write(row_string)
257 | else:
258 | row_string = ",".join(row) + "\n"
259 | PhaseTwoRegularFile.write(row_string)
260 |
261 | RegularFile.close()
262 | PhaseOneRegularFile.close()
263 | PhaseTwoRegularFile.close()
264 |
265 |
266 | #Split CovertFlows
267 | print "Splitting %s"%("FeatureSets/" + feature_folder + "/FacetTraffic_50_dataset.csv")
268 | CovertFile = open("FeatureSets/" + feature_folder + "/FacetTraffic_50_dataset.csv", "rb")
269 | csv_reader = csv.reader(CovertFile, delimiter=',')
270 |
271 | PhaseOneCovertFile = open("FeatureSets/" + feature_folder + "/FacetTraffic_50_phase1_dataset.csv", "w")
272 | PhaseTwoCovertFile = open("FeatureSets/" + feature_folder + "/FacetTraffic_50_phase2_dataset.csv", "w")
273 |
274 | for n, row in enumerate(csv_reader):
275 | if(n == 0):
276 | row_string = ",".join(row) + "\n"
277 | PhaseOneCovertFile.write(row_string)
278 | PhaseTwoCovertFile.write(row_string)
279 | elif(n < split_value):
280 | row_string = ",".join(row) + "\n"
281 | PhaseOneCovertFile.write(row_string)
282 | else:
283 | row_string = ",".join(row) + "\n"
284 | PhaseTwoCovertFile.write(row_string)
285 |
286 | CovertFile.close()
287 | PhaseOneCovertFile.close()
288 | PhaseTwoCovertFile.close()
289 | end = time.time()
290 | binWidth = feature_folder.split("_")[2]
291 | topk = feature_folder.split("_")[3]
292 | print "Optimize_split_bin_%s_topk_%s_time_%s"%(binWidth, topk, end-start)
293 |
294 |
295 | def MergeTestData():
296 | for feature_folder in os.listdir("FeatureSets"):
297 | if(".DS_Store" not in feature_folder):
298 | print "Merging %s"%("FeatureSets/" + feature_folder + "/RegularTraffic_phase2_dataset.csv")
299 | print "Merging %s"%("FeatureSets/" + feature_folder + "/FacetTraffic_50_phase2_dataset.csv")
300 |
301 | #Merging Phase2
302 | PhaseTwoFile = open("FeatureSets/" + feature_folder + "/Phase2_dataset.csv", 'w')
303 |
304 | PhaseTwoRegularFile = open("FeatureSets/" + feature_folder + "/RegularTraffic_phase2_dataset.csv", 'rb')
305 | PhaseTwoCovertFile = open("FeatureSets/" + feature_folder + "/FacetTraffic_50_phase2_dataset.csv", "rb")
306 |
307 |
308 | #Write data from the regular file
309 | csv_reader = csv.reader(PhaseTwoRegularFile, delimiter=',')
310 | for n, row in enumerate(csv_reader):
311 | row_string = ",".join(row) + "\n"
312 | PhaseTwoFile.write(row_string)
313 |
314 | #Write data from the covert file
315 | csv_reader = csv.reader(PhaseTwoCovertFile, delimiter=',')
316 | for n, row in enumerate(csv_reader):
317 | if(n == 0):
318 | continue
319 | row_string = ",".join(row) + "\n"
320 | PhaseTwoFile.write(row_string)
321 |
322 | PhaseTwoFile.close()
323 | PhaseTwoRegularFile.close()
324 | PhaseTwoCovertFile.close()
325 |
326 |
327 |
328 | def FeatureExtractionPLBenchmarkBasedOnTrainData(sampleFolder, binWidth, topk):
329 | #Bucket importance in decreasing order
330 | BUCKETS_TO_MEASURE = []
331 |
332 | #Measure interesting buckets
333 | if(topk != 1500):
334 | #Buckets in decreasing importance order
335 | f_imp = np.load('classificationResults/PL_60_' + str(binWidth) + '_1500/FeatureImportance_XGBoost_FacetTraffic_50_phase1.npy')
336 | #Print top k
337 | #for f in f_imp:
338 | # print str(f[1]) + " " + str(f[2])
339 |
340 | if(topk > len(f_imp)):
341 | 			print "Skipping, not enough features to accommodate. TopK = " + str(topk) + " Features = " + str(len(f_imp))
342 | return
343 | for i in range(0,topk):
344 | b = int(f_imp[i][2].split("_")[1])
345 | print "Top-" + str(i) + " = " + str(b)
346 | BUCKETS_TO_MEASURE.append(b)
347 |
348 | #Measure all buckets
349 | elif(topk == 1500):
350 | print "Measuring all buckets according to quantization"
351 | for i in range(0,1500,binWidth):
352 | BUCKETS_TO_MEASURE.append(i/binWidth)
353 |
354 |
355 | quantized_buckets_to_measure = sorted(BUCKETS_TO_MEASURE)
356 | print "Quantized buckets to measure = " + str(quantized_buckets_to_measure)
357 | print "Number of buckets to measure = " + str(len(quantized_buckets_to_measure))
358 |
359 |
360 | traceInterval = 60 #Amount of time in packet trace to consider for feature extraction
361 | feature_set_folder = 'FeatureSets/PL_' + str(traceInterval) + "_" + str(binWidth) + "_" + str(topk)
362 | print feature_set_folder
363 |
364 | if not os.path.exists(feature_set_folder):
365 | os.makedirs(feature_set_folder)
366 | arff_path = feature_set_folder + '/' + os.path.basename(sampleFolder) + '_dataset.csv'
367 | arff = open(arff_path, 'wb')
368 | written_header = False
369 |
370 |
371 | for sample in os.listdir(sampleFolder):
372 | if(".DS_Store" in sample):
373 | continue
374 | f = open(sampleFolder + "/" + sample + "/" + sample + ".pcap")
375 | pcap = dpkt.pcap.Reader(f)
376 |
377 | 		#Analyse packets transmitted
378 | packetSizesIn = []
379 | packetSizesOut = []
380 | bin_dict = {}
381 |
382 |
383 | for i in quantized_buckets_to_measure:
384 | bin_dict[i] = 0
385 |
386 |
387 | firstTime = 0.0
388 | setFirst = False
389 | for ts, buf in pcap:
390 | if(not(setFirst)):
391 | firstTime = ts
392 | setFirst = True
393 |
394 | if(ts < (firstTime + traceInterval)):
395 |
396 | eth = dpkt.ethernet.Ethernet(buf)
397 | ip_hdr = eth.data
398 | try:
399 | src_ip_addr_str = socket.inet_ntoa(ip_hdr.src)
400 | dst_ip_addr_str = socket.inet_ntoa(ip_hdr.dst)
401 | #Target UDP communication between both cluster machines
402 | if (ip_hdr.p == 17 and src_ip_addr_str == SOURCE_IP):
403 | binned = RoundToNearest(len(buf),binWidth)
404 | if(binned/binWidth in quantized_buckets_to_measure):
405 | bin_dict[binned/binWidth]+=1
406 | except:
407 | pass
408 | f.close()
409 |
410 | od_dict = collections.OrderedDict(sorted(bin_dict.items(), key=lambda t: float(t[0])))
411 | bin_list = []
412 | for i in od_dict:
413 | bin_list.append(od_dict[i])
414 |
415 |
416 | label = os.path.basename(sampleFolder)
417 | if('Regular' in sampleFolder):
418 | label = 'Regular'
419 |
420 | #Write sample features to the csv file
421 | f_names = []
422 | f_values = []
423 |
424 | for i, b in enumerate(bin_list):
425 | f_names.append('packetLengthBin_' + str(quantized_buckets_to_measure[i]))
426 | f_values.append(b)
427 |
428 |
429 | #print len(f_names)
430 | f_names.append('Class')
431 | f_values.append(label)
432 |
433 | if(not written_header):
434 | arff.write(', '.join(f_names))
435 | arff.write('\n')
436 | print "Writing header"
437 | written_header = True
438 |
439 | l = []
440 | for v in f_values:
441 | l.append(str(v))
442 | arff.write(', '.join(l))
443 | arff.write('\n')
444 | arff.close()
445 | return feature_set_folder
446 |
447 |
448 |
449 | def CompressFeaturesBasedOnTrainData(BIN_WIDTH, TOPK):
450 | sampleFolders = [
451 | "TrafficCaptures/240Resolution/FacetTraffic_50",
452 | "TrafficCaptures/240Resolution/RegularTraffic",
453 | ]
454 |
455 |
456 | if not os.path.exists('FeatureSets'):
457 | os.makedirs('FeatureSets')
458 |
459 | for topk in TOPK:
460 | for binWidth in BIN_WIDTH:
461 | start = time.time()
462 | print "\n#####################################"
463 | print "Generating Dataset based on Binned Packet Length Features"
464 | for sampleFolder in sampleFolders:
465 | print "\n#############################"
466 | print "Parsing " + sampleFolder
467 | print "#############################"
468 | feature_set_folder = FeatureExtractionPLBenchmarkBasedOnTrainData(sampleFolder, binWidth, topk)
469 | if(feature_set_folder is not None):
470 | GenerateDatasets(feature_set_folder + '/')
471 | end = time.time()
472 | print "Optimize_compress_bin_%s_topk_%s_time_%s"%(binWidth, topk, end-start)
473 |
474 |
475 |
476 |
477 | def ExtractFirstNPackets(sampleFolder, number_of_packets):
478 |
479 | traceInterval = 60 #Amount of time in packet trace to consider for feature extraction
480 | feature_set_folder = 'FeatureSets/First_%d_packets'%(number_of_packets)
481 | print feature_set_folder
482 |
483 | if not os.path.exists(feature_set_folder):
484 | os.makedirs(feature_set_folder)
485 | arff_path = feature_set_folder + '/' + os.path.basename(sampleFolder) + '_dataset.csv'
486 | arff = open(arff_path, 'wb')
487 | written_header = False
488 |
489 |
490 | for sample in os.listdir(sampleFolder):
491 | if(".DS_Store" in sample):
492 | continue
493 | f = open(sampleFolder + "/" + sample + "/" + sample + ".pcap")
494 | pcap = dpkt.pcap.Reader(f)
495 |
496 |
497 | packet_array1 = []
498 | packet_array2 = []
499 | firstTime = 0.0
500 | setFirst = False
501 | for ts, buf in pcap:
502 | if(len(packet_array1) >= number_of_packets and len(packet_array2) >= number_of_packets):
503 | break
504 |
505 | if(not(setFirst)):
506 | firstTime = ts
507 | setFirst = True
508 |
509 | if(ts < (firstTime + traceInterval)):
510 |
511 | eth = dpkt.ethernet.Ethernet(buf)
512 | ip_hdr = eth.data
513 | try:
514 | src_ip_addr_str = socket.inet_ntoa(ip_hdr.src)
515 | dst_ip_addr_str = socket.inet_ntoa(ip_hdr.dst)
516 | #Target UDP communication between both cluster machines
517 | if (ip_hdr.p == 17 and src_ip_addr_str == SOURCE_IP):
518 | if(len(packet_array1) < number_of_packets):
519 | packet_array1.append(len(buf))
520 | elif(ip_hdr.p == 17 and src_ip_addr_str != SOURCE_IP):
521 | if(len(packet_array2) < number_of_packets):
522 | packet_array2.append(len(buf))
523 | except:
524 | pass
525 | f.close()
526 |
527 | label = os.path.basename(sampleFolder)
528 | if('Regular' in sampleFolder):
529 | label = 'Regular'
530 |
531 | if(len(packet_array1) >= number_of_packets and len(packet_array2) >= number_of_packets):
532 | #Write sample features to the csv file
533 | f_names = []
534 | f_values = []
535 |
536 | for i, b in enumerate(packet_array1):
537 | f_names.append('packetNumberOut_' + str(i))
538 | f_values.append(b)
539 |
540 | for i, b in enumerate(packet_array2):
541 | f_names.append('packetNumberIn_' + str(i))
542 | f_values.append(b)
543 |
544 |
545 | f_names.append('Class')
546 | f_values.append(label)
547 |
548 | if(not written_header):
549 | arff.write(', '.join(f_names))
550 | arff.write('\n')
551 | print "Writing header"
552 | written_header = True
553 |
554 | l = []
555 | for v in f_values:
556 | l.append(str(v))
557 | arff.write(', '.join(l))
558 | arff.write('\n')
559 | else:
560 | 			print "Sample %s does not have enough packets"%(sampleFolder + "/" + sample + "/" + sample + ".pcap")
561 | arff.close()
562 | return feature_set_folder
563 |
564 |
565 | def ExtractPacketSample(NUMBER_OF_PACKETS):
566 | sampleFolders = [
567 | "TrafficCaptures/240Resolution/FacetTraffic_50",
568 | "TrafficCaptures/240Resolution/RegularTraffic",
569 | ]
570 |
571 | if not os.path.exists('FeatureSets'):
572 | os.makedirs('FeatureSets')
573 |
574 | for number_of_packets in NUMBER_OF_PACKETS:
575 | print "\n#####################################"
576 | print "Extracting first %d packet sizes"%(number_of_packets)
577 |
578 | for sampleFolder in sampleFolders:
579 | print "\n#############################"
580 | print "Parsing " + sampleFolder
581 | print "#############################"
582 | feature_set_folder = ExtractFirstNPackets(sampleFolder, number_of_packets)
583 | if(feature_set_folder is not None):
584 | GenerateDatasets(feature_set_folder + '/')
--------------------------------------------------------------------------------