├── requirements.txt
├── CovertCastAnalysis
│   ├── dataset_gen.py
│   ├── ParseCaptures.py
│   ├── KL_classifier.py
│   ├── EMD_classifier.py
│   ├── xgboost_classifier.py
│   └── X2_classifier.py
├── DeltaShaperAnalysis
│   ├── dataset_gen.py
│   ├── parseCaptures.py
│   ├── KL_classifier.py
│   ├── EMD_classifier.py
│   ├── xgboost_classifier.py
│   ├── X2_classifier.py
│   └── IsolationForest.py
├── FacetAnalysis
│   ├── dataset_gen.py
│   ├── ParseCaptures.py
│   ├── KL_classifier.py
│   ├── EMD_classifier.py
│   ├── xgboost_classifier.py
│   └── autoencoder.py
└── README.md

/requirements.txt:
--------------------------------------------------------------------------------
dpkt == 1.8.8
numpy == 1.13.1
scipy == 0.14.0
matplotlib == 1.5.3
scikit-learn == 0.19.0
xgboost == 0.6a2
tensorflow == 0.12.1
pyemd == 0.3.0
--------------------------------------------------------------------------------
/CovertCastAnalysis/dataset_gen.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import sys
import csv
import glob
import os

def MergeDatasets(data_folder):
    if(os.path.exists(data_folder + '/full_dataset.csv')):
        os.remove(data_folder + '/full_dataset.csv')

    features_files = glob.glob(data_folder + "/*_dataset.csv")

    print "Merging full dataset..."
    header_saved = False
    with open(data_folder + '/full_dataset.csv','wb') as fout:
        for filename in features_files:
            print "merging " + filename
            with open(filename) as fin:
                header = next(fin)
                if not header_saved:
                    fout.write(header)
                    header_saved = True
                for line in fin:
                    fout.write(line)
    print "Dataset merged!"


def MergeSamples(data_folder):
    #Generate training dataset
    youtube_files = glob.glob(data_folder + "/YouTubeTraffic_*.csv")

    header_saved = False
    with open(data_folder + '/youtube_dataset.csv','wb') as fout:
        for filename in youtube_files:
            with open(filename) as fin:
                header = next(fin)
                if not header_saved:
                    fout.write(header)
                    header_saved = True
                for line in fin:
                    fout.write(line)

    covertcast_files = glob.glob(data_folder + "/CovertCastTraffic_*.csv")

    header_saved = False
    with open(data_folder + '/covertcast_dataset.csv','wb') as fout:
        for filename in covertcast_files:
            with open(filename) as fin:
                header = next(fin)
                if not header_saved:
                    fout.write(header)
                    header_saved = True
                for line in fin:
                    fout.write(line)

--------------------------------------------------------------------------------
/DeltaShaperAnalysis/dataset_gen.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import sys
import csv
import glob
import os


def MergeDatasets(data_folder):
    if(os.path.exists(data_folder + '/full_dataset.csv')):
        os.remove(data_folder + '/full_dataset.csv')

    features_files = [data_folder + "deltashaper_dataset.csv", data_folder + "RegularTraffic_dataset.csv"]

    print "Merging full dataset..."
    header_saved = False
    with open(data_folder + '/full_dataset.csv','wb') as fout:
        for filename in features_files:
            print "merging " + filename
            with open(filename) as fin:
                header = next(fin)
                if not header_saved:
                    fout.write(header)
                    header_saved = True
                for line in fin:
                    fout.write(line)
    print "Dataset merged!"
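# CombinedMerging (below) builds one dataset per DeltaShaper configuration by
# pairing each DeltaShaperTraffic_{320,160} samples CSV with the RegularTraffic
# CSV, producing regular_320_dataset.csv and regular_160_dataset.csv.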
27 | 28 | 29 | def CombinedMerging(data_folder): 30 | if(os.path.exists(data_folder + '/regular_320_dataset.csv')): 31 | os.remove(data_folder + '/regular_320_dataset.csv') 32 | if(os.path.exists(data_folder + '/regular_160_dataset.csv')): 33 | os.remove(data_folder + '/regular_160_dataset.csv') 34 | 35 | features_files = [data_folder + "DeltaShaperTraffic_320_dataset.csv", data_folder + "RegularTraffic_dataset.csv"] 36 | 37 | print "Merging dataset..." 38 | header_saved = False 39 | with open(data_folder + '/regular_320_dataset.csv','wb') as fout: 40 | for filename in features_files: 41 | print "merging " + filename 42 | with open(filename) as fin: 43 | header = next(fin) 44 | if not header_saved: 45 | fout.write(header) 46 | header_saved = True 47 | for line in fin: 48 | fout.write(line) 49 | print "Dataset merged!" 50 | 51 | features_files = [data_folder + "DeltaShaperTraffic_160_dataset.csv", data_folder + "RegularTraffic_dataset.csv"] 52 | 53 | print "Merging dataset..." 54 | header_saved = False 55 | with open(data_folder + '/regular_160_dataset.csv','wb') as fout: 56 | for filename in features_files: 57 | print "merging " + filename 58 | with open(filename) as fin: 59 | header = next(fin) 60 | if not header_saved: 61 | fout.write(header) 62 | header_saved = True 63 | for line in fin: 64 | fout.write(line) 65 | print "Dataset merged!" 66 | 67 | 68 | 69 | def MergeSamples(data_folder): 70 | #Generate training dataset 71 | deltashaper_files = glob.glob(data_folder + "/DeltaShaperTraffic_*.csv") 72 | 73 | header_saved = False 74 | with open(data_folder + 'deltashaper_dataset.csv','wb') as fout: 75 | for filename in deltashaper_files: 76 | with open(filename) as fin: 77 | header = next(fin) 78 | if not header_saved: 79 | fout.write(header) 80 | header_saved = True 81 | for line in fin: 82 | fout.write(line) 83 | 84 | 85 | def GenerateDatasets(data_folder): 86 | MergeSamples(data_folder) 87 | CombinedMerging(data_folder) 88 | MergeDatasets(data_folder) 89 | -------------------------------------------------------------------------------- /FacetAnalysis/dataset_gen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import csv 4 | import glob 5 | import os 6 | 7 | 8 | def MergeDatasets(data_folder): 9 | if(os.path.exists(data_folder + '/full_dataset.csv')): 10 | os.remove(data_folder + '/full_dataset.csv') 11 | 12 | features_files = [data_folder + "facet_dataset.csv", data_folder + "RegularTraffic_Christmas_dataset.csv"] 13 | 14 | print "Merging full dataset..." 15 | header_saved = False 16 | with open(data_folder + '/full_dataset.csv','wb') as fout: 17 | for filename in features_files: 18 | print "merging " + filename 19 | with open(filename) as fin: 20 | header = next(fin) 21 | if not header_saved: 22 | fout.write(header) 23 | header_saved = True 24 | for line in fin: 25 | fout.write(line) 26 | print "Dataset merged!" 27 | 28 | 29 | 30 | def CombinedMerging(data_folder): 31 | if(os.path.exists(data_folder + '/regular_12.5_dataset.csv')): 32 | os.remove(data_folder + '/regular_12.5_dataset.csv') 33 | if(os.path.exists(data_folder + '/regular_25_dataset.csv')): 34 | os.remove(data_folder + '/regular_25_dataset.csv') 35 | if(os.path.exists(data_folder + '/regular_50_dataset.csv')): 36 | os.remove(data_folder + '/regular_50_dataset.csv') 37 | 38 | features_files = [data_folder + "FacetTraffic_12.5_Christmas_dataset.csv", data_folder + "RegularTraffic_Christmas_dataset.csv"] 39 | 40 | print "Merging dataset..." 
41 | header_saved = False 42 | with open(data_folder + '/regular_12.5_dataset.csv','wb') as fout: 43 | for filename in features_files: 44 | print "merging " + filename 45 | with open(filename) as fin: 46 | header = next(fin) 47 | if not header_saved: 48 | fout.write(header) 49 | header_saved = True 50 | for line in fin: 51 | fout.write(line) 52 | print "Dataset merged!" 53 | 54 | features_files = [data_folder + "FacetTraffic_25_Christmas_dataset.csv", data_folder + "RegularTraffic_Christmas_dataset.csv"] 55 | 56 | print "Merging dataset..." 57 | header_saved = False 58 | with open(data_folder + '/regular_25_dataset.csv','wb') as fout: 59 | for filename in features_files: 60 | print "merging " + filename 61 | with open(filename) as fin: 62 | header = next(fin) 63 | if not header_saved: 64 | fout.write(header) 65 | header_saved = True 66 | for line in fin: 67 | fout.write(line) 68 | print "Dataset merged!" 69 | 70 | features_files = [data_folder + "FacetTraffic_50_Christmas_dataset.csv", data_folder + "RegularTraffic_Christmas_dataset.csv"] 71 | 72 | print "Merging dataset..." 73 | header_saved = False 74 | with open(data_folder + '/regular_50_dataset.csv','wb') as fout: 75 | for filename in features_files: 76 | print "merging " + filename 77 | with open(filename) as fin: 78 | header = next(fin) 79 | if not header_saved: 80 | fout.write(header) 81 | header_saved = True 82 | for line in fin: 83 | fout.write(line) 84 | print "Dataset merged!" 85 | 86 | 87 | 88 | def MergeSamples(data_folder): 89 | #Generate training dataset 90 | facet_files = glob.glob(data_folder + "/FacetTraffic_*.csv") 91 | 92 | header_saved = False 93 | with open(data_folder + '/facet_dataset.csv','wb') as fout: 94 | for filename in facet_files: 95 | with open(filename) as fin: 96 | header = next(fin) 97 | if not header_saved: 98 | fout.write(header) 99 | header_saved = True 100 | for line in fin: 101 | fout.write(line) 102 | 103 | 104 | def GenerateDatasets(data_folder): 105 | MergeSamples(data_folder) 106 | CombinedMerging(data_folder) 107 | #MergeDatasets(data_folder) 108 | 109 | -------------------------------------------------------------------------------- /DeltaShaperAnalysis/parseCaptures.py: -------------------------------------------------------------------------------- 1 | import dpkt 2 | import os 3 | from matplotlib import pyplot as plt 4 | from matplotlib.pyplot import cm 5 | import numpy as np 6 | import socket 7 | 8 | BIN_WIDTH = [15,20,50] 9 | 10 | InterPacketBins = [5000,2500,1000] 11 | 12 | 13 | auxFolder = 'auxFolder/' 14 | 15 | def RoundToNearest(n, m): 16 | r = n % m 17 | return n + m - r if r + r >= m else n - r 18 | 19 | def CreateBigrams(capsFolder, sampleFolder): 20 | for sample in os.listdir(capsFolder + sampleFolder): 21 | for binWidth in BIN_WIDTH: 22 | faux = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/bigrams_' + str(binWidth), 'w') 23 | f = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + "/packetCount_" + str(binWidth), 'r') 24 | 25 | lines = f.readlines() 26 | for index, line in enumerate(lines): 27 | try: 28 | faux.write(line.rstrip('\n') + "," + lines[index+1]) 29 | except IndexError: 30 | break #Reached last index, stop processing 31 | faux.close() 32 | f.close() 33 | 34 | 35 | def ComputeDelta(capsFolder, sampleFolder): 36 | for sample in os.listdir(capsFolder + sampleFolder): 37 | for binWidth in InterPacketBins: 38 | faux = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + 
'/deltaT_' + str(binWidth), 'w') 39 | f = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + "/timestamps", 'r') 40 | 41 | lines = f.readlines() 42 | for index, line in enumerate(lines): 43 | try: 44 | delta = "%0.6f" % (float(lines[index+1]) - (float(line.rstrip('\n')))) 45 | delta = float(delta) * 1000000 46 | faux.write("%s\n" % RoundToNearest(int(delta), int(binWidth))) 47 | except IndexError: 48 | break #Reached last index, stop processing 49 | 50 | faux.close() 51 | f.close() 52 | 53 | def ParseCapture(capsFolder, sampleFolder): 54 | 55 | fig = plt.figure() 56 | ax1 = fig.add_subplot(111) 57 | 58 | for sample in os.listdir(capsFolder + sampleFolder): 59 | descriptors = [] 60 | f = open(capsFolder + sampleFolder + "/" + sample + "/" + sample) 61 | print sample 62 | if not os.path.exists(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample): 63 | os.makedirs(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample) 64 | packet_count = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/packetCount_0', 'w') 65 | timestamps = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + "/timestamps", 'w') 66 | for binWidth in BIN_WIDTH: 67 | descriptors.append(open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/packetCount_' + str(binWidth), 'w')) 68 | 69 | 70 | pcap = dpkt.pcap.Reader(f) 71 | 72 | for ts, buf in pcap: 73 | eth = dpkt.ethernet.Ethernet(buf) 74 | ip_hdr = eth.data 75 | try: 76 | if (eth.type != dpkt.ethernet.ETH_TYPE_IP and eth.type != dpkt.ethernet.ETH_TYPE_IP6): 77 | continue 78 | if eth.type != dpkt.ethernet.ETH_TYPE_IP6: 79 | src_ip_addr_str = socket.inet_ntoa(ip_hdr.src) 80 | else: 81 | src_ip_addr_str = socket.inet_ntop(socket.AF_INET6, ip_hdr.src) 82 | 83 | if (ip_hdr.p == 17 and src_ip_addr_str == '172.31.0.19'): 84 | for i, descript in enumerate(BIN_WIDTH): 85 | descriptors[i].write("%s\n" % RoundToNearest(len(buf), BIN_WIDTH[i])) 86 | timestamps.write("{0:.6f}".format(ts) + "\n") 87 | packet_count.write("%s\n" % len(buf)) 88 | 89 | except Exception as e: 90 | print "[Exception]" + str(e) 91 | packet_count.close() 92 | timestamps.close() 93 | for i, descript in enumerate(BIN_WIDTH): 94 | descriptors[i].close() 95 | f.close() 96 | 97 | 98 | if __name__ == "__main__": 99 | sampleFolders = ["TrafficCaptures/480Resolution/"] 100 | modeFolders = ["RegularTraffic","DeltaShaperTraffic_320", "DeltaShaperTraffic_160"] 101 | 102 | for sampleFolder in sampleFolders: 103 | for modeFolder in modeFolders: 104 | if not os.path.exists(auxFolder + sampleFolder + modeFolder): 105 | os.makedirs(auxFolder + sampleFolder + modeFolder) 106 | ParseCapture(sampleFolder, modeFolder) 107 | CreateBigrams(sampleFolder, modeFolder) 108 | #ComputeDelta(sampleFolder, modeFolder) 109 | -------------------------------------------------------------------------------- /FacetAnalysis/ParseCaptures.py: -------------------------------------------------------------------------------- 1 | import dpkt 2 | import os 3 | from matplotlib import pyplot as plt 4 | from matplotlib.pyplot import cm 5 | import numpy as np 6 | import socket 7 | 8 | BIN_WIDTH = [15,20,50] 9 | 10 | InterPacketBins = [5000,2500,1000] 11 | 12 | 13 | auxFolder = 'auxFolder/' 14 | 15 | def RoundToNearest(n, m): 16 | r = n % m 17 | return n + m - r if r + r >= m else n - r 18 | 19 | def CreateBigrams(capsFolder, sampleFolder): 20 | for sample in os.listdir(capsFolder + 
sampleFolder): 21 | for binWidth in BIN_WIDTH: 22 | faux = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/bigrams_' + str(binWidth), 'w') 23 | f = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + "/packetCount_" + str(binWidth), 'r') 24 | 25 | lines = f.readlines() 26 | for index, line in enumerate(lines): 27 | try: 28 | faux.write(line.rstrip('\n') + "," + lines[index+1]) 29 | except IndexError: 30 | break #Reached last index, stop processing 31 | faux.close() 32 | f.close() 33 | 34 | 35 | def ComputeDelta(capsFolder, sampleFolder): 36 | for sample in os.listdir(capsFolder + sampleFolder): 37 | for binWidth in InterPacketBins: 38 | faux = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/deltaT_' + str(binWidth), 'w') 39 | f = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + "/timestamps", 'r') 40 | 41 | lines = f.readlines() 42 | for index, line in enumerate(lines): 43 | try: 44 | delta = "%0.6f" % (float(lines[index+1]) - (float(line.rstrip('\n')))) 45 | delta = float(delta) * 1000000 46 | faux.write("%s\n" % RoundToNearest(int(delta), int(binWidth))) 47 | except IndexError: 48 | break #Reached last index, stop processing 49 | 50 | faux.close() 51 | f.close() 52 | 53 | def ParseCapture(capsFolder, sampleFolder): 54 | 55 | fig = plt.figure() 56 | ax1 = fig.add_subplot(111) 57 | 58 | for sample in os.listdir(capsFolder + sampleFolder): 59 | descriptors = [] 60 | f = open(capsFolder + sampleFolder + "/" + sample + "/" + sample) 61 | print sample 62 | if not os.path.exists(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample): 63 | os.makedirs(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample) 64 | packet_count = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/packetCount_0', 'w') 65 | timestamps = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + "/timestamps", 'w') 66 | for binWidth in BIN_WIDTH: 67 | descriptors.append(open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/packetCount_' + str(binWidth), 'w')) 68 | 69 | 70 | pcap = dpkt.pcap.Reader(f) 71 | 72 | for ts, buf in pcap: 73 | eth = dpkt.ethernet.Ethernet(buf) 74 | ip_hdr = eth.data 75 | try: 76 | if (eth.type != dpkt.ethernet.ETH_TYPE_IP and eth.type != dpkt.ethernet.ETH_TYPE_IP6): 77 | continue 78 | if eth.type != dpkt.ethernet.ETH_TYPE_IP6: 79 | src_ip_addr_str = socket.inet_ntoa(ip_hdr.src) 80 | else: 81 | src_ip_addr_str = socket.inet_ntop(socket.AF_INET6, ip_hdr.src) 82 | 83 | if (ip_hdr.p == 17 and src_ip_addr_str == '172.31.0.19'): 84 | for i, descript in enumerate(BIN_WIDTH): 85 | descriptors[i].write("%s\n" % RoundToNearest(len(buf), BIN_WIDTH[i])) 86 | timestamps.write("{0:.6f}".format(ts) + "\n") 87 | packet_count.write("%s\n" % len(buf)) 88 | 89 | except Exception as e: 90 | print "[Exception]" + str(e) 91 | packet_count.close() 92 | timestamps.close() 93 | for i, descript in enumerate(BIN_WIDTH): 94 | descriptors[i].close() 95 | f.close() 96 | 97 | 98 | if __name__ == "__main__": 99 | sampleFolders = ["TrafficCaptures/240Resolution/"] 100 | modeFolders = ["RegularTraffic_Christmas","FacetTraffic_12.5_Christmas","FacetTraffic_25_Christmas","FacetTraffic_50_Christmas"] #"CensoredTraffic_Christmas" 101 | 102 | for sampleFolder in sampleFolders: 103 | for modeFolder in modeFolders: 104 | if not os.path.exists(auxFolder + sampleFolder 
+ modeFolder): 105 | os.makedirs(auxFolder + sampleFolder + modeFolder) 106 | ParseCapture(sampleFolder, modeFolder) 107 | CreateBigrams(sampleFolder, modeFolder) 108 | #ComputeDelta(sampleFolder, modeFolder) 109 | -------------------------------------------------------------------------------- /CovertCastAnalysis/ParseCaptures.py: -------------------------------------------------------------------------------- 1 | import dpkt 2 | import os 3 | from matplotlib import pyplot as plt 4 | from matplotlib.pyplot import cm 5 | import numpy as np 6 | import socket 7 | 8 | BIN_WIDTH = [15, 20, 50] 9 | 10 | InterPacketBins = [5000,2500,1000] 11 | 12 | 13 | auxFolder = 'auxFolder/' 14 | 15 | def RoundToNearest(n, m): 16 | r = n % m 17 | return n + m - r if r + r >= m else n - r 18 | 19 | def CreateBigrams(capsFolder, sampleFolder): 20 | for sample in os.listdir(capsFolder + sampleFolder): 21 | for binWidth in BIN_WIDTH: 22 | faux = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/bigrams_' + str(binWidth), 'w') 23 | f = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + "/packetCount_" + str(binWidth), 'r') 24 | 25 | lines = f.readlines() 26 | for index, line in enumerate(lines): 27 | try: 28 | faux.write(line.rstrip('\n') + "," + lines[index+1]) 29 | except IndexError: 30 | break #Reached last index, stop processing 31 | faux.close() 32 | f.close() 33 | 34 | 35 | def ComputeDelta(capsFolder, sampleFolder): 36 | for sample in os.listdir(capsFolder + sampleFolder): 37 | for binWidth in InterPacketBins: 38 | faux = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/deltaT_' + str(binWidth), 'w') 39 | f = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + "/timestamps", 'r') 40 | 41 | lines = f.readlines() 42 | for index, line in enumerate(lines): 43 | try: 44 | delta = "%0.6f" % (float(lines[index+1]) - (float(line.rstrip('\n')))) 45 | delta = float(delta) * 1000000 46 | faux.write("%s\n" % RoundToNearest(int(delta), int(binWidth))) 47 | except IndexError: 48 | break #Reached last index, stop processing 49 | 50 | faux.close() 51 | f.close() 52 | 53 | def ParseCapture(capsFolder, sampleFolder): 54 | 55 | fig = plt.figure() 56 | ax1 = fig.add_subplot(111) 57 | 58 | for sample in os.listdir(capsFolder + sampleFolder): 59 | descriptors = [] 60 | f = open(capsFolder + sampleFolder + "/" + sample) 61 | print sample 62 | if not os.path.exists(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample): 63 | os.makedirs(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample) 64 | packet_count = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/packetCount_0', 'w') 65 | timestamps = open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + "/timestamps", 'w') 66 | for binWidth in BIN_WIDTH: 67 | print auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/packetCount_' + str(binWidth) 68 | descriptors.append(open(auxFolder + os.path.dirname(capsFolder) + "/" + sampleFolder + "/" + sample + '/packetCount_' + str(binWidth), 'w')) 69 | 70 | 71 | pcap = dpkt.pcapng.Reader(f) 72 | 73 | for ts, buf in pcap: 74 | eth = dpkt.ethernet.Ethernet(buf) 75 | ip_hdr = eth.data 76 | try: 77 | if (eth.type != dpkt.ethernet.ETH_TYPE_IP and eth.type != dpkt.ethernet.ETH_TYPE_IP6): 78 | continue 79 | if eth.type != dpkt.ethernet.ETH_TYPE_IP6: 80 | src_ip_addr_str = 
socket.inet_ntoa(ip_hdr.src) 81 | else: 82 | src_ip_addr_str = socket.inet_ntop(socket.AF_INET6, ip_hdr.src) 83 | 84 | if (ip_hdr.p == 17 and (ip_hdr.data.sport == 443 or ip_hdr.data.dport == 443)): 85 | if(ip_hdr.data.sport == 443): 86 | for i, descript in enumerate(BIN_WIDTH): 87 | descriptors[i].write("%s\n" % RoundToNearest(len(buf), BIN_WIDTH[i])) 88 | timestamps.write("{0:.6f}".format(ts) + "\n") 89 | packet_count.write("%s\n" % len(buf)) 90 | 91 | except Exception as e: 92 | print "[Exception]" + str(e) 93 | packet_count.close() 94 | timestamps.close() 95 | for i, descript in enumerate(BIN_WIDTH): 96 | descriptors[i].close() 97 | f.close() 98 | 99 | 100 | if __name__ == "__main__": 101 | sampleFolders = ["TrafficCaptures/"] 102 | 103 | modeFolders = ["YouTube_home_world_live","CovertCast_home_world"] 104 | 105 | for sampleFolder in sampleFolders: 106 | for modeFolder in modeFolders: 107 | if not os.path.exists(auxFolder + sampleFolder + modeFolder): 108 | os.makedirs(auxFolder + sampleFolder + modeFolder) 109 | #ParseCapture(sampleFolder, modeFolder) 110 | CreateBigrams(sampleFolder, modeFolder) 111 | #ComputeDelta(sampleFolder, modeFolder) 112 | -------------------------------------------------------------------------------- /CovertCastAnalysis/KL_classifier.py: -------------------------------------------------------------------------------- 1 | import dpkt 2 | import os 3 | from matplotlib import pyplot as plt 4 | from matplotlib.pyplot import cm 5 | import numpy as np 6 | import socket 7 | import collections 8 | from scipy.stats import entropy 9 | 10 | auxFolder = 'auxFolder/' 11 | 12 | cfgs = [ 13 | ["YouTube_home_world_live", 14 | "CovertCast_home_world"] 15 | ] 16 | 17 | 18 | BIN_WIDTH = [15] 19 | 20 | 21 | def ComputeFrequencyDistributions(sampleFolder, cfg, binWidth): 22 | freq_dists = [] 23 | 24 | for mode in cfg: 25 | #Compute frequency distribution for A and B 26 | freq_dist = [] 27 | for sample in os.listdir(sampleFolder + mode): 28 | 29 | f = open(auxFolder + os.path.dirname(sampleFolder) + "/" + mode + "/" + sample + '/packetCount_' + str(binWidth), 'r') 30 | 31 | bin_dict = {} 32 | bins=[] 33 | #Generate the set of all possible bins 34 | for i in range(0,1500, binWidth): 35 | bin_dict[str(i).replace(" ", "")] = 1 36 | 37 | 38 | lines = f.readlines() 39 | for line in lines: 40 | try: 41 | bins.append(line.rstrip('\n')) 42 | except IndexError: 43 | break #Reached last index, stop processing 44 | f.close() 45 | 46 | #Account for each bin elem 47 | for i in bins: 48 | bin_dict[str(i)]+=1 49 | 50 | #Order bin_key : num_packets 51 | od_dict = collections.OrderedDict(sorted(bin_dict.items(), key=lambda t: float(t[0]))) 52 | bin_list = [] 53 | for i in od_dict: 54 | bin_list.append(float(od_dict[i])) 55 | 56 | #Build up the list of a distribution samples freq dist 57 | freq_dist.append(bin_list) 58 | #Build up the list of all freq dists for different sample folders 59 | freq_dists.append(freq_dist) 60 | 61 | return freq_dists 62 | 63 | def KL_Classify(freq_dists): 64 | # A vs A 65 | AvsA_matrix = [] 66 | for i in range(0, len(freq_dists[0])): 67 | AxVsAy = [] 68 | for j in range(0, len(freq_dists[0])): 69 | d = entropy(freq_dists[0][i],freq_dists[0][j]) 70 | AxVsAy.append(d) 71 | AvsA_matrix.append(AxVsAy) 72 | 73 | # A vs B 74 | AvsB_matrix = [] 75 | for i in range(0,len(freq_dists[0])): 76 | AxVsBy = [] 77 | for j in range(0, len(freq_dists[1])): 78 | d = entropy(freq_dists[0][i],freq_dists[1][j]) 79 | AxVsBy.append(d) 80 | AvsB_matrix.append(AxVsBy) 81 | 82 | # B vs B 83 | 
BvsB_matrix = [] 84 | for i in range(0, len(freq_dists[1])): 85 | BxVsBy = [] 86 | for j in range(0, len(freq_dists[1])): 87 | d = entropy(freq_dists[1][i],freq_dists[1][j]) 88 | BxVsBy.append(d) 89 | BvsB_matrix.append(BxVsBy) 90 | 91 | # B vs A 92 | BvsA_matrix = [] 93 | for i in range(0,len(freq_dists[1])): 94 | BxVsAy = [] 95 | for j in range(0, len(freq_dists[0])): 96 | d = entropy(freq_dists[1][i],freq_dists[0][j]) 97 | BxVsAy.append(d) 98 | BvsA_matrix.append(BxVsAy) 99 | 100 | ########################## 101 | #Compute success metric 102 | #Set A - YouTube 103 | #Set B - CovertCast 104 | #TP = Correctly identify CovertCast 105 | #TN = Correctly identify YouTube 106 | ########################## 107 | 108 | total_KL_distances = 0 109 | success = 0 110 | TrueNegatives = 0 111 | TruePositives = 0 112 | 113 | #A - B 114 | for i in range(0,len(freq_dists[0])): 115 | for j in range(0, len(AvsA_matrix[i])): 116 | for k in range(0, len(AvsB_matrix[i])): 117 | total_KL_distances+=1 118 | if(AvsA_matrix[i][j] < AvsB_matrix[i][k]): 119 | success += 1 120 | TrueNegatives += 1 121 | # B - A 122 | for i in range(0,len(freq_dists[1])): 123 | for j in range(0, len(BvsB_matrix[i])): 124 | for k in range(0, len(BvsA_matrix[i])): 125 | total_KL_distances +=1 126 | if(BvsB_matrix[i][j] < BvsA_matrix[i][k]): 127 | success += 1 128 | TruePositives += 1 129 | 130 | print "Total Accuracy: " + str(success / float(total_KL_distances)) 131 | print "TruePositives: " + str(TruePositives / float(total_KL_distances/2.0)) 132 | print "TrueNegatives: " + str(TrueNegatives / float(total_KL_distances/2.0)) 133 | 134 | 135 | if __name__ == "__main__": 136 | 137 | sampleFolders = ['TrafficCaptures/'] 138 | 139 | for sampleFolder in sampleFolders: 140 | print "###########################" 141 | print os.path.dirname(sampleFolder) 142 | print "###########################" 143 | for cfg in cfgs: 144 | 145 | print "KL classifier - Regular vs " + cfg[1] 146 | for binWidth in BIN_WIDTH: 147 | print "Bin Width: " + str(binWidth) 148 | KL_Classify(ComputeFrequencyDistributions(sampleFolder, cfg, binWidth)) 149 | -------------------------------------------------------------------------------- /DeltaShaperAnalysis/KL_classifier.py: -------------------------------------------------------------------------------- 1 | import dpkt 2 | import os 3 | from matplotlib import pyplot as plt 4 | from matplotlib.pyplot import cm 5 | import numpy as np 6 | import socket 7 | import collections 8 | from scipy.stats import entropy 9 | 10 | auxFolder = 'auxFolder/' 11 | 12 | cfgs = [ 13 | ["RegularTraffic", 14 | "DeltaShaperTraffic_320"], 15 | ["RegularTraffic", 16 | "DeltaShaperTraffic_160"]] 17 | 18 | 19 | BIN_WIDTH = [15] 20 | 21 | def ComputeFrequencyDistributions(sampleFolder, cfg, binWidth): 22 | freq_dists = [] 23 | 24 | for mode in cfg: 25 | #Compute frequency distribution for A and B 26 | freq_dist = [] 27 | for sample in os.listdir(sampleFolder + mode): 28 | 29 | f = open(auxFolder + os.path.dirname(sampleFolder) + "/" + mode + "/" + sample + '/packetCount_' + str(binWidth), 'r') 30 | 31 | bin_dict = {} 32 | bins=[] 33 | #Generate the set of all possible bins 34 | for i in range(0,1500, binWidth): 35 | bin_dict[str(i).replace(" ", "")] = 1 36 | 37 | 38 | lines = f.readlines() 39 | for line in lines: 40 | try: 41 | bins.append(line.rstrip('\n')) 42 | except IndexError: 43 | break #Reached last index, stop processing 44 | f.close() 45 | 46 | #Account for each bin elem 47 | for i in bins: 48 | bin_dict[str(i)]+=1 49 | 50 | #Order bin_key : 
num_packets 51 | od_dict = collections.OrderedDict(sorted(bin_dict.items(), key=lambda t: float(t[0]))) 52 | bin_list = [] 53 | for i in od_dict: 54 | bin_list.append(float(od_dict[i])) 55 | 56 | #Build up the list of a distribution samples freq dist 57 | freq_dist.append(bin_list) 58 | #Build up the list of all freq dists for different sample folders 59 | freq_dists.append(freq_dist) 60 | 61 | return freq_dists 62 | 63 | def KL_Classify(freq_dists): 64 | # A vs A 65 | AvsA_matrix = [] 66 | for i in range(0, len(freq_dists[0])): 67 | AxVsAy = [] 68 | for j in range(0, len(freq_dists[0])): 69 | d = entropy(freq_dists[0][i],freq_dists[0][j]) 70 | AxVsAy.append(d) 71 | AvsA_matrix.append(AxVsAy) 72 | 73 | # A vs B 74 | AvsB_matrix = [] 75 | for i in range(0,len(freq_dists[0])): 76 | AxVsBy = [] 77 | for j in range(0, len(freq_dists[1])): 78 | d = entropy(freq_dists[0][i],freq_dists[1][j]) 79 | AxVsBy.append(d) 80 | AvsB_matrix.append(AxVsBy) 81 | 82 | # B vs B 83 | BvsB_matrix = [] 84 | for i in range(0, len(freq_dists[1])): 85 | BxVsBy = [] 86 | for j in range(0, len(freq_dists[1])): 87 | d = entropy(freq_dists[1][i],freq_dists[1][j]) 88 | BxVsBy.append(d) 89 | BvsB_matrix.append(BxVsBy) 90 | 91 | # B vs A 92 | BvsA_matrix = [] 93 | for i in range(0,len(freq_dists[1])): 94 | BxVsAy = [] 95 | for j in range(0, len(freq_dists[0])): 96 | d = entropy(freq_dists[1][i],freq_dists[0][j]) 97 | BxVsAy.append(d) 98 | BvsA_matrix.append(BxVsAy) 99 | 100 | ########################## 101 | #Compute success metric 102 | #Set A - YouTube 103 | #Set B - CovertCast 104 | #TP = Correctly identify CovertCast 105 | #TN = Correctly identify YouTube 106 | ########################## 107 | 108 | total_KL_distances = 0 109 | success = 0 110 | TrueNegatives = 0 111 | TruePositives = 0 112 | 113 | #A - B 114 | for i in range(0,len(freq_dists[0])): 115 | for j in range(0, len(AvsA_matrix[i])): 116 | for k in range(0, len(AvsB_matrix[i])): 117 | total_KL_distances+=1 118 | if(AvsA_matrix[i][j] < AvsB_matrix[i][k]): 119 | success += 1 120 | TrueNegatives += 1 121 | # B - A 122 | for i in range(0,len(freq_dists[1])): 123 | for j in range(0, len(BvsB_matrix[i])): 124 | for k in range(0, len(BvsA_matrix[i])): 125 | total_KL_distances +=1 126 | if(BvsB_matrix[i][j] < BvsA_matrix[i][k]): 127 | success += 1 128 | TruePositives += 1 129 | 130 | print "Total Accuracy: " + str(success / float(total_KL_distances)) 131 | print "TruePositives: " + str(TruePositives / float(total_KL_distances/2.0)) 132 | print "TrueNegatives: " + str(TrueNegatives / float(total_KL_distances/2.0)) 133 | 134 | 135 | if __name__ == "__main__": 136 | 137 | sampleFolders = ['TrafficCaptures/480Resolution/'] 138 | 139 | for sampleFolder in sampleFolders: 140 | print "###########################" 141 | print os.path.dirname(sampleFolder) 142 | print "###########################" 143 | for cfg in cfgs: 144 | print "KL classifier - " + cfg[0] + " vs " + cfg[1] 145 | for binWidth in BIN_WIDTH: 146 | print "Bin Width: " + str(binWidth) 147 | KL_Classify(ComputeFrequencyDistributions(sampleFolder, cfg, binWidth)) 148 | -------------------------------------------------------------------------------- /FacetAnalysis/KL_classifier.py: -------------------------------------------------------------------------------- 1 | import dpkt 2 | import os 3 | from matplotlib import pyplot as plt 4 | from matplotlib.pyplot import cm 5 | import numpy as np 6 | import socket 7 | import time 8 | import collections 9 | from scipy.stats import entropy 10 | 11 | auxFolder = 
'auxFolder/' 12 | 13 | cfgs = [ 14 | ["RegularTraffic_Christmas", 15 | "FacetTraffic_12.5_Christmas"], 16 | ["RegularTraffic_Christmas", 17 | "FacetTraffic_25_Christmas"], 18 | ["RegularTraffic_Christmas", 19 | "FacetTraffic_50_Christmas"] 20 | ] 21 | 22 | 23 | BIN_WIDTH = [15] 24 | 25 | def ComputeFrequencyDistributions(sampleFolder, cfg, binWidth): 26 | freq_dists = [] 27 | 28 | for mode in cfg: 29 | #Compute frequency distribution for A and B 30 | freq_dist = [] 31 | for sample in os.listdir(sampleFolder + mode): 32 | 33 | f = open(auxFolder + os.path.dirname(sampleFolder) + "/" + mode + "/" + sample + '/packetCount_' + str(binWidth), 'r') 34 | 35 | bin_dict = {} 36 | bins=[] 37 | #Generate the set of all possible bins 38 | for i in range(0,1500, binWidth): 39 | bin_dict[str(i).replace(" ", "")] = 1 40 | 41 | 42 | lines = f.readlines() 43 | for line in lines: 44 | try: 45 | bins.append(line.rstrip('\n')) 46 | except IndexError: 47 | break #Reached last index, stop processing 48 | f.close() 49 | 50 | #Account for each bin elem 51 | for i in bins: 52 | bin_dict[str(i)]+=1 53 | 54 | #Order bin_key : num_packets 55 | od_dict = collections.OrderedDict(sorted(bin_dict.items(), key=lambda t: float(t[0]))) 56 | bin_list = [] 57 | for i in od_dict: 58 | bin_list.append(float(od_dict[i])) 59 | 60 | #Build up the list of a distribution samples freq dist 61 | freq_dist.append(bin_list) 62 | #Build up the list of all freq dists for different sample folders 63 | freq_dists.append(freq_dist) 64 | 65 | return freq_dists 66 | 67 | def KL_Classify(freq_dists): 68 | 69 | #Time measurement - avg single KL 70 | """times = [] 71 | for j in range(0, len(freq_dists[1])): 72 | start_time = time.time() 73 | d = entropy(freq_dists[0][0],freq_dists[1][j]) 74 | end_time = time.time() 75 | times.append(end_time - start_time) 76 | print "Avg KL: " + "{0:.5f}".format(np.mean(times,axis=0))""" 77 | 78 | 79 | #time measurement - avg classification 80 | times = [] 81 | start_time = time.time() 82 | for j in range(0, len(freq_dists[1])): 83 | d = entropy(freq_dists[0][0],freq_dists[1][j]) 84 | for j in range(0, len(freq_dists[1])): 85 | d = entropy(freq_dists[0][1],freq_dists[1][j]) 86 | 87 | 88 | end_time = time.time() 89 | times.append(end_time - start_time) 90 | #print "Avg sample classification time: " + "{0:.5f}".format(end_time - start_time) 91 | 92 | 93 | ############################### 94 | #Model Building 95 | ############################### 96 | start_time = time.time() 97 | # A vs A 98 | AvsA_matrix = [] 99 | for i in range(0, len(freq_dists[0])): 100 | AxVsAy = [] 101 | for j in range(0, len(freq_dists[0])): 102 | d = entropy(freq_dists[0][i],freq_dists[0][j]) 103 | AxVsAy.append(d) 104 | AvsA_matrix.append(AxVsAy) 105 | 106 | 107 | 108 | 109 | # A vs B 110 | AvsB_matrix = [] 111 | for i in range(0,len(freq_dists[0])): 112 | AxVsBy = [] 113 | start_time = time.time() 114 | for j in range(0, len(freq_dists[1])): 115 | d = entropy(freq_dists[0][i],freq_dists[1][j]) 116 | AxVsBy.append(d) 117 | AvsB_matrix.append(AxVsBy) 118 | 119 | 120 | 121 | # B vs B 122 | BvsB_matrix = [] 123 | for i in range(0, len(freq_dists[1])): 124 | BxVsBy = [] 125 | for j in range(0, len(freq_dists[1])): 126 | d = entropy(freq_dists[1][i],freq_dists[1][j]) 127 | BxVsBy.append(d) 128 | BvsB_matrix.append(BxVsBy) 129 | 130 | # B vs A 131 | BvsA_matrix = [] 132 | for i in range(0,len(freq_dists[1])): 133 | BxVsAy = [] 134 | for j in range(0, len(freq_dists[0])): 135 | d = entropy(freq_dists[1][i],freq_dists[0][j]) 136 | BxVsAy.append(d) 
137 | BvsA_matrix.append(BxVsAy) 138 | 139 | end_time = time.time() 140 | print "Model Building Time: " + "{0:.5f}".format(end_time - start_time) 141 | ########################## 142 | #Compute success metric 143 | #Set A - YouTube 144 | #Set B - CovertCast 145 | #TP = Correctly identify CovertCast 146 | #TN = Correctly identify YouTube 147 | ########################## 148 | 149 | total_KL_distances = 0 150 | success = 0 151 | TrueNegatives = 0 152 | TruePositives = 0 153 | 154 | #A - B 155 | for i in range(0,len(freq_dists[0])): 156 | for j in range(0, len(AvsA_matrix[i])): 157 | for k in range(0, len(AvsB_matrix[i])): 158 | total_KL_distances+=1 159 | if(AvsA_matrix[i][j] < AvsB_matrix[i][k]): 160 | success += 1 161 | TrueNegatives += 1 162 | # B - A 163 | for i in range(0,len(freq_dists[1])): 164 | for j in range(0, len(BvsB_matrix[i])): 165 | for k in range(0, len(BvsA_matrix[i])): 166 | total_KL_distances +=1 167 | if(BvsB_matrix[i][j] < BvsA_matrix[i][k]): 168 | success += 1 169 | TruePositives += 1 170 | 171 | 172 | print "Total Accuracy: " + str(success / float(total_KL_distances)) 173 | print "TruePositives: " + str(TruePositives / float(total_KL_distances/2.0)) 174 | print "TrueNegatives: " + str(TrueNegatives / float(total_KL_distances/2.0)) 175 | 176 | 177 | if __name__ == "__main__": 178 | 179 | sampleFolders = ['TrafficCaptures/240Resolution/'] 180 | 181 | for sampleFolder in sampleFolders: 182 | print "###########################" 183 | print os.path.dirname(sampleFolder) 184 | print "###########################" 185 | for cfg in cfgs: 186 | print "KL classifier - " + cfg[0] + " vs " + cfg[1] 187 | for binWidth in BIN_WIDTH: 188 | print "Bin Width: " + str(binWidth) 189 | KL_Classify(ComputeFrequencyDistributions(sampleFolder, cfg, binWidth)) 190 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Index 2 | 3 | 1. Classifiers description 4 | 2. Extracting features for feeding classifiers 5 | 3. Running the classifiers 6 | 4. Watching some more figures 7 | 5. Full example for Facet analysis 8 | 9 | #### Python 2.7 package requirements 10 | Install the required Python packages by running: 11 | `pip install -r requirements.txt` 12 | 13 | [**Note**] Not using virtualenvs, packages will be installed system-wide. 14 | 15 | #### Traffic Captures Location 16 | Traffic captures are available [here](https://turbina.gsd.inesc-id.pt/resources/resources.html). 17 | 18 | Copy each `TrafficCaptures` folder into the respective path in **MPTAnalysis** repo: 19 | 20 | MPTAnalysis/FacetAnalysis/TrafficCaptures 21 | MPTAnalysis/DeltaShaperAnalysis/TrafficCaptures 22 | MPTAnalysis/CovertCastAnalysis/TrafficCaptures 23 | 24 | ## 1- Classifiers Description 25 | ### Similarity-based classifiers 26 | *EMD classifier:* 27 | `EMD_classifier.py` -- This file includes the threshold-based EMD classifier as proposed in DeltaShaper. 28 | 29 | *Chi-Square classifier:* 30 | `X2_classifier.py`-- This file includes the Chi-Square test-based classifier as proposed in Facet. 31 | 32 | *Kullback-Leibler classifier:* 33 | `KL_classifier.py`-- This file includes the Kullback-Leibler-divergence classifier as proposed in CovertCast. 34 | 35 | ### Decision Tree-based classifiers 36 | *Decision Tree, Random Forest, and XGBoost:* 37 | `xgboost_classifier.py` -- This file includes the three decision tree-based classifiers used in our paper. 
### Semi-Supervised / Unsupervised
*Autoencoder:*
`autoencoder.py` -- This file contains the TensorFlow code required to run our semi-supervised autoencoder.

*One-Class SVM:*
`OCSVM.py` -- This file includes the One-Class SVM classifier.

*Isolation Forests:*
`IsolationForests.py` -- This file includes the Isolation Forests classifier.

## 2 - Extracting features for feeding classifiers
### Similarity-based classifiers
To use our similarity-based classifiers, raw packet captures must first be processed in order to extract binned packet sizes / bi-grams of packet sizes. `ParseCaptures.py` includes the code for parsing the raw packet captures into packet length bins of size [15, 20, 50], which are used by the [KL, X2, EMD] classifiers, respectively. Extracted features are placed in a newly generated folder called `auxFolder`.

Although `ParseCaptures.py` can also extract inter-packet timing features, we do not use these with our similarity-based classifiers.

[**Disclaimer**] Extraction can take a while; this code was not parallelized since it is only a one-time execution.

### Remaining classifiers
For the remaining classifiers, we extract features and build datasets stored in `.csv` files.

`extractFeatures.py` contains the code for extracting our two different sets of features (binned packet lengths / summary statistics) from existing packet captures. It defines one function per feature set, respectively `FeatureExtractionPLBenchmark` and `FeatureExtractionStatsBenchmark`; either can be called in the main code. `GenerateDatasets` then combines the extracted sets of features and builds the datasets.

Feature datasets are stored in the `FeatureSets` folder. For instance, `PL_60` stores the datasets obtained by extracting binned packet lengths over an interval of 60 seconds of the whole packet trace.

## 3 - Running the classifiers
### Similarity-based classifiers
`X2_classifier.py` provides two main analysis functions, which can be selected interchangeably in its `main`. `Prepare_X_RatioReproduction` reproduces the results of Facet's paper, outputting the results of a classifier with changing deltas (and enabling us to plot a ROC curve). `Prepare_X_Fixed` obtains fixed classification results for comparison with the Kullback-Leibler classifier, which only outputs fixed classification rates.

The script creates a folder called `X2` holding AUC plots and serialized TPR/FPR rates, used later to produce the figures included in the paper.

[**Warning**] This code is not parallelized. Building the models for this classifier is an overnight effort (at least for the Facet data).

----
`EMD_classifier.py` can simply be executed to output the classifier's results with changing deltas. The script prints the delta threshold at which the classifier reaches its maximum accuracy, in order to compare with the Kullback-Leibler classifier, which only outputs fixed classification rates.

The script creates a folder called `EMD` holding AUC plots and serialized TPR/FPR rates, used later to produce the figures included in the paper.

----
`KL_classifier.py` outputs fixed classification results.
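
The decision rule behind `KL_classifier.py` is a pairwise Kullback-Leibler test over binned packet-length frequency distributions, computed with `scipy.stats.entropy`. The snippet below is a minimal illustration of that computation; the two histograms are made up for the example, whereas the real script builds them from the `packetCount_15` files in `auxFolder`.

    import numpy as np
    from scipy.stats import entropy

    bin_width = 15
    n_bins = 1500 / bin_width          # packet lengths are binned up to 1500 bytes

    # Toy frequency distributions; every bin starts at 1 (as in the script) so the
    # KL divergence is always defined.
    regular_hist = np.ones(n_bins)
    regular_hist[:10] += 40            # regular traffic: mostly small packets
    covert_hist = np.ones(n_bins)
    covert_hist[-10:] += 40            # covert traffic: mostly full-size packets

    # entropy(p, q) normalizes both histograms and returns D(p || q); a sample is
    # attributed to the class whose samples it is closest to under this divergence.
    print "D(regular || covert) = " + str(entropy(regular_hist, covert_hist))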
### Decision Tree-based classifiers
`xgboost_classifier.py` outputs the classification results of our three decision tree-based classifiers at different True Positive / False Positive rate trade-offs. For training these classifiers, data is assumed to be fully labeled.

The script creates a folder called `xgBoost` for storing the ROC AUC figure of each classification effort, along with serialized data for building our paper's ROC figures and feature importance data.

[**Note**] In the `main` function, the variable `data_folder` must point to the folder containing the dataset extracted with the desired feature set. For our paper results, `FeatureSets/Stats_60` and `FeatureSets/PL_60` correspond to our feature sets based on summary statistics and binned packet lengths, respectively.

### Semi-Supervised / Unsupervised
`OCSVM.py` runs a grid search on the parameter space of (nu, gamma) for OCSVM. It outputs the average and maximum AUC obtained when classifying data points against a representation learned from legitimate video transmissions only. In its `main` function, the variable `data_folder` must point to the folder containing the dataset extracted with the desired feature set.

`autoencoder.py` runs a grid search on the parameter space of (neurons in the hidden layer, size of the compressed representation layer) for our autoencoder. It outputs the average and maximum AUC obtained when classifying data points against a representation learned from legitimate video transmissions only. In its `main` function, the variable `data_folder` must point to the folder containing the dataset extracted with the desired feature set.

`IsolationForests.py` runs a grid search on the parameter space of (number of trees, samples per tree) for our Isolation Forest. It outputs the average and maximum AUC obtained after attempting to classify unlabeled data points (a rough sketch of this kind of grid search is shown at the end of Section 4 below). In its `main` function, the variable `data_folder` must point to the folder containing the dataset extracted with the desired feature set.

The script creates a folder called `Isolation` for storing ROC AUC figures.

[**Note**] In the `main` function, the variable `data_folder` must point to the folder containing the dataset extracted with the desired feature set. For our paper results, `FeatureSets/Stats_60` and `FeatureSets/PL_60` correspond to our feature sets based on summary statistics and binned packet lengths, respectively.

## 4 - Watching some more figures
In the case of the Facet / DeltaShaper analysis, there is a folder called `Figures`. This folder includes `generateFigures.py`, which generates the figures used in our paper, plus some more detail about feature analysis.
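
To make the grid-search description in Section 3 concrete, here is a rough, self-contained sketch of the (number of trees, samples per tree) search that `IsolationForests.py` performs. It is an illustration rather than an excerpt: the stand-in data, the parameter grids, and the 0/1 labeling convention are assumptions made for the example, whereas the real script reads the `FeatureSets` datasets and also saves ROC figures.

    import numpy as np
    from sklearn.ensemble import IsolationForest
    from sklearn.metrics import roc_auc_score

    # Stand-in data so the snippet runs on its own: 200 "regular" samples plus
    # 20 "covert" outliers drawn from a shifted distribution.
    rng = np.random.RandomState(1)
    X = np.vstack([rng.normal(0, 1, (200, 5)), rng.normal(4, 1, (20, 5))])
    y = np.array([0] * 200 + [1] * 20)          # 1 marks the covert (anomalous) class

    aucs = []
    for n_trees in [50, 100, 200]:              # "number of trees"
        for samples_per_tree in [64, 128, 256]: # "samples per tree"
            forest = IsolationForest(n_estimators=n_trees,
                                     max_samples=samples_per_tree,
                                     random_state=1).fit(X)
            scores = -forest.decision_function(X)   # higher score = more anomalous
            aucs.append(roc_auc_score(y, scores))

    print "Avg AUC: " + str(np.mean(aucs)) + "  Max AUC: " + str(np.max(aucs))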
100 | 101 | ## 5 - Full example for Facet analysis 102 | 103 | #Parse raw .pcap files for generating features for similarity-based classifiers 104 | $ cd FacetAnalysis 105 | $ python ParseCaptures.py 106 | 107 | #Run any similarity-based classifier 108 | $ python [EMD_classifier.py, KL_classifier.py, X2_classifier.py] 109 | 110 | #Parse raw .pcap files for generating features for state-of-the-art ML algorithms 111 | $ python extractFeatures.py 112 | 113 | #Run any ML classifier 114 | $ python [xgboost_classifier.py, OCSVM.py, autoencoder.py, IsolationForests.py] 115 | 116 | #Generate paper figures 117 | $ cd Figures 118 | $ python generateFigures.py 119 | 120 | 121 | -------------------------------------------------------------------------------- /CovertCastAnalysis/EMD_classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import dpkt 3 | import subprocess 4 | import socket 5 | import os 6 | from random import randint 7 | import math 8 | from itertools import product 9 | import datetime 10 | from matplotlib import pyplot as plt 11 | from matplotlib.pyplot import cm 12 | from matplotlib.ticker import MultipleLocator, FormatStrFormatter 13 | import numpy as np 14 | from pyemd import emd 15 | import collections 16 | 17 | BIN_WIDTH = [50] 18 | folder = "auxFolder/" 19 | 20 | 21 | cfgs = [ 22 | ["YouTube_home_world_live", 23 | "CovertCast_home_world"] 24 | ] 25 | 26 | 27 | 28 | 29 | def GatherChatSamples(sampleFolder, baselines, binWidth): 30 | Samples = [] 31 | for baseline in baselines: 32 | for cap in os.listdir(folder + sampleFolder + baseline): 33 | Samples.append(folder + sampleFolder + baseline + "/" + cap + "/" + 'packetCount_' + str(binWidth)) 34 | return Samples 35 | 36 | def ComputeRate(sampleFolder, emdResults, num_irregular_samples, num_regular_samples, binWidth): 37 | deltas = np.arange(0.001, 1, 0.001) 38 | 39 | Sensitivity = [] 40 | Specificity = [] 41 | 42 | 43 | max_acc = 0 44 | max_delta = 0 45 | max_tpr = 0 46 | max_tnr = 0 47 | max_fpr = 0 48 | 49 | accuracy = 0 50 | for delta in deltas: 51 | FPositives = 0 52 | FNegatives = 0 53 | TPositives = 0 54 | TNegatives = 0 55 | 56 | 57 | #Positives are Facet samples classified as Facet 58 | for i, capEMD in enumerate(emdResults): 59 | if(capEMD > delta and i < num_regular_samples): # Regular baselines 60 | FPositives += 1 61 | if (capEMD < delta and i < num_regular_samples): 62 | TNegatives += 1 63 | if(capEMD <= delta and i >= num_regular_samples): #Irregular baseline 64 | FNegatives += 1 65 | if(capEMD > delta and i >= num_regular_samples): 66 | TPositives += 1 67 | """ 68 | #NEGATED 69 | for i, capEMD in enumerate(emdResults): 70 | if(capEMD > delta and i < num_regular_samples): # Regular baselines 71 | TNegatives += 1 72 | if (capEMD < delta and i < num_regular_samples): 73 | FPositives += 1 74 | if(capEMD <= delta and i >= num_regular_samples): #Irregular baseline 75 | TPositives += 1 76 | if(capEMD > delta and i >= num_regular_samples): 77 | FNegatives += 1 78 | """ 79 | Sensitivity.append(TPositives/(TPositives+float(FNegatives))) 80 | Specificity.append(TNegatives/(TNegatives+float(FPositives))) 81 | 82 | accuracy = (TPositives + TNegatives)/float(num_irregular_samples + num_regular_samples) 83 | if(accuracy > max_acc): 84 | max_acc = accuracy 85 | max_delta = delta 86 | max_tpr = TPositives/(TPositives+float(FNegatives)) 87 | max_tnr = TNegatives/(TNegatives+float(FPositives)) 88 | max_fpr = 1 - max_tnr 89 | 90 | fig = plt.figure() 91 | ax1 = 
fig.add_subplot(111) 92 | 93 | print "AUC" 94 | auc = np.trapz(np.array(Sensitivity), 1 - np.array(Specificity)) 95 | print auc 96 | #ROC Curve 97 | ax1.plot(1 - np.array(Specificity), np.array(Sensitivity), 'k.-', color='black', label = 'ROC (AUC = %0.2f)' % (auc)) 98 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 99 | ax1.grid(color='black', linestyle='dotted') 100 | 101 | plt.title('Receiver Operating Characteristic (ROC)') 102 | plt.xlabel('False Positive Rate', fontsize='x-large') 103 | plt.ylabel('True Positive Rate', fontsize='x-large') 104 | plt.legend(loc='lower right', fontsize='large') 105 | 106 | plt.setp(ax1.get_xticklabels(), fontsize=14) 107 | plt.setp(ax1.get_yticklabels(), fontsize=14) 108 | 109 | max_stats = "Max acc: " + str(max_acc) + " Max TPR:" + str(max_tpr) + " Max TNR:" + str(max_tnr) + " Max FPR:" + str(max_fpr) + " delta:" + str(max_delta) 110 | 111 | fig.savefig('EMD/' + sampleFolder + baselines[1] + '/Rate_' + str(binWidth) + '.pdf') # save the figure to file 112 | plt.close(fig) 113 | 114 | return max_stats 115 | 116 | 117 | def GenerateDists(samples, binWidth): 118 | dists = [] 119 | print "Building distributions" 120 | 121 | for sample in samples: 122 | #print sample 123 | f = open(sample, 'r') 124 | 125 | Gk = {} 126 | bins=[] 127 | #Generate the set of all possible bins 128 | for i in range(0,1500, binWidth): 129 | Gk[str(i).replace(" ", "")] = 0 130 | 131 | 132 | lines = f.readlines() 133 | for line in lines: 134 | try: 135 | bins.append(line.rstrip('\n')) 136 | except IndexError: 137 | break #Reached last index, stop processing 138 | 139 | #Account for each bin elem 140 | for i in bins: 141 | Gk[str(i)]+=1 142 | 143 | od = collections.OrderedDict(sorted(Gk.items())) 144 | Gklist = [] 145 | for i in od: 146 | Gklist.append(float(od[i])) 147 | Gklist = np.array(Gklist) 148 | 149 | dists.append(Gklist) 150 | f.close() 151 | print "End - Building distributions" 152 | 153 | #Build distance matrix 154 | Gk = {} 155 | bins=[] 156 | #Generate the set of all possible bins 157 | for i in range(0,1500, binWidth): 158 | Gk[str(i).replace(" ", "")] = 0 159 | 160 | #Generate distance matrix 161 | distance_matrix = [] 162 | for i in range(0,len(Gk)): 163 | line =[] 164 | for j in range(0,len(Gk)): 165 | if(i==j): 166 | line.append(0.0) 167 | else: 168 | line.append(1.0) 169 | 170 | distance_matrix.append(np.array(line)) 171 | distance_matrix = np.array(distance_matrix) 172 | 173 | return dists, distance_matrix 174 | 175 | 176 | def Classifier(toClassify, allSamples, baseSamples, distance_matrix, binWidth): 177 | emdResults = [] 178 | emdSum = 0 179 | ################################## 180 | #Read first element in combination 181 | ################################## 182 | Gk_corelist = toClassify 183 | 184 | for n, sample in enumerate(baseSamples): 185 | 186 | Gklist = sample 187 | ############################ 188 | ###### NORMALIZATION ####### 189 | ############################ 190 | ground1 = max(Gk_corelist) 191 | ground2 = max(Gklist) 192 | if(ground1 > ground2): 193 | MAX = ground1 194 | else: 195 | MAX = ground2 196 | 197 | if(max(np.cumsum(Gk_corelist)) > max(np.cumsum(Gklist))): 198 | cSum = max(np.cumsum(Gk_corelist)) 199 | else: 200 | cSum = max(np.cumsum(Gklist)) 201 | 202 | dtm = distance_matrix/cSum 203 | 204 | emdR = float(emd(Gk_corelist, Gklist, dtm)) 205 | emdSum += emdR 206 | emdResults.append(emdR) 207 | 208 | avgEMD = emdSum / len(emdResults) 209 | #print str(avgEMD) 210 | return avgEMD 211 | 212 | def 
plotEMD(sampleFolder, baselines, binWidth): 213 | regularSamples = GatherChatSamples(sampleFolder,baselines[:-1], binWidth) 214 | allSamples = GatherChatSamples(sampleFolder, baselines, binWidth) 215 | 216 | allSamplesDists, distance_matrix = GenerateDists(allSamples, binWidth) 217 | 218 | emdResults = [] 219 | for n, bs in enumerate(allSamplesDists): 220 | #print allSamples[n] 221 | emdResults.append(Classifier(bs, allSamples, allSamplesDists[:len(regularSamples)], distance_matrix, binWidth)) 222 | 223 | acc = float(0) 224 | for i in range(0,len(regularSamples)): 225 | acc += emdResults[i] 226 | acc = acc/len(regularSamples) 227 | print "AVG Regular " + str(acc) 228 | 229 | max_stat = ComputeRate(sampleFolder, emdResults, len(allSamples) - len(regularSamples), len(regularSamples), binWidth) 230 | print max_stat 231 | 232 | 233 | if __name__ == "__main__": 234 | 235 | sampleFolders = ["TrafficCaptures/"] 236 | 237 | if not os.path.exists('EMD'): 238 | os.makedirs('EMD') 239 | 240 | for sampleFolder in sampleFolders: 241 | for baselines in cfgs: 242 | print "===========================================" 243 | print "Analyzing " + baselines[0] + " - " + baselines[1] 244 | for binWidth in BIN_WIDTH: 245 | print "##############" 246 | print "BinWidth: " + str(binWidth) 247 | if not os.path.exists('EMD/' + sampleFolder + baselines[1]): 248 | os.makedirs('EMD/' + sampleFolder + baselines[1]) 249 | plotEMD(sampleFolder, baselines, binWidth) 250 | -------------------------------------------------------------------------------- /CovertCastAnalysis/xgboost_classifier.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import os 3 | import csv 4 | import numpy as np 5 | from scipy import interp 6 | import random 7 | from random import shuffle 8 | import math 9 | #Classifiers 10 | from xgboost import XGBClassifier 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.tree import DecisionTreeClassifier 13 | from sklearn.neighbors import KNeighborsClassifier 14 | #Eval Metrics 15 | from sklearn.model_selection import train_test_split, KFold 16 | from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc 17 | from sklearn.model_selection import cross_val_score 18 | 19 | np.random.seed(1) 20 | random.seed(1) 21 | 22 | 23 | def gatherHoldoutData(data_folder, cfg): 24 | SPLIT_FACTOR = 0.7 25 | #Load Datasets 26 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 27 | reader = csv.reader(f, delimiter=',') 28 | reg = list(reader) 29 | 30 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 31 | reader = csv.reader(f, delimiter=',') 32 | fac = list(reader) 33 | print "###########################################" 34 | print "Configuration " + cfg[1] 35 | print "###########################################" 36 | 37 | #Convert data to floats (and labels to integers) 38 | reg_data = [] 39 | for i in reg[1:]: 40 | int_array = [] 41 | for pl in i[:-1]: 42 | int_array.append(float(pl)) 43 | int_array.append(0) 44 | reg_data.append(int_array) 45 | 46 | fac_data = [] 47 | for i in fac[1:]: 48 | int_array = [] 49 | for pl in i[:-1]: 50 | int_array.append(float(pl)) 51 | int_array.append(1) 52 | fac_data.append(int_array) 53 | 54 | 55 | #Shuffle both datasets 56 | shuffled_reg_data = random.sample(reg_data, len(reg_data)) 57 | shuffled_fac_data = random.sample(fac_data, len(fac_data)) 58 | 59 | #Build label tensors 60 | reg_labels = [] 61 | for i in shuffled_reg_data: 62 | 
reg_labels.append(int(i[len(reg_data[0])-1])) 63 | 64 | fac_labels = [] 65 | for i in shuffled_fac_data: 66 | fac_labels.append(int(i[len(reg_data[0])-1])) 67 | 68 | #Take label out of data tensors 69 | for i in range(0, len(shuffled_reg_data)): 70 | shuffled_reg_data[i].pop() 71 | 72 | for i in range(0, len(shuffled_fac_data)): 73 | shuffled_fac_data[i].pop() 74 | 75 | 76 | #Build training and testing datasets 77 | #Split each class data in the appropriate proportion for training 78 | reg_proportion_index = int(len(reg_labels)* SPLIT_FACTOR) 79 | reg_train_x = shuffled_reg_data[:reg_proportion_index] 80 | reg_train_y = reg_labels[:reg_proportion_index] 81 | 82 | fac_proportion_index = int(len(fac_labels)*SPLIT_FACTOR) 83 | fac_train_x = shuffled_fac_data[:fac_proportion_index] 84 | fac_train_y = fac_labels[:fac_proportion_index] 85 | 86 | #Create training sets by combining the randomly selected samples from each class 87 | train_x = reg_train_x + fac_train_x 88 | train_y = reg_train_y + fac_train_y 89 | 90 | #Make the split for the testing data 91 | reg_test_x = shuffled_reg_data[reg_proportion_index:] 92 | reg_test_y = reg_labels[reg_proportion_index:] 93 | 94 | fac_test_x = shuffled_fac_data[fac_proportion_index:] 95 | fac_test_y = fac_labels[fac_proportion_index:] 96 | 97 | #Create testing set by combining the holdout samples 98 | test_x = reg_test_x + fac_test_x 99 | test_y = reg_test_y + fac_test_y 100 | 101 | return train_x, train_y, test_x, test_y 102 | 103 | def gatherAllData(data_folder, cfg): 104 | #Load Datasets 105 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 106 | reader = csv.reader(f, delimiter=',') 107 | reg = list(reader) 108 | 109 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 110 | reader = csv.reader(f, delimiter=',') 111 | fac = list(reader) 112 | print "###########################################" 113 | print "Configuration " + cfg[1] 114 | print "###########################################" 115 | 116 | #Convert data to floats (and labels to integers) 117 | reg_data = [] 118 | for i in reg[1:]: 119 | int_array = [] 120 | for pl in i[:-1]: 121 | int_array.append(float(pl)) 122 | int_array.append(0) 123 | reg_data.append(int_array) 124 | 125 | fac_data = [] 126 | for i in fac[1:]: 127 | int_array = [] 128 | for pl in i[:-1]: 129 | int_array.append(float(pl)) 130 | int_array.append(1) 131 | fac_data.append(int_array) 132 | 133 | 134 | #Shuffle both datasets 135 | shuffled_reg_data = random.sample(reg_data, len(reg_data)) 136 | shuffled_fac_data = random.sample(fac_data, len(fac_data)) 137 | 138 | #Build label tensors 139 | reg_labels = [] 140 | for i in shuffled_reg_data: 141 | reg_labels.append(int(i[len(reg_data[0])-1])) 142 | 143 | fac_labels = [] 144 | for i in shuffled_fac_data: 145 | fac_labels.append(int(i[len(reg_data[0])-1])) 146 | 147 | #Take label out of data tensors 148 | for i in range(0, len(shuffled_reg_data)): 149 | shuffled_reg_data[i].pop() 150 | 151 | for i in range(0, len(shuffled_fac_data)): 152 | shuffled_fac_data[i].pop() 153 | 154 | #Create training sets by combining the randomly selected samples from each class 155 | train_x = shuffled_reg_data + shuffled_fac_data 156 | train_y = reg_labels + fac_labels 157 | 158 | #Shuffle positive/negative samples for CV purposes 159 | x_shuf = [] 160 | y_shuf = [] 161 | index_shuf = range(len(train_x)) 162 | shuffle(index_shuf) 163 | for i in index_shuf: 164 | x_shuf.append(train_x[i]) 165 | y_shuf.append(train_y[i]) 166 | 167 | return x_shuf, y_shuf 168 | 169 | 170 | def 
runXGBoost(data_folder, cfg): 171 | #Gather the dataset 172 | print "Gather dataset" 173 | train_x, train_y, test_x, test_y = gatherHoldoutData(data_folder, cfg) 174 | 175 | 176 | param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'} 177 | param['nthread'] = 4 178 | param['eval_metric'] = 'auc' 179 | 180 | 181 | model = XGBClassifier() 182 | model.fit(np.asarray(train_x), np.asarray(train_y)) 183 | 184 | y_pred = model.predict(np.asarray(test_x)) 185 | predictions = [round(value) for value in y_pred] 186 | 187 | # evaluate predictions 188 | accuracy = accuracy_score(np.asarray(test_y), predictions) 189 | print("Accuracy: %.2f%%" % (accuracy * 100.0)) 190 | 191 | y_pred = model.predict_proba(np.asarray(test_x))[:,1] 192 | print 'Area under ROC:', roc_auc_score(np.asarray(test_y),y_pred) 193 | 194 | 195 | def runClassification_CV(data_folder,cfg,classifier): 196 | print "Gather dataset" 197 | train_x, train_y= gatherAllData(data_folder, cfg) 198 | 199 | model = classifier[0] 200 | clf_name = classifier[1] 201 | 202 | #Report Cross-Validation Accuracy 203 | scores = cross_val_score(model, np.asarray(train_x), np.asarray(train_y), cv=10) 204 | print clf_name 205 | print "Avg. Accuracy: " + str(sum(scores)/float(len(scores))) 206 | 207 | cv = KFold(n_splits=10) 208 | tprs = [] 209 | aucs = [] 210 | mean_fpr = np.linspace(0, 1, 100) 211 | 212 | 213 | #Split the data in k-folds, perform classification, and report ROC 214 | i = 0 215 | for train, test in cv.split(train_x, train_y): 216 | probas_ = model.fit(np.asarray(train_x)[train], np.asarray(train_y)[train]).predict_proba(np.asarray(train_x)[test]) 217 | 218 | # Compute ROC curve and area under the curve 219 | fpr, tpr, thresholds = roc_curve(np.asarray(train_y)[test], probas_[:, 1]) 220 | tprs.append(interp(mean_fpr, fpr, tpr)) 221 | tprs[-1][0] = 0.0 222 | roc_auc = auc(fpr, tpr) 223 | aucs.append(roc_auc) 224 | #plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc)) 225 | i += 1 226 | 227 | plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Random Guess', alpha=.8) 228 | 229 | 230 | mean_tpr = np.mean(tprs, axis=0) 231 | mean_tpr[-1] = 1.0 232 | mean_auc = auc(mean_fpr, mean_tpr) 233 | 234 | unblock70 = True 235 | unblock80 = True 236 | unblock90 = True 237 | unblock95 = True 238 | for n, i in enumerate(mean_tpr): 239 | if(i >= 0.7 and unblock70): 240 | print '70% TPR = ' + str(mean_fpr[n]) 241 | unblock70 = False 242 | if(i >= 0.8 and unblock80): 243 | print '80% TPR = ' + str(mean_fpr[n]) 244 | unblock80 = False 245 | if(i >= 0.9 and unblock90): 246 | print '90% TPR = ' + str(mean_fpr[n]) 247 | unblock90 = False 248 | if(i >= 0.95 and unblock95): 249 | print '95% TPR = ' + str(mean_fpr[n]) 250 | unblock95 = False 251 | 252 | #Figure properties 253 | fig = plt.figure() 254 | ax1 = fig.add_subplot(111) 255 | 256 | std_auc = np.std(aucs) 257 | plt.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.3f)' % (mean_auc, std_auc), lw=2, alpha=.8) 258 | 259 | #Compute Standard Deviation between folds 260 | std_tpr = np.std(tprs, axis=0) 261 | tprs_upper = np.minimum(mean_tpr + std_tpr, 1) 262 | tprs_lower = np.maximum(mean_tpr - std_tpr, 0) 263 | plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.3, label=r'$\pm$ ROC Std. 
Dev.') 264 | 265 | 266 | 267 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 268 | ax1.grid(color='black', linestyle='dotted') 269 | 270 | plt.title('Receiver Operating Characteristic (ROC)') 271 | plt.xlabel('False Positive Rate', fontsize='x-large') 272 | plt.ylabel('True Positive Rate', fontsize='x-large') 273 | plt.legend(loc='lower right', fontsize='large') 274 | 275 | plt.setp(ax1.get_xticklabels(), fontsize=14) 276 | plt.setp(ax1.get_yticklabels(), fontsize=14) 277 | 278 | fig.savefig('xgBoost/' + "ROC_" + clf_name + "_" + cfg[1] + ".pdf") # save the figure to file 279 | plt.close(fig) 280 | 281 | if __name__ == "__main__": 282 | data_folder = 'TrafficCaptures/' 283 | 284 | cfgs = [ 285 | ["YouTube_home_world_live", 286 | "CovertCast_home_world"]] 287 | 288 | 289 | classifiers = [ 290 | [RandomForestClassifier(n_estimators=100, max_features=None), "RandomForest"], 291 | [DecisionTreeClassifier(), "Decision Tree"], 292 | [XGBClassifier(),"XGBoost"] 293 | ] 294 | 295 | 296 | if not os.path.exists('xgBoost'): 297 | os.makedirs('xgBoost') 298 | 299 | for cfg in cfgs: 300 | for classifier in classifiers: 301 | print "Running classifiers for " + cfg[0] + " and " + cfg[1] 302 | runClassification_CV(data_folder, cfg, classifier) 303 | -------------------------------------------------------------------------------- /DeltaShaperAnalysis/EMD_classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import dpkt 3 | import subprocess 4 | import socket 5 | import os 6 | from random import randint 7 | import math 8 | from itertools import product 9 | import datetime 10 | from matplotlib import pyplot as plt 11 | from matplotlib.pyplot import cm 12 | from matplotlib.ticker import MultipleLocator, FormatStrFormatter 13 | import numpy as np 14 | from pyemd import emd 15 | import collections 16 | 17 | BIN_WIDTH = [50] #20,50,100 18 | folder = "auxFolder/" 19 | 20 | cfgs = [ 21 | ["RegularTraffic", 22 | "DeltaShaperTraffic_320"], 23 | ["RegularTraffic", 24 | "DeltaShaperTraffic_160"]] 25 | 26 | 27 | def GatherChatSamples(sampleFolder, baselines, binWidth): 28 | Samples = [] 29 | for baseline in baselines: 30 | for cap in os.listdir(folder + sampleFolder + baseline): 31 | Samples.append(folder + sampleFolder + baseline + "/" + cap + "/" + 'packetCount_' + str(binWidth)) 32 | return Samples 33 | 34 | def ComputeRate(sampleFolder, emdResults, num_irregular_samples, num_regular_samples, binWidth): 35 | deltas = np.arange(0.001, 1, 0.001) 36 | 37 | Sensitivity = [] 38 | Specificity = [] 39 | 40 | holding90 = True 41 | holding80 = True 42 | holding70 = True 43 | 44 | thresh90 = 0 45 | thresh80 = 0 46 | thresh70 = 0 47 | 48 | val90 = 0 49 | val80 = 0 50 | val70 = 0 51 | 52 | max_acc = 0 53 | max_delta = 0 54 | max_tpr = 0 55 | max_tnr = 0 56 | max_fpr = 0 57 | 58 | found_conservative_threshold = False 59 | conservative_acc = 0 60 | conservative_delta = 0 61 | conservative_tnr = 0 62 | 63 | accuracy = 0 64 | for delta in deltas: 65 | FPositives = 0 66 | FNegatives = 0 67 | TPositives = 0 68 | TNegatives = 0 69 | 70 | 71 | #Positives are DS samples classified as DS 72 | for i, capEMD in enumerate(emdResults): 73 | if(capEMD > delta and i < num_regular_samples): # Regular baselines 74 | FPositives += 1 75 | if (capEMD < delta and i < num_regular_samples): 76 | TNegatives += 1 77 | if(capEMD <= delta and i >= num_regular_samples): #Irregular baseline 78 | FNegatives += 1 79 | if(capEMD > delta and i >= 
num_regular_samples): 80 | TPositives += 1 81 | """ 82 | #NEGATED 83 | for i, capEMD in enumerate(emdResults): 84 | if(capEMD > delta and i < num_regular_samples): # Regular baselines 85 | TNegatives += 1 86 | if (capEMD < delta and i < num_regular_samples): 87 | FPositives += 1 88 | if(capEMD <= delta and i >= num_regular_samples): #Irregular baseline 89 | TPositives += 1 90 | if(capEMD > delta and i >= num_regular_samples): 91 | FNegatives += 1 92 | """ 93 | Sensitivity.append(TPositives/(TPositives+float(FNegatives))) 94 | Specificity.append(TNegatives/(TNegatives+float(FPositives))) 95 | 96 | accuracy = (TPositives + TNegatives)/float(num_irregular_samples + num_regular_samples) 97 | TNR = TNegatives/(TNegatives+float(FPositives)) 98 | FNR = FNegatives/(TPositives+float(FNegatives)) 99 | TPR = TPositives/(TPositives+float(FNegatives)) 100 | FPR = FPositives/(FPositives+float(TNegatives)) 101 | #print delta 102 | #print TNegatives/(TNegatives+float(FPositives)) 103 | 104 | if(holding90): 105 | if(FNR >= 0.1): 106 | holding90 = False 107 | thresh90 = delta 108 | val90 = FPR 109 | 110 | if(holding80): 111 | if(FNR >= 0.2): 112 | holding80 = False 113 | thresh80 = delta 114 | val80 = FPR 115 | 116 | if(holding70): 117 | if(FNR >= 0.3): 118 | holding70 = False 119 | thresh70 = delta 120 | val70 = FPR 121 | 122 | #Conservative threshold - the delta where all legitimate videos are classified as such 123 | if(TNegatives/(TNegatives+float(FPositives)) >= 1 and not found_conservative_threshold): 124 | conservative_acc = accuracy 125 | conservative_delta = delta 126 | conservative_tpr = TPositives/(TPositives+float(FNegatives)) 127 | found_conservative_threshold = True 128 | 129 | if(accuracy > max_acc): 130 | max_acc = accuracy 131 | max_delta = delta 132 | max_tpr = TPositives/(TPositives+float(FNegatives)) 133 | max_tnr = TNegatives/(TNegatives+float(FPositives)) 134 | max_fpr = 1 - max_tnr 135 | 136 | fig = plt.figure() 137 | ax1 = fig.add_subplot(111) 138 | 139 | print "TPR90 = " + str(val90) 140 | print "TPR80 = " + str(val80) 141 | print "TPR70 = " + str(val70) 142 | 143 | print "AUC" 144 | auc = np.trapz(np.array(Sensitivity)[::-1], (1-np.array(Specificity))[::-1]) 145 | print auc 146 | #ROC Curve 147 | np.save('EMD/' + sampleFolder + baselines[1] + '/Rate_' + str(binWidth) + '_Sensitivity', np.array(Sensitivity)) 148 | np.save('EMD/' + sampleFolder + baselines[1] + '/Rate_' + str(binWidth) + '_Specificity', np.array(Specificity)) 149 | 150 | ax1.plot(1 - np.array(Specificity), np.array(Sensitivity), 'k.-', color='black', label = 'ROC (AUC = %0.2f)' % (auc)) 151 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 152 | ax1.grid(color='black', linestyle='dotted') 153 | 154 | plt.title('Receiver Operating Characteristic (ROC)') 155 | plt.xlabel('False Positive Rate', fontsize='x-large') 156 | plt.ylabel('True Positive Rate', fontsize='x-large') 157 | plt.legend(loc='lower right', fontsize='large') 158 | 159 | plt.setp(ax1.get_xticklabels(), fontsize=14) 160 | plt.setp(ax1.get_yticklabels(), fontsize=14) 161 | 162 | max_stats = "Max acc: " + str(max_acc) + " Max TPR:" + str(max_tpr) + " Max TNR:" + str(max_tnr) + " Max FPR:" + str(max_fpr) + " delta:" + str(max_delta) 163 | 164 | fig.savefig('EMD/' + sampleFolder + baselines[1] + '/Rate_' + str(binWidth) + '.pdf') # save the figure to file 165 | plt.close(fig) 166 | 167 | conservative_stats = "Con. acc: " + str(conservative_acc) + " Con. 
TPR:" + str(conservative_tnr) + " delta:" + str(conservative_delta) 168 | print conservative_stats 169 | return max_stats 170 | 171 | 172 | def GenerateDists(samples, binWidth): 173 | dists = [] 174 | print "Building distributions" 175 | 176 | for sample in samples: 177 | #print sample 178 | f = open(sample, 'r') 179 | 180 | Gk = {} 181 | bins=[] 182 | #Generate the set of all possible bins 183 | for i in range(0,1500, binWidth): 184 | Gk[str(i).replace(" ", "")] = 0 185 | 186 | 187 | lines = f.readlines() 188 | for line in lines: 189 | try: 190 | bins.append(line.rstrip('\n')) 191 | except IndexError: 192 | break #Reached last index, stop processing 193 | 194 | #Account for each bin elem 195 | for i in bins: 196 | Gk[str(i)]+=1 197 | 198 | od = collections.OrderedDict(sorted(Gk.items())) 199 | Gklist = [] 200 | for i in od: 201 | Gklist.append(float(od[i])) 202 | Gklist = np.array(Gklist) 203 | 204 | dists.append(Gklist) 205 | f.close() 206 | print "End - Building distributions" 207 | 208 | #Build distance matrix 209 | Gk = {} 210 | bins=[] 211 | #Generate the set of all possible bins 212 | for i in range(0,1500, binWidth): 213 | Gk[str(i).replace(" ", "")] = 0 214 | 215 | #Generate distance matrix 216 | distance_matrix = [] 217 | for i in range(0,len(Gk)): 218 | line =[] 219 | for j in range(0,len(Gk)): 220 | if(i==j): 221 | line.append(0.0) 222 | else: 223 | line.append(1.0) 224 | 225 | distance_matrix.append(np.array(line)) 226 | distance_matrix = np.array(distance_matrix) 227 | 228 | return dists, distance_matrix 229 | 230 | 231 | def Classifier(toClassify, allSamples, baseSamples, distance_matrix, binWidth): 232 | emdResults = [] 233 | emdSum = 0 234 | ################################## 235 | #Read first element in combination 236 | ################################## 237 | Gk_corelist = toClassify 238 | 239 | for n, sample in enumerate(baseSamples): 240 | 241 | Gklist = sample 242 | ############################ 243 | ###### NORMALIZATION ####### 244 | ############################ 245 | ground1 = max(Gk_corelist) 246 | ground2 = max(Gklist) 247 | if(ground1 > ground2): 248 | MAX = ground1 249 | else: 250 | MAX = ground2 251 | 252 | if(max(np.cumsum(Gk_corelist)) > max(np.cumsum(Gklist))): 253 | cSum = max(np.cumsum(Gk_corelist)) 254 | else: 255 | cSum = max(np.cumsum(Gklist)) 256 | 257 | dtm = distance_matrix/cSum 258 | 259 | emdR = float(emd(Gk_corelist, Gklist, dtm)) 260 | emdSum += emdR 261 | emdResults.append(emdR) 262 | 263 | avgEMD = emdSum / len(emdResults) 264 | #print str(avgEMD) 265 | return avgEMD 266 | 267 | def plotEMD(sampleFolder, baselines, binWidth): 268 | regularSamples = GatherChatSamples(sampleFolder,baselines[:-1], binWidth) 269 | allSamples = GatherChatSamples(sampleFolder, baselines, binWidth) 270 | 271 | allSamplesDists, distance_matrix = GenerateDists(allSamples, binWidth) 272 | 273 | emdResults = [] 274 | for n, bs in enumerate(allSamplesDists): 275 | #print allSamples[n] 276 | emdResults.append(Classifier(bs, allSamples, allSamplesDists[:len(regularSamples)], distance_matrix, binWidth)) 277 | 278 | acc = float(0) 279 | for i in range(0,len(regularSamples)): 280 | acc += emdResults[i] 281 | acc = acc/len(regularSamples) 282 | print "AVG Regular " + str(acc) 283 | 284 | max_stat = ComputeRate(sampleFolder, emdResults, len(allSamples) - len(regularSamples), len(regularSamples), binWidth) 285 | print max_stat 286 | 287 | fig = plt.figure() 288 | ax1 = fig.add_subplot(111) 289 | 290 | means = [np.mean(x) for x in emdResults] 291 | 
plt.scatter(range(1,len(emdResults)+1), means) 292 | 293 | 294 | minor_ticks = np.arange(0, len(emdResults)+1, 1) 295 | ax1.set_xticks(minor_ticks, minor=True) 296 | majorLocator = MultipleLocator(5) 297 | majorFormatter = FormatStrFormatter('%d') 298 | ax1.xaxis.set_major_locator(majorLocator) 299 | ax1.xaxis.set_major_formatter(majorFormatter) 300 | 301 | for label in (ax1.get_xticklabels() + ax1.get_yticklabels()): 302 | label.set_fontsize(14) 303 | 304 | start, end = ax1.get_xlim() 305 | ax1.yaxis.set_ticks(np.arange(start, end, 0.025)) 306 | ax1.yaxis.set_major_formatter(FormatStrFormatter('%0.3f')) 307 | plt.xlim(xmin=0, xmax=len(allSamples)) 308 | plt.ylim(ymin=0,ymax=0.5) 309 | 310 | plt.setp(ax1.get_xticklabels(), fontsize=14) 311 | plt.setp(ax1.get_yticklabels(), fontsize=14) 312 | plt.title(max_stat, fontsize='xx-small') 313 | plt.xlabel('Stream i', fontsize='x-large') 314 | plt.ylabel('EMD Cost', fontsize='x-large') 315 | fig.savefig('EMD/' + sampleFolder + baselines[1] + '/EMD_' + str(binWidth) + '.pdf') # save the figure to file 316 | plt.close(fig) 317 | 318 | 319 | if __name__ == "__main__": 320 | 321 | sampleFolders = ["TrafficCaptures/480Resolution/"] 322 | 323 | if not os.path.exists('EMD'): 324 | os.makedirs('EMD') 325 | 326 | for sampleFolder in sampleFolders: 327 | for baselines in cfgs: 328 | print "===========================================" 329 | print "Analyzing " + baselines[0] + " - " + baselines[1] 330 | for binWidth in BIN_WIDTH: 331 | print "##############" 332 | print "BinWidth: " + str(binWidth) 333 | if not os.path.exists('EMD/' + sampleFolder + baselines[1]): 334 | os.makedirs('EMD/' + sampleFolder + baselines[1]) 335 | plotEMD(sampleFolder, baselines, binWidth) 336 | -------------------------------------------------------------------------------- /FacetAnalysis/EMD_classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import dpkt 3 | import subprocess 4 | import socket 5 | import os 6 | from random import randint 7 | import math 8 | from itertools import product 9 | import time 10 | import datetime 11 | from matplotlib import pyplot as plt 12 | from matplotlib.pyplot import cm 13 | from matplotlib.ticker import MultipleLocator, FormatStrFormatter 14 | import numpy as np 15 | from pyemd import emd 16 | import collections 17 | 18 | BIN_WIDTH = [50] 19 | folder = "auxFolder/" 20 | 21 | cfgs = [ 22 | ["RegularTraffic_Christmas", 23 | "FacetTraffic_12.5_Christmas"], 24 | ["RegularTraffic_Christmas", 25 | "FacetTraffic_25_Christmas"], 26 | ["RegularTraffic_Christmas", 27 | "FacetTraffic_50_Christmas"]] 28 | 29 | 30 | 31 | def GatherChatSamples(sampleFolder, baselines, binWidth): 32 | Samples = [] 33 | for baseline in baselines: 34 | for cap in os.listdir(folder + sampleFolder + baseline): 35 | Samples.append(folder + sampleFolder + baseline + "/" + cap + "/" + 'packetCount_' + str(binWidth)) 36 | return Samples 37 | 38 | def ComputeRate(sampleFolder, emdResults, num_irregular_samples, num_regular_samples, binWidth): 39 | deltas = np.arange(0.001, 0.5, 0.001) 40 | 41 | Sensitivity = [] 42 | Specificity = [] 43 | 44 | holding90 = True 45 | holding80 = True 46 | holding70 = True 47 | 48 | thresh90 = 0 49 | thresh80 = 0 50 | thresh70 = 0 51 | 52 | val90 = 0 53 | val80 = 0 54 | val70 = 0 55 | 56 | max_acc = 0 57 | max_delta = 0 58 | max_tpr = 0 59 | max_tnr = 0 60 | max_fpr = 0 61 | 62 | accuracy = 0 63 | for delta in deltas: 64 | FPositives = 0 65 | FNegatives = 0 66 | TPositives = 0 67 | 
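#Note: unlike the DeltaShaper ComputeRate above, this Facet variant disables the direct
#labelling block (left inside the triple-quoted string below) and runs the #NEGATED
#block instead: streams whose average EMD to the regular baselines is <= delta are
#counted as Facet positives, and regular streams above delta count as true negatives.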
TNegatives = 0 68 | 69 | """ 70 | #Positives are Facet samples classified as Facet 71 | for i, capEMD in enumerate(emdResults): 72 | if(capEMD > delta and i < num_regular_samples): # Regular baselines 73 | FPositives += 1 74 | if (capEMD < delta and i < num_regular_samples): 75 | TNegatives += 1 76 | if(capEMD <= delta and i >= num_regular_samples): #Irregular baseline 77 | FNegatives += 1 78 | if(capEMD > delta and i >= num_regular_samples): 79 | TPositives += 1 80 | """ 81 | #NEGATED 82 | for i, capEMD in enumerate(emdResults): 83 | if(capEMD > delta and i < num_regular_samples): # Regular baselines 84 | TNegatives += 1 85 | if (capEMD < delta and i < num_regular_samples): 86 | FPositives += 1 87 | if(capEMD <= delta and i >= num_regular_samples): #Irregular baseline 88 | TPositives += 1 89 | if(capEMD > delta and i >= num_regular_samples): 90 | FNegatives += 1 91 | 92 | Sensitivity.append(TPositives/(TPositives+float(FNegatives))) 93 | Specificity.append(TNegatives/(TNegatives+float(FPositives))) 94 | 95 | accuracy = (TPositives + TNegatives)/float(num_irregular_samples + num_regular_samples) 96 | TNR = TNegatives/(TNegatives+float(FPositives)) 97 | FNR = FNegatives/(TPositives+float(FNegatives)) 98 | TPR = TPositives/(TPositives+float(FNegatives)) 99 | FPR = FPositives/(FPositives+float(TNegatives)) 100 | 101 | if(holding90): 102 | if(FPR >= 0.1): 103 | holding90 = False 104 | thresh90 = delta 105 | val90 = FNR 106 | 107 | if(holding80): 108 | if(FPR >= 0.2): 109 | holding80 = False 110 | thresh80 = delta 111 | val80 = FNR 112 | 113 | if(holding70): 114 | if(FPR >= 0.3): 115 | holding70 = False 116 | thresh70 = delta 117 | val70 = FNR 118 | 119 | if(accuracy > max_acc): 120 | max_acc = accuracy 121 | max_delta = delta 122 | max_tpr = TPositives/(TPositives+float(FNegatives)) 123 | max_tnr = TNegatives/(TNegatives+float(FPositives)) 124 | max_fpr = 1 - max_tnr 125 | 126 | fig = plt.figure() 127 | ax1 = fig.add_subplot(111) 128 | 129 | print "TPR90 = " + str(val90) 130 | print "TPR80 = " + str(val80) 131 | print "TPR70 = " + str(val70) 132 | 133 | print "AUC" 134 | auc = np.trapz(np.array(Sensitivity), 1 - np.array(Specificity)) 135 | print auc 136 | #ROC Curve 137 | 138 | np.save('EMD/' + sampleFolder + baselines[1] + '/Rate_' + str(binWidth) + '_Sensitivity', np.array(Sensitivity)) 139 | np.save('EMD/' + sampleFolder + baselines[1] + '/Rate_' + str(binWidth) + '_Specificity', np.array(Specificity)) 140 | ax1.plot(1 - np.array(Specificity), np.array(Sensitivity), 'k.-', color='black', label = 'ROC (AUC = %0.2f)' % (auc)) 141 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 142 | ax1.grid(color='black', linestyle='dotted') 143 | 144 | plt.title('Receiver Operating Characteristic (ROC)') 145 | plt.xlabel('False Positive Rate', fontsize='x-large') 146 | plt.ylabel('True Positive Rate', fontsize='x-large') 147 | plt.legend(loc='lower right', fontsize='large') 148 | 149 | plt.setp(ax1.get_xticklabels(), fontsize=14) 150 | plt.setp(ax1.get_yticklabels(), fontsize=14) 151 | 152 | max_stats = "Max acc: " + str(max_acc) + " Max TPR:" + str(max_tpr) + " Max TNR:" + str(max_tnr) + " Max FPR:" + str(max_fpr) + " delta:" + str(max_delta) 153 | 154 | fig.savefig('EMD/' + sampleFolder + baselines[1] + '/Rate_' + str(binWidth) + '.pdf') # save the figure to file 155 | plt.close(fig) 156 | 157 | return max_stats 158 | 159 | 160 | def GenerateDists(samples, binWidth): 161 | dists = [] 162 | print "Building distributions" 163 | 164 | for sample in samples: 165 | #print 
sample 166 | f = open(sample, 'r') 167 | 168 | Gk = {} 169 | bins=[] 170 | #Generate the set of all possible bins 171 | for i in range(0,1500, binWidth): 172 | Gk[str(i).replace(" ", "")] = 0 173 | 174 | 175 | lines = f.readlines() 176 | for line in lines: 177 | try: 178 | bins.append(line.rstrip('\n')) 179 | except IndexError: 180 | break #Reached last index, stop processing 181 | 182 | #Account for each bin elem 183 | for i in bins: 184 | Gk[str(i)]+=1 185 | 186 | od = collections.OrderedDict(sorted(Gk.items())) 187 | Gklist = [] 188 | for i in od: 189 | Gklist.append(float(od[i])) 190 | Gklist = np.array(Gklist) 191 | 192 | dists.append(Gklist) 193 | f.close() 194 | print "End - Building distributions" 195 | 196 | #Build distance matrix 197 | Gk = {} 198 | bins=[] 199 | #Generate the set of all possible bins 200 | for i in range(0,1500, binWidth): 201 | Gk[str(i).replace(" ", "")] = 0 202 | 203 | #Generate distance matrix 204 | distance_matrix = [] 205 | for i in range(0,len(Gk)): 206 | line =[] 207 | for j in range(0,len(Gk)): 208 | if(i==j): 209 | line.append(0.0) 210 | else: 211 | line.append(1.0) 212 | 213 | distance_matrix.append(np.array(line)) 214 | distance_matrix = np.array(distance_matrix) 215 | 216 | return dists, distance_matrix 217 | 218 | 219 | def Classifier(toClassify, allSamples, baseSamples, distance_matrix, binWidth): 220 | emdResults = [] 221 | emdSum = 0 222 | ################################## 223 | #Read first element in combination 224 | ################################## 225 | Gk_corelist = toClassify 226 | 227 | times = [] 228 | #start_time = time.time() 229 | for n, sample in enumerate(baseSamples): 230 | 231 | Gklist = sample 232 | ############################ 233 | ###### NORMALIZATION ####### 234 | ############################ 235 | ground1 = max(Gk_corelist) 236 | ground2 = max(Gklist) 237 | if(ground1 > ground2): 238 | MAX = ground1 239 | else: 240 | MAX = ground2 241 | 242 | if(max(np.cumsum(Gk_corelist)) > max(np.cumsum(Gklist))): 243 | cSum = max(np.cumsum(Gk_corelist)) 244 | else: 245 | cSum = max(np.cumsum(Gklist)) 246 | 247 | dtm = distance_matrix/cSum 248 | 249 | #start_time = time.time() 250 | emdR = float(emd(Gk_corelist, Gklist, dtm)) 251 | #end_time = time.time() 252 | #times.append(end_time - start_time) 253 | emdSum += emdR 254 | emdResults.append(emdR) 255 | #end_time = time.time() 256 | #print "Sample classification time: " + "{0:.5f}".format(end_time - start_time) 257 | #print "Avg. 
EMD time: " + "{0:.5f}".format(np.mean(times,axis=0)) 258 | avgEMD = emdSum / len(emdResults) 259 | #print str(avgEMD) 260 | return avgEMD 261 | 262 | def plotEMD(sampleFolder, baselines, binWidth): 263 | regularSamples = GatherChatSamples(sampleFolder,baselines[:-1], binWidth) 264 | allSamples = GatherChatSamples(sampleFolder, baselines, binWidth) 265 | 266 | allSamplesDists, distance_matrix = GenerateDists(allSamples, binWidth) 267 | 268 | start_time = time.time() 269 | emdResults = [] 270 | times = [] 271 | for n, bs in enumerate(allSamplesDists): 272 | #print allSamples[n] 273 | start_sample_time = time.time() 274 | emdResults.append(Classifier(bs, allSamples, allSamplesDists[:len(regularSamples)], distance_matrix, binWidth)) 275 | end_sample_time = time.time() 276 | times.append(end_sample_time - start_sample_time) 277 | print "Avg Sample Classification: " + "{0:.5f}".format(np.mean(times,axis=0)) 278 | end_time = time.time() 279 | print "Time Elapsed: " + "{0:.5f}".format(end_time - start_time) 280 | 281 | acc = float(0) 282 | for i in range(0,len(regularSamples)): 283 | acc += emdResults[i] 284 | acc = acc/len(regularSamples) 285 | print "AVG Regular " + str(acc) 286 | 287 | max_stat = ComputeRate(sampleFolder, emdResults, len(allSamples) - len(regularSamples), len(regularSamples), binWidth) 288 | print max_stat 289 | """ 290 | fig = plt.figure() 291 | ax1 = fig.add_subplot(111) 292 | 293 | means = [np.mean(x) for x in emdResults] 294 | plt.scatter(range(1,len(emdResults)+1), means) 295 | 296 | 297 | minor_ticks = np.arange(0, len(emdResults)+1, 1) 298 | ax1.set_xticks(minor_ticks, minor=True) 299 | majorLocator = MultipleLocator(5) 300 | majorFormatter = FormatStrFormatter('%d') 301 | ax1.xaxis.set_major_locator(majorLocator) 302 | ax1.xaxis.set_major_formatter(majorFormatter) 303 | 304 | for label in (ax1.get_xticklabels() + ax1.get_yticklabels()): 305 | label.set_fontsize(14) 306 | 307 | start, end = ax1.get_xlim() 308 | ax1.yaxis.set_ticks(np.arange(start, end, 0.025)) 309 | ax1.yaxis.set_major_formatter(FormatStrFormatter('%0.3f')) 310 | plt.xlim(xmin=0, xmax=len(allSamples)) 311 | plt.ylim(ymin=0,ymax=0.5) 312 | 313 | plt.setp(ax1.get_xticklabels(), fontsize=14) 314 | plt.setp(ax1.get_yticklabels(), fontsize=14) 315 | plt.title(max_stat, fontsize='xx-small') 316 | plt.xlabel('Stream i', fontsize='x-large') 317 | plt.ylabel('EMD Cost', fontsize='x-large') 318 | fig.savefig('EMD/' + sampleFolder + baselines[1] + '/EMD_' + str(binWidth) + '.pdf') # save the figure to file 319 | plt.close(fig) 320 | """ 321 | 322 | if __name__ == "__main__": 323 | 324 | sampleFolders = ["TrafficCaptures/240Resolution/"] 325 | 326 | if not os.path.exists('EMD'): 327 | os.makedirs('EMD') 328 | 329 | for sampleFolder in sampleFolders: 330 | for baselines in cfgs: 331 | print "===========================================" 332 | print "Analyzing " + baselines[0] + " - " + baselines[1] 333 | for binWidth in BIN_WIDTH: 334 | print "##############" 335 | print "BinWidth: " + str(binWidth) 336 | if not os.path.exists('EMD/' + sampleFolder + baselines[1]): 337 | os.makedirs('EMD/' + sampleFolder + baselines[1]) 338 | plotEMD(sampleFolder, baselines, binWidth) 339 | -------------------------------------------------------------------------------- /DeltaShaperAnalysis/xgboost_classifier.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import os 3 | import csv 4 | import numpy as np 5 | from scipy import interp 6 | import random 7 | from 
random import shuffle 8 | import math 9 | import time 10 | #Classifiers 11 | from xgboost import XGBClassifier 12 | from sklearn.ensemble import RandomForestClassifier 13 | from sklearn.tree import DecisionTreeClassifier 14 | from sklearn.neighbors import KNeighborsClassifier 15 | #Eval Metrics 16 | from sklearn.model_selection import train_test_split, KFold 17 | from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc 18 | from sklearn.model_selection import cross_val_score 19 | 20 | np.random.seed(1) 21 | random.seed(1) 22 | 23 | 24 | def gatherHoldoutData(data_folder, cfg): 25 | SPLIT_FACTOR = 0.7 26 | #Load Datasets 27 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 28 | reader = csv.reader(f, delimiter=',') 29 | reg = list(reader) 30 | 31 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 32 | reader = csv.reader(f, delimiter=',') 33 | fac = list(reader) 34 | print "###########################################" 35 | print "Configuration " + cfg[1] 36 | print "###########################################" 37 | 38 | #Convert data to floats (and labels to integers) 39 | features_id = reg[0] 40 | reg_data = [] 41 | for i in reg[1:]: 42 | int_array = [] 43 | for pl in i[:-1]: 44 | int_array.append(float(pl)) 45 | int_array.append(0) 46 | reg_data.append(int_array) 47 | 48 | fac_data = [] 49 | for i in fac[1:]: 50 | int_array = [] 51 | for pl in i[:-1]: 52 | int_array.append(float(pl)) 53 | int_array.append(1) 54 | fac_data.append(int_array) 55 | 56 | 57 | #Shuffle both datasets 58 | shuffled_reg_data = random.sample(reg_data, len(reg_data)) 59 | shuffled_fac_data = random.sample(fac_data, len(fac_data)) 60 | 61 | #Build label tensors 62 | reg_labels = [] 63 | for i in shuffled_reg_data: 64 | reg_labels.append(int(i[len(reg_data[0])-1])) 65 | 66 | fac_labels = [] 67 | for i in shuffled_fac_data: 68 | fac_labels.append(int(i[len(reg_data[0])-1])) 69 | 70 | #Take label out of data tensors 71 | for i in range(0, len(shuffled_reg_data)): 72 | shuffled_reg_data[i].pop() 73 | 74 | for i in range(0, len(shuffled_fac_data)): 75 | shuffled_fac_data[i].pop() 76 | 77 | 78 | #Build training and testing datasets 79 | #Split each class data in the appropriate proportion for training 80 | reg_proportion_index = int(len(reg_labels)* SPLIT_FACTOR) 81 | reg_train_x = shuffled_reg_data[:reg_proportion_index] 82 | reg_train_y = reg_labels[:reg_proportion_index] 83 | 84 | fac_proportion_index = int(len(fac_labels)*SPLIT_FACTOR) 85 | fac_train_x = shuffled_fac_data[:fac_proportion_index] 86 | fac_train_y = fac_labels[:fac_proportion_index] 87 | 88 | #Create training sets by combining the randomly selected samples from each class 89 | train_x = reg_train_x + fac_train_x 90 | train_y = reg_train_y + fac_train_y 91 | 92 | #Make the split for the testing data 93 | reg_test_x = shuffled_reg_data[reg_proportion_index:] 94 | reg_test_y = reg_labels[reg_proportion_index:] 95 | 96 | fac_test_x = shuffled_fac_data[fac_proportion_index:] 97 | fac_test_y = fac_labels[fac_proportion_index:] 98 | 99 | #Create testing set by combining the holdout samples 100 | test_x = reg_test_x + fac_test_x 101 | test_y = reg_test_y + fac_test_y 102 | 103 | return train_x, train_y, test_x, test_y 104 | 105 | def gatherAllData(data_folder, cfg, dataset_fraction): 106 | #Load Datasets 107 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 108 | reader = csv.reader(f, delimiter=',') 109 | reg = list(reader) 110 | reg = reg[:int(dataset_fraction*len(reg))] 111 | 112 | 113 | f = open(data_folder + cfg[1] + 
"_dataset.csv", 'r') 114 | reader = csv.reader(f, delimiter=',') 115 | fac = list(reader) 116 | fac = fac[:int(dataset_fraction*len(fac))] 117 | 118 | print "###########################################" 119 | print "Configuration " + cfg[1] 120 | print "###########################################" 121 | 122 | #Convert data to floats (and labels to integers) 123 | features_id = reg[0] 124 | reg_data = [] 125 | for i in reg[1:]: 126 | int_array = [] 127 | for pl in i[:-1]: 128 | int_array.append(float(pl)) 129 | int_array.append(0) 130 | reg_data.append(int_array) 131 | 132 | fac_data = [] 133 | for i in fac[1:]: 134 | int_array = [] 135 | for pl in i[:-1]: 136 | int_array.append(float(pl)) 137 | int_array.append(1) 138 | fac_data.append(int_array) 139 | 140 | 141 | #Shuffle both datasets 142 | shuffled_reg_data = random.sample(reg_data, len(reg_data)) 143 | shuffled_fac_data = random.sample(fac_data, len(fac_data)) 144 | 145 | #Build label tensors 146 | reg_labels = [] 147 | for i in shuffled_reg_data: 148 | reg_labels.append(int(i[len(reg_data[0])-1])) 149 | 150 | fac_labels = [] 151 | for i in shuffled_fac_data: 152 | fac_labels.append(int(i[len(reg_data[0])-1])) 153 | 154 | #Take label out of data tensors 155 | for i in range(0, len(shuffled_reg_data)): 156 | shuffled_reg_data[i].pop() 157 | 158 | for i in range(0, len(shuffled_fac_data)): 159 | shuffled_fac_data[i].pop() 160 | 161 | #Create training sets by combining the randomly selected samples from each class 162 | train_x = shuffled_reg_data + shuffled_fac_data 163 | train_y = reg_labels + fac_labels 164 | 165 | #Shuffle positive/negative samples for CV purposes 166 | x_shuf = [] 167 | y_shuf = [] 168 | index_shuf = range(len(train_x)) 169 | shuffle(index_shuf) 170 | for i in index_shuf: 171 | x_shuf.append(train_x[i]) 172 | y_shuf.append(train_y[i]) 173 | 174 | return x_shuf, y_shuf, features_id 175 | 176 | 177 | def runXGBoost(data_folder, cfg): 178 | #Gather the dataset 179 | print "Gather dataset" 180 | train_x, train_y, test_x, test_y = gatherHoldoutData(data_folder, cfg) 181 | 182 | 183 | param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'} 184 | param['nthread'] = 4 185 | param['eval_metric'] = 'auc' 186 | 187 | 188 | model = XGBClassifier() 189 | model.fit(np.asarray(train_x), np.asarray(train_y)) 190 | 191 | y_pred = model.predict(np.asarray(test_x)) 192 | predictions = [round(value) for value in y_pred] 193 | 194 | # evaluate predictions 195 | accuracy = accuracy_score(np.asarray(test_y), predictions) 196 | print("Accuracy: %.2f%%" % (accuracy * 100.0)) 197 | 198 | y_pred = model.predict_proba(np.asarray(test_x))[:,1] 199 | print 'Area under ROC:', roc_auc_score(np.asarray(test_y),y_pred) 200 | 201 | 202 | def runClassification_CV(data_folder, feature_set, cfg,classifier): 203 | print "Gather dataset" 204 | dataset_fraction = 1.0 205 | train_x, train_y, features_id = gatherAllData(data_folder, cfg, dataset_fraction) 206 | 207 | model = classifier[0] 208 | clf_name = classifier[1] 209 | 210 | #Report Cross-Validation Accuracy 211 | #scores = cross_val_score(model, np.asarray(train_x), np.asarray(train_y), cv=10) 212 | print clf_name 213 | #print "Avg. 
Accuracy: " + str(sum(scores)/float(len(scores))) 214 | 215 | cv = KFold(n_splits=10) 216 | tprs = [] 217 | aucs = [] 218 | mean_fpr = np.linspace(0, 1, 100) 219 | train_times = [] 220 | test_times = [] 221 | importances = [] 222 | 223 | #Split the data in k-folds, perform classification, and report ROC 224 | i = 0 225 | for train, test in cv.split(train_x, train_y): 226 | 227 | start_train = time.time() 228 | model = model.fit(np.asarray(train_x)[train], np.asarray(train_y)[train]) 229 | end_train = time.time() 230 | train_times.append(end_train - start_train) 231 | 232 | start_test = time.time() 233 | probas_ = model.predict_proba(np.asarray(train_x)[test]) 234 | end_test = time.time() 235 | test_times.append(end_test - start_test) 236 | 237 | """ 238 | c=[] 239 | for value in np.asarray(train_x)[test]: 240 | a = np.reshape(value,(1, -1)) 241 | c.append(a) 242 | 243 | load = [] 244 | for v in c: 245 | start_test = time.time() 246 | a = model.predict_proba(v) 247 | end_test = time.time() 248 | load.append(end_test - start_test) 249 | print "Individual prediction avg: " + "{0:.5f}".format(np.mean(load)) 250 | """ 251 | fpr, tpr, thresholds = roc_curve(np.asarray(train_y)[test], probas_[:, 1]) 252 | tprs.append(interp(mean_fpr, fpr, tpr)) 253 | tprs[-1][0] = 0.0 254 | roc_auc = auc(fpr, tpr) 255 | aucs.append(roc_auc) 256 | 257 | #Check feature importance in this fold 258 | f_imp = model.feature_importances_ 259 | importances.append(f_imp) 260 | #plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc)) 261 | i += 1 262 | 263 | plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Random Guess', alpha=.8) 264 | 265 | 266 | mean_tpr = np.mean(tprs, axis=0) 267 | mean_tpr[-1] = 1.0 268 | mean_auc = auc(mean_fpr, mean_tpr) 269 | print "Model AUC: " + "{0:.3f}".format(mean_auc) 270 | print "Training time (Avg. fold): " + str(np.mean(train_times, axis=0)) 271 | print "Test time (Avg. fold): " + str(np.mean(test_times, axis=0)) 272 | 273 | unblock70 = True 274 | unblock80 = True 275 | unblock90 = True 276 | unblock95 = True 277 | for n, i in enumerate(mean_tpr): 278 | if(i >= 0.7 and unblock70): 279 | print '70% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 280 | unblock70 = False 281 | if(i >= 0.8 and unblock80): 282 | print '80% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 283 | unblock80 = False 284 | if(i >= 0.9 and unblock90): 285 | print '90% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 286 | unblock90 = False 287 | if(i >= 0.95 and unblock95): 288 | print '95% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 289 | unblock95 = False 290 | 291 | #Figure properties 292 | fig = plt.figure() 293 | ax1 = fig.add_subplot(111) 294 | 295 | std_auc = np.std(aucs) 296 | np.save('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + "_Sensitivity", np.array(mean_tpr)) 297 | np.save('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + "_Specificity", np.array(mean_fpr)) 298 | 299 | plt.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.3f)' % (mean_auc, std_auc), lw=2, alpha=.8) 300 | 301 | #Compute Standard Deviation between folds 302 | std_tpr = np.std(tprs, axis=0) 303 | tprs_upper = np.minimum(mean_tpr + std_tpr, 1) 304 | tprs_lower = np.maximum(mean_tpr - std_tpr, 0) 305 | plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.3, label=r'$\pm$ ROC Std. 
Dev.') 306 | 307 | 308 | 309 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 310 | ax1.grid(color='black', linestyle='dotted') 311 | 312 | plt.title('Receiver Operating Characteristic (ROC)') 313 | plt.xlabel('False Positive Rate', fontsize='x-large') 314 | plt.ylabel('True Positive Rate', fontsize='x-large') 315 | plt.legend(loc='lower right', fontsize='large') 316 | 317 | plt.setp(ax1.get_xticklabels(), fontsize=14) 318 | plt.setp(ax1.get_yticklabels(), fontsize=14) 319 | 320 | #fig.savefig('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + ".pdf") # save the figure to file 321 | plt.close(fig) 322 | 323 | mean_importances = [] 324 | for n in range(0,len(importances[0])): 325 | mean_imp = (importances[0][n] + importances[1][n] + importances[2][n] + importances[3][n] + importances[4][n] + importances[5][n] + importances[6][n] + importances[7][n] + importances[8][n] + importances[9][n])/10.0 326 | mean_importances.append(mean_imp) 327 | f_imp = zip(mean_importances,features_id) 328 | f_imp.sort(key = lambda t: t[0], reverse=True) 329 | 330 | np.save('xgBoost/' + feature_set + "/FeatureImportance_" + clf_name + "_" + cfg[1], np.array(f_imp)) 331 | 332 | #for f in f_imp[:20]: 333 | # print "Importance: %f, Feature: %s" % (f[0], f[1]) 334 | 335 | 336 | def runClassification_adhocCV(data_folder,feature_set, cfg,classifier): 337 | print "Gather dataset" 338 | dataset_fraction = 1.0 339 | train_x, train_y, features_id = gatherAllData(data_folder, cfg, dataset_fraction) 340 | 341 | model = classifier[0] 342 | clf_name = classifier[1] 343 | 344 | #Report Cross-Validation Accuracy 345 | #scores = cross_val_score(model, np.asarray(train_x), np.asarray(train_y), cv=10) 346 | print clf_name 347 | #print "Avg. Accuracy: " + str(sum(scores)/float(len(scores))) 348 | 349 | cv = KFold(n_splits=10) 350 | tprs = [] 351 | aucs = [] 352 | mean_fpr = np.linspace(0, 1, 100) 353 | train_times = [] 354 | test_times = [] 355 | importances = [] 356 | 357 | #Split the data in k-folds, perform classification, and report ROC 358 | 359 | for i in range(0,10): 360 | X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.9) 361 | start_train = time.time() 362 | model = model.fit(np.asarray(X_train), np.asarray(y_train)) 363 | end_train = time.time() 364 | train_times.append(end_train - start_train) 365 | 366 | start_test = time.time() 367 | probas_ = model.predict_proba(np.asarray(X_test)) 368 | end_test = time.time() 369 | test_times.append(end_test - start_test) 370 | 371 | # Compute ROC curve and area under the curve 372 | fpr, tpr, thresholds = roc_curve(np.asarray(y_test), probas_[:, 1], pos_label=1) 373 | #print "Accuracy " + str(accuracy_score(np.asarray(train_y)[test], probas_[:, 1])) 374 | tprs.append(interp(mean_fpr, fpr, tpr)) 375 | tprs[-1][0] = 0.0 376 | roc_auc = auc(fpr, tpr) 377 | aucs.append(roc_auc) 378 | 379 | #Check feature importance in this fold 380 | f_imp = model.feature_importances_ 381 | importances.append(f_imp) 382 | 383 | 384 | plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Random Guess', alpha=.8) 385 | 386 | 387 | mean_tpr = np.mean(tprs, axis=0) 388 | mean_tpr[-1] = 1.0 389 | mean_auc = auc(mean_fpr, mean_tpr) 390 | print "Model AUC: " + "{0:.3f}".format(mean_auc) 391 | print "Training time (Avg. fold): " + str(np.mean(train_times, axis=0)) 392 | print "Test time (Avg. 
fold): " + str(np.mean(test_times, axis=0)) 393 | 394 | 395 | unblock70 = True 396 | unblock80 = True 397 | unblock90 = True 398 | unblock95 = True 399 | for n, i in enumerate(mean_tpr): 400 | if(i >= 0.7 and unblock70): 401 | print '70% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 402 | unblock70 = False 403 | if(i >= 0.8 and unblock80): 404 | print '80% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 405 | unblock80 = False 406 | if(i >= 0.9 and unblock90): 407 | print '90% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 408 | unblock90 = False 409 | if(i >= 0.95 and unblock95): 410 | print '95% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 411 | unblock95 = False 412 | 413 | #Figure properties 414 | fig = plt.figure() 415 | ax1 = fig.add_subplot(111) 416 | 417 | std_auc = np.std(aucs) 418 | 419 | #np.save('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + "_Sensitivity", np.array(mean_tpr)) 420 | #np.save('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + "_Specificity", np.array(mean_fpr)) 421 | plt.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.3f)' % (mean_auc, std_auc), lw=2, alpha=.8) 422 | 423 | #Compute Standard Deviation between folds 424 | std_tpr = np.std(tprs, axis=0) 425 | tprs_upper = np.minimum(mean_tpr + std_tpr, 1) 426 | tprs_lower = np.maximum(mean_tpr - std_tpr, 0) 427 | plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.3, label=r'$\pm$ ROC Std. Dev.') 428 | 429 | 430 | 431 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 432 | ax1.grid(color='black', linestyle='dotted') 433 | 434 | plt.title('Receiver Operating Characteristic (ROC)') 435 | plt.xlabel('False Positive Rate', fontsize='x-large') 436 | plt.ylabel('True Positive Rate', fontsize='x-large') 437 | plt.legend(loc='lower right', fontsize='large') 438 | 439 | plt.setp(ax1.get_xticklabels(), fontsize=14) 440 | plt.setp(ax1.get_yticklabels(), fontsize=14) 441 | 442 | #fig.savefig('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + ".pdf") # save the figure to file 443 | plt.close(fig) 444 | 445 | #Compute mean importance of feature accross CV folds 446 | bin_number = list(range(len(train_x[0]))) 447 | mean_importances = [] 448 | for n in range(0,len(importances[0])): 449 | mean_imp = (importances[0][n] + importances[1][n] + importances[2][n] + importances[3][n] + importances[4][n] + importances[5][n] + importances[6][n] + importances[7][n] + importances[8][n] + importances[9][n])/10.0 450 | mean_importances.append(mean_imp) 451 | #print mean_importances 452 | f_imp = zip(bin_number,mean_importances,features_id) 453 | f_imp.sort(key = lambda t: t[1], reverse=True) 454 | 455 | #np.save('xgBoost/' + feature_set + "/FeatureImportance_" + clf_name + "_" + cfg[1], np.array(f_imp)) 456 | 457 | #for f in f_imp[:20]: 458 | # print "Importance: %f, Feature: %s" % (f[1], f[2]) 459 | 460 | 461 | if __name__ == "__main__": 462 | 463 | cfgs = [ 464 | ["RegularTraffic", 465 | "DeltaShaperTraffic_320"], 466 | ["RegularTraffic", 467 | "DeltaShaperTraffic_160"]] 468 | 469 | if not os.path.exists('xgBoost'): 470 | os.makedirs('xgBoost') 471 | 472 | classifiers = [ 473 | [DecisionTreeClassifier(), "DecisionTree"], 474 | [RandomForestClassifier(n_estimators=100, max_features='auto',n_jobs=1), "RandomForest"], 475 | [XGBClassifier(),"XGBoost"] 476 | ] 477 | 478 | feature_set = 'Stats_60' 479 | data_folder = 'FeatureSets/' + feature_set + '/' 480 | if not os.path.exists('xgBoost/' + feature_set): 481 | os.makedirs('xgBoost/' + feature_set) 482 | 483 
| print "\n=================================================" 484 | print "One-class SVM - Summary Statistic Features - Set1" 485 | print "=================================================" 486 | for cfg in cfgs: 487 | for classifier in classifiers: 488 | print "Running classifiers for " + cfg[0] + " and " + cfg[1] 489 | runClassification_CV(data_folder, feature_set, cfg, classifier) 490 | print "#####################################\n" 491 | 492 | 493 | feature_set = 'PL_60' 494 | data_folder = 'FeatureSets/' + feature_set + '/' 495 | if not os.path.exists('xgBoost/' + feature_set): 496 | os.makedirs('xgBoost/' + feature_set) 497 | 498 | print "\n=================================================" 499 | print "One-class SVM - Packet Length Features - Set2" 500 | print "=================================================" 501 | for cfg in cfgs: 502 | for classifier in classifiers: 503 | print "Running classifiers for " + cfg[0] + " and " + cfg[1] 504 | runClassification_CV(data_folder, feature_set, cfg, classifier) 505 | 506 | 507 | 508 | 509 | 510 | -------------------------------------------------------------------------------- /FacetAnalysis/xgboost_classifier.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import os 3 | import csv 4 | import numpy as np 5 | from scipy import interp 6 | import random 7 | from random import shuffle 8 | import math 9 | import time 10 | 11 | import sklearn 12 | from sklearn import preprocessing 13 | from sklearn.model_selection import ParameterGrid 14 | #Classifiers 15 | from xgboost import XGBClassifier 16 | import xgboost as xgb 17 | from sklearn.ensemble import RandomForestClassifier 18 | from sklearn.tree import DecisionTreeClassifier 19 | from sklearn.svm import SVC 20 | #Eval Metrics 21 | import sys 22 | from sklearn.model_selection import train_test_split, KFold 23 | from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc 24 | from sklearn.model_selection import cross_val_score 25 | from sklearn.decomposition import PCA 26 | from sklearn.model_selection import GridSearchCV 27 | from sklearn.metrics import classification_report 28 | 29 | sklearn.set_config(assume_finite=True) 30 | np.random.seed(1) 31 | random.seed(1) 32 | 33 | 34 | def gatherHoldoutData(data_folder, cfg): 35 | SPLIT_FACTOR = 0.7 36 | #Load Datasets 37 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 38 | reader = csv.reader(f, delimiter=',') 39 | reg = list(reader) 40 | 41 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 42 | reader = csv.reader(f, delimiter=',') 43 | fac = list(reader) 44 | print "###########################################" 45 | print "Configuration " + cfg[1] 46 | print "###########################################" 47 | 48 | #Convert data to floats (and labels to integers) 49 | reg_data = [] 50 | 51 | for i in reg[1:]: 52 | int_array = [] 53 | for pl in i[:-1]: 54 | int_array.append(float(pl)) 55 | int_array.append(0) 56 | reg_data.append(int_array) 57 | 58 | fac_data = [] 59 | for i in fac[1:]: 60 | int_array = [] 61 | for pl in i[:-1]: 62 | int_array.append(float(pl)) 63 | int_array.append(1) 64 | fac_data.append(int_array) 65 | 66 | 67 | #Shuffle both datasets 68 | shuffled_reg_data = random.sample(reg_data, len(reg_data)) 69 | shuffled_fac_data = random.sample(fac_data, len(fac_data)) 70 | 71 | #Build label tensors 72 | reg_labels = [] 73 | for i in shuffled_reg_data: 74 | reg_labels.append(int(i[len(reg_data[0])-1])) 75 | 76 | fac_labels = [] 77 | for i in 
shuffled_fac_data: 78 | fac_labels.append(int(i[len(reg_data[0])-1])) 79 | 80 | #Take label out of data tensors 81 | for i in range(0, len(shuffled_reg_data)): 82 | shuffled_reg_data[i].pop() 83 | 84 | for i in range(0, len(shuffled_fac_data)): 85 | shuffled_fac_data[i].pop() 86 | 87 | 88 | #Build training and testing datasets 89 | #Split each class data in the appropriate proportion for training 90 | reg_proportion_index = int(len(reg_labels)* SPLIT_FACTOR) 91 | reg_train_x = shuffled_reg_data[:reg_proportion_index] 92 | reg_train_y = reg_labels[:reg_proportion_index] 93 | 94 | fac_proportion_index = int(len(fac_labels)*SPLIT_FACTOR) 95 | fac_train_x = shuffled_fac_data[:fac_proportion_index] 96 | fac_train_y = fac_labels[:fac_proportion_index] 97 | 98 | #Create training sets by combining the randomly selected samples from each class 99 | train_x = reg_train_x + fac_train_x 100 | train_y = reg_train_y + fac_train_y 101 | 102 | #Make the split for the testing data 103 | reg_test_x = shuffled_reg_data[reg_proportion_index:] 104 | reg_test_y = reg_labels[reg_proportion_index:] 105 | 106 | fac_test_x = shuffled_fac_data[fac_proportion_index:] 107 | fac_test_y = fac_labels[fac_proportion_index:] 108 | 109 | #Create testing set by combining the holdout samples 110 | test_x = reg_test_x + fac_test_x 111 | test_y = reg_test_y + fac_test_y 112 | 113 | return train_x, train_y, test_x, test_y 114 | 115 | def gatherAllData(data_folder, cfg, dataset_fraction): 116 | #Load Datasets 117 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 118 | reader = csv.reader(f, delimiter=',') 119 | reg = list(reader) 120 | reg = reg[:int(dataset_fraction*len(reg))] 121 | #print sys.getsizeof(reg) 122 | #print sys.getsizeof(reg[0]) 123 | #print sys.getsizeof(reg[1]) 124 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 125 | reader = csv.reader(f, delimiter=',') 126 | fac = list(reader) 127 | fac = fac[:int(dataset_fraction*len(fac))] 128 | 129 | #print "Size regular dataset: " + str(len(reg)) 130 | #print "Size censored dataset: " + str(len(fac)) 131 | print "###########################################" 132 | print "Configuration " + cfg[1] 133 | print "###########################################" 134 | 135 | #Convert data to floats (and labels to integers) 136 | features_id = reg[0] 137 | reg_data = [] 138 | for i in reg[1:]: 139 | int_array = [] 140 | for pl in i[:-1]: 141 | int_array.append(float(pl)) 142 | int_array.append(0) 143 | reg_data.append(int_array) 144 | 145 | fac_data = [] 146 | for i in fac[1:]: 147 | int_array = [] 148 | for pl in i[:-1]: 149 | int_array.append(float(pl)) 150 | int_array.append(1) 151 | fac_data.append(int_array) 152 | 153 | 154 | #Shuffle both datasets 155 | shuffled_reg_data = random.sample(reg_data, len(reg_data)) 156 | shuffled_fac_data = random.sample(fac_data, len(fac_data)) 157 | #shuffled_reg_data = shuffled_reg_data[:int(dataset_fraction*len(shuffled_reg_data))] 158 | #shuffled_fac_data = shuffled_fac_data[:int(dataset_fraction*len(shuffled_fac_data))] 159 | #Build label tensors 160 | reg_labels = [] 161 | for i in shuffled_reg_data: 162 | reg_labels.append(int(i[len(reg_data[0])-1])) 163 | 164 | fac_labels = [] 165 | for i in shuffled_fac_data: 166 | fac_labels.append(int(i[len(reg_data[0])-1])) 167 | 168 | #Take label out of data tensors 169 | for i in range(0, len(shuffled_reg_data)): 170 | shuffled_reg_data[i].pop() 171 | 172 | for i in range(0, len(shuffled_fac_data)): 173 | shuffled_fac_data[i].pop() 174 | 175 | #Create training sets by combining the randomly 
selected samples from each class 176 | train_x = shuffled_reg_data + shuffled_fac_data 177 | train_y = reg_labels + fac_labels 178 | 179 | #Shuffle positive/negative samples for CV purposes 180 | x_shuf = [] 181 | y_shuf = [] 182 | index_shuf = range(len(train_x)) 183 | shuffle(index_shuf) 184 | for i in index_shuf: 185 | x_shuf.append(train_x[i]) 186 | y_shuf.append(train_y[i]) 187 | 188 | return x_shuf, y_shuf, features_id 189 | 190 | 191 | def runClassification_CV(data_folder,feature_set, cfg,classifier): 192 | print "Gather dataset" 193 | dataset_fraction = 1.0 194 | train_x, train_y, features_id = gatherAllData(data_folder, cfg, dataset_fraction) 195 | 196 | model = classifier[0] 197 | clf_name = classifier[1] 198 | 199 | #Report Cross-Validation Accuracy 200 | #scores = cross_val_score(model, np.asarray(train_x), np.asarray(train_y), cv=10) 201 | print clf_name 202 | #print "Avg. Accuracy: " + str(sum(scores)/float(len(scores))) 203 | 204 | cv = KFold(n_splits=10) 205 | tprs = [] 206 | aucs = [] 207 | mean_fpr = np.linspace(0, 1, 100) 208 | train_times = [] 209 | test_times = [] 210 | importances = [] 211 | 212 | #Split the data in k-folds, perform classification, and report ROC 213 | i = 0 214 | for train, test in cv.split(train_x, train_y): 215 | 216 | start_train = time.time() 217 | model = model.fit(np.asarray(train_x)[train], np.asarray(train_y)[train]) 218 | end_train = time.time() 219 | train_times.append(end_train - start_train) 220 | 221 | start_test = time.time() 222 | probas_ = model.predict_proba(np.asarray(train_x)[test]) 223 | end_test = time.time() 224 | test_times.append(end_test - start_test) 225 | 226 | """ 227 | #For time benchmarking 228 | c=[] 229 | for value in np.asarray(train_x)[test]: 230 | a = np.reshape(value,(1, -1)) 231 | c.append(a) 232 | 233 | load = [] 234 | for v in c: 235 | start_test = time.time() 236 | a = model.predict_proba(v) 237 | end_test = time.time() 238 | load.append(end_test - start_test) 239 | print "Individual prediction avg: " + "{0:.5f}".format(np.mean(load)) 240 | """ 241 | 242 | # Compute ROC curve and area under the curve 243 | fpr, tpr, thresholds = roc_curve(np.asarray(train_y)[test], probas_[:, 1], pos_label=1) 244 | #print "Accuracy " + str(accuracy_score(np.asarray(train_y)[test], probas_[:, 1])) 245 | tprs.append(interp(mean_fpr, fpr, tpr)) 246 | tprs[-1][0] = 0.0 247 | roc_auc = auc(fpr, tpr) 248 | aucs.append(roc_auc) 249 | 250 | #Check feature importance in this fold 251 | f_imp = model.feature_importances_ 252 | importances.append(f_imp) 253 | #plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc)) 254 | i += 1 255 | 256 | plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Random Guess', alpha=.8) 257 | 258 | 259 | mean_tpr = np.mean(tprs, axis=0) 260 | mean_tpr[-1] = 1.0 261 | mean_auc = auc(mean_fpr, mean_tpr) 262 | print "Model AUC: " + "{0:.3f}".format(mean_auc) 263 | print "Training time (Avg. fold): " + str(np.mean(train_times, axis=0)) 264 | print "Test time (Avg. 
fold): " + str(np.mean(test_times, axis=0)) 265 | 266 | 267 | unblock70 = True 268 | unblock80 = True 269 | unblock90 = True 270 | unblock95 = True 271 | for n, i in enumerate(mean_tpr): 272 | if(i >= 0.7 and unblock70): 273 | print '70% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 274 | unblock70 = False 275 | if(i >= 0.8 and unblock80): 276 | print '80% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 277 | unblock80 = False 278 | if(i >= 0.9 and unblock90): 279 | print '90% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 280 | unblock90 = False 281 | if(i >= 0.95 and unblock95): 282 | print '95% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 283 | unblock95 = False 284 | 285 | #Figure properties 286 | fig = plt.figure() 287 | ax1 = fig.add_subplot(111) 288 | 289 | std_auc = np.std(aucs) 290 | 291 | np.save('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + "_Sensitivity", np.array(mean_tpr)) 292 | np.save('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + "_Specificity", np.array(mean_fpr)) 293 | plt.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.3f)' % (mean_auc, std_auc), lw=2, alpha=.8) 294 | 295 | #Compute Standard Deviation between folds 296 | std_tpr = np.std(tprs, axis=0) 297 | tprs_upper = np.minimum(mean_tpr + std_tpr, 1) 298 | tprs_lower = np.maximum(mean_tpr - std_tpr, 0) 299 | plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.3, label=r'$\pm$ ROC Std. Dev.') 300 | 301 | 302 | 303 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 304 | ax1.grid(color='black', linestyle='dotted') 305 | 306 | plt.title('Receiver Operating Characteristic (ROC)') 307 | plt.xlabel('False Positive Rate', fontsize='x-large') 308 | plt.ylabel('True Positive Rate', fontsize='x-large') 309 | plt.legend(loc='lower right', fontsize='large') 310 | 311 | plt.setp(ax1.get_xticklabels(), fontsize=14) 312 | plt.setp(ax1.get_yticklabels(), fontsize=14) 313 | 314 | fig.savefig('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + ".pdf") # save the figure to file 315 | plt.close(fig) 316 | 317 | #Compute mean importance of feature accross CV folds 318 | bin_number = list(range(len(train_x[0]))) 319 | mean_importances = [] 320 | for n in range(0,len(importances[0])): 321 | mean_imp = (importances[0][n] + importances[1][n] + importances[2][n] + importances[3][n] + importances[4][n] + importances[5][n] + importances[6][n] + importances[7][n] + importances[8][n] + importances[9][n])/10.0 322 | mean_importances.append(mean_imp) 323 | #print mean_importances 324 | f_imp = zip(bin_number,mean_importances,features_id) 325 | f_imp.sort(key = lambda t: t[1], reverse=True) 326 | 327 | np.save('xgBoost/' + feature_set + "/FeatureImportance_" + clf_name + "_" + cfg[1], np.array(f_imp)) 328 | 329 | 330 | 331 | def runClassification_adhocCV(data_folder,feature_set, cfg,classifier): 332 | print "Gather dataset" 333 | dataset_fraction = 1.0 334 | train_x, train_y, features_id = gatherAllData(data_folder, cfg, dataset_fraction) 335 | 336 | model = classifier[0] 337 | clf_name = classifier[1] 338 | 339 | #Report Cross-Validation Accuracy 340 | #scores = cross_val_score(model, np.asarray(train_x), np.asarray(train_y), cv=10) 341 | print clf_name 342 | #print "Avg. 
Accuracy: " + str(sum(scores)/float(len(scores))) 343 | 344 | cv = KFold(n_splits=10) 345 | tprs = [] 346 | aucs = [] 347 | mean_fpr = np.linspace(0, 1, 100) 348 | train_times = [] 349 | test_times = [] 350 | importances = [] 351 | 352 | #Split the data in k-folds, perform classification, and report ROC 353 | 354 | for i in range(0,10): 355 | X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.1) 356 | start_train = time.time() 357 | model = model.fit(np.asarray(X_train), np.asarray(y_train)) 358 | end_train = time.time() 359 | train_times.append(end_train - start_train) 360 | 361 | start_test = time.time() 362 | probas_ = model.predict_proba(np.asarray(X_test)) 363 | end_test = time.time() 364 | test_times.append(end_test - start_test) 365 | 366 | # Compute ROC curve and area under the curve 367 | fpr, tpr, thresholds = roc_curve(np.asarray(y_test), probas_[:, 1], pos_label=1) 368 | #print "Accuracy " + str(accuracy_score(np.asarray(train_y)[test], probas_[:, 1])) 369 | tprs.append(interp(mean_fpr, fpr, tpr)) 370 | tprs[-1][0] = 0.0 371 | roc_auc = auc(fpr, tpr) 372 | aucs.append(roc_auc) 373 | 374 | #Check feature importance in this fold 375 | f_imp = model.feature_importances_ 376 | importances.append(f_imp) 377 | 378 | 379 | plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Random Guess', alpha=.8) 380 | 381 | 382 | mean_tpr = np.mean(tprs, axis=0) 383 | mean_tpr[-1] = 1.0 384 | mean_auc = auc(mean_fpr, mean_tpr) 385 | print "Model AUC: " + "{0:.3f}".format(mean_auc) 386 | print "Training time (Avg. fold): " + str(np.mean(train_times, axis=0)) 387 | print "Test time (Avg. fold): " + str(np.mean(test_times, axis=0)) 388 | 389 | 390 | unblock70 = True 391 | unblock80 = True 392 | unblock90 = True 393 | unblock95 = True 394 | for n, i in enumerate(mean_tpr): 395 | if(i >= 0.7 and unblock70): 396 | print '70% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 397 | unblock70 = False 398 | if(i >= 0.8 and unblock80): 399 | print '80% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 400 | unblock80 = False 401 | if(i >= 0.9 and unblock90): 402 | print '90% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 403 | unblock90 = False 404 | if(i >= 0.95 and unblock95): 405 | print '95% TPR = ' + "{0:.3f}".format(mean_fpr[n]) 406 | unblock95 = False 407 | 408 | #Figure properties 409 | fig = plt.figure() 410 | ax1 = fig.add_subplot(111) 411 | 412 | std_auc = np.std(aucs) 413 | 414 | #np.save('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + "_Sensitivity", np.array(mean_tpr)) 415 | #np.save('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + "_Specificity", np.array(mean_fpr)) 416 | plt.plot(mean_fpr, mean_tpr, color='b', label=r'Mean ROC (AUC = %0.2f $\pm$ %0.3f)' % (mean_auc, std_auc), lw=2, alpha=.8) 417 | 418 | #Compute Standard Deviation between folds 419 | std_tpr = np.std(tprs, axis=0) 420 | tprs_upper = np.minimum(mean_tpr + std_tpr, 1) 421 | tprs_lower = np.maximum(mean_tpr - std_tpr, 0) 422 | plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.3, label=r'$\pm$ ROC Std. 
Dev.') 423 | 424 | 425 | 426 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 427 | ax1.grid(color='black', linestyle='dotted') 428 | 429 | plt.title('Receiver Operating Characteristic (ROC)') 430 | plt.xlabel('False Positive Rate', fontsize='x-large') 431 | plt.ylabel('True Positive Rate', fontsize='x-large') 432 | plt.legend(loc='lower right', fontsize='large') 433 | 434 | plt.setp(ax1.get_xticklabels(), fontsize=14) 435 | plt.setp(ax1.get_yticklabels(), fontsize=14) 436 | 437 | #fig.savefig('xgBoost/' + feature_set + "/ROC_" + clf_name + "_" + cfg[1] + ".pdf") # save the figure to file 438 | plt.close(fig) 439 | 440 | #Compute mean importance of feature accross CV folds 441 | bin_number = list(range(len(train_x[0]))) 442 | mean_importances = [] 443 | for n in range(0,len(importances[0])): 444 | mean_imp = (importances[0][n] + importances[1][n] + importances[2][n] + importances[3][n] + importances[4][n] + importances[5][n] + importances[6][n] + importances[7][n] + importances[8][n] + importances[9][n])/10.0 445 | mean_importances.append(mean_imp) 446 | #print mean_importances 447 | f_imp = zip(bin_number,mean_importances,features_id) 448 | f_imp.sort(key = lambda t: t[1], reverse=True) 449 | 450 | #np.save('xgBoost/' + feature_set + "/FeatureImportance_" + clf_name + "_" + cfg[1], np.array(f_imp)) 451 | 452 | #for f in f_imp[:20]: 453 | # print "Importance: %f, Feature: %s" % (f[1], f[2]) 454 | 455 | if __name__ == "__main__": 456 | cfgs = [ 457 | ["RegularTraffic_Christmas", 458 | "FacetTraffic_12.5_Christmas"], 459 | ["RegularTraffic_Christmas", 460 | "FacetTraffic_25_Christmas"], 461 | ["RegularTraffic_Christmas", 462 | "FacetTraffic_50_Christmas"]] 463 | 464 | if not os.path.exists('xgBoost'): 465 | os.makedirs('xgBoost') 466 | 467 | 468 | classifiers = [ 469 | [DecisionTreeClassifier(), "DecisionTree"], 470 | [RandomForestClassifier(n_estimators=100, max_features='auto',n_jobs=1), "RandomForest"], 471 | [XGBClassifier(),"XGBoost"] 472 | ] 473 | 474 | 475 | feature_set = 'Stats_60' #'Stats_60' / 'PL_60' 476 | data_folder = 'FeatureSets/' + feature_set + '/' 477 | if not os.path.exists('xgBoost/' + feature_set): 478 | os.makedirs('xgBoost/' + feature_set) 479 | 480 | print "\n=================================================" 481 | print "One-class SVM - Summary Statistic Features - Set1" 482 | print "=================================================" 483 | for cfg in cfgs: 484 | for classifier in classifiers: 485 | print "Running classifiers for " + cfg[0] + " and " + cfg[1] 486 | runClassification_CV(data_folder, feature_set, cfg, classifier) 487 | print "#####################################\n" 488 | 489 | 490 | feature_set = 'PL_60' #'Stats_60' / 'PL_60' 491 | data_folder = 'FeatureSets/' + feature_set + '/' 492 | if not os.path.exists('xgBoost/' + feature_set): 493 | os.makedirs('xgBoost/' + feature_set) 494 | 495 | print "\n=================================================" 496 | print "One-class SVM - Packet Length Features - Set2" 497 | print "=================================================" 498 | for cfg in cfgs: 499 | for classifier in classifiers: 500 | print "Running classifiers for " + cfg[0] + " and " + cfg[1] 501 | runClassification_CV(data_folder, feature_set, cfg, classifier) 502 | -------------------------------------------------------------------------------- /FacetAnalysis/autoencoder.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import dpkt 3 | import os 4 | import 
tensorflow as tf 5 | import csv 6 | import numpy as np 7 | import random 8 | import math 9 | from sklearn.metrics import roc_curve, auc 10 | from matplotlib import pyplot as plt 11 | from sklearn import preprocessing 12 | import time 13 | 14 | from copy import deepcopy 15 | from scipy import interp 16 | 17 | np.random.seed(1) 18 | graph_level_seed = 1 19 | operation_level_seed = 1 20 | tf.set_random_seed(graph_level_seed) 21 | random.seed(1) 22 | 23 | plt.rcParams['font.family'] = 'Helvetica' 24 | 25 | def gatherDataset_january(data_folder, cfg, SPLIT_FACTOR): 26 | random.seed(1) 27 | #Load Datasets 28 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 29 | reader = csv.reader(f, delimiter=',') 30 | reg = list(reader) 31 | 32 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 33 | reader = csv.reader(f, delimiter=',') 34 | fac = list(reader) 35 | print "###########################################" 36 | print "Configuration " + cfg[1] 37 | print "###########################################" 38 | 39 | #Convert data to floats (and labels to integers) 40 | reg_data = [] 41 | for i in reg[1:]: 42 | int_array = [] 43 | for pl in i[:-1]: 44 | int_array.append(float(pl)) 45 | int_array.append(1) 46 | reg_data.append(int_array) 47 | 48 | fac_data = [] 49 | for i in fac[1:]: 50 | int_array = [] 51 | for pl in i[:-1]: 52 | int_array.append(float(pl)) 53 | int_array.append(0) 54 | fac_data.append(int_array) 55 | 56 | 57 | #Shuffle both datasets 58 | shuffled_reg_data = random.sample(reg_data, len(reg_data)) 59 | shuffled_fac_data = random.sample(fac_data, len(fac_data)) 60 | 61 | #Build label tensors 62 | reg_labels = [] 63 | for i in shuffled_reg_data: 64 | reg_labels.append(int(i[len(reg_data[0])-1])) 65 | 66 | fac_labels = [] 67 | for i in shuffled_fac_data: 68 | fac_labels.append(int(i[len(reg_data[0])-1])) 69 | 70 | #Take label out of data tensors 71 | for i in range(0, len(shuffled_reg_data)): 72 | shuffled_reg_data[i].pop() 73 | 74 | for i in range(0, len(shuffled_fac_data)): 75 | shuffled_fac_data[i].pop() 76 | 77 | 78 | #Build training and testing datasets 79 | #Split each class data in the appropriate proportion for training 80 | reg_proportion_index = int(len(reg_labels)* SPLIT_FACTOR) 81 | reg_train_x = shuffled_reg_data[:reg_proportion_index] 82 | reg_train_y = reg_labels[:reg_proportion_index] 83 | 84 | fac_proportion_index = int(len(fac_labels)*SPLIT_FACTOR) 85 | fac_train_x = shuffled_fac_data[:fac_proportion_index] 86 | fac_train_y = fac_labels[:fac_proportion_index] 87 | 88 | #Create training sets by simply using normal samples 89 | train_x = reg_train_x #+ fac_train_x 90 | train_y = reg_train_y #+ fac_train_y 91 | 92 | #Make the split for the testing data 93 | reg_test_x = shuffled_reg_data[reg_proportion_index:] 94 | reg_test_y = reg_labels[reg_proportion_index:] 95 | 96 | fac_test_x = shuffled_fac_data[fac_proportion_index:] 97 | fac_test_y = fac_labels[fac_proportion_index:] 98 | 99 | #Create testing set by combining the holdout samples 100 | test_x = reg_test_x + fac_test_x 101 | test_y = reg_test_y + fac_test_y 102 | 103 | return train_x, train_y, test_x, test_y, len(reg_data[0]) 104 | 105 | def gatherDataset_10times(data_folder, cfg, split_factor): 106 | random.seed(1) 107 | SPLIT_FACTOR = split_factor 108 | #Load Datasets 109 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 110 | reader = csv.reader(f, delimiter=',') 111 | reg = list(reader) 112 | 113 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 114 | reader = csv.reader(f, delimiter=',') 115 | fac = 
list(reader) 116 | print "###########################################" 117 | print "Configuration " + cfg[1] 118 | print "###########################################" 119 | 120 | 121 | #Convert data to floats (and labels to integers) 122 | reg_data = [] 123 | for i in reg[1:]: 124 | int_array = [] 125 | for pl in i[:-1]: 126 | int_array.append(float(pl)) 127 | int_array.append(0) #0, inliers 128 | reg_data.append(int_array) 129 | 130 | fac_data = [] 131 | for i in fac[1:]: 132 | int_array = [] 133 | for pl in i[:-1]: 134 | int_array.append(float(pl)) 135 | int_array.append(1) #1, outliers 136 | fac_data.append(int_array) 137 | 138 | train_x_t = [] 139 | train_y_t = [] 140 | test_x_t = [] 141 | test_y_t = [] 142 | 143 | for k in range(0,10): 144 | reg_data2 = deepcopy(reg_data) 145 | fac_data2 = deepcopy(fac_data) 146 | 147 | 148 | #Shuffle both datasets 149 | shuffled_reg_data = random.sample(reg_data2, len(reg_data2)) 150 | shuffled_fac_data = random.sample(fac_data2, len(fac_data2)) 151 | 152 | #Build label tensors 153 | reg_labels = [] 154 | for i in shuffled_reg_data: 155 | reg_labels.append(int(i[len(reg_data2[0])-1])) 156 | 157 | fac_labels = [] 158 | for i in shuffled_fac_data: 159 | fac_labels.append(int(i[len(reg_data2[0])-1])) 160 | 161 | #Take label out of data tensors 162 | for i in range(0, len(shuffled_reg_data)): 163 | shuffled_reg_data[i].pop() 164 | 165 | for i in range(0, len(shuffled_fac_data)): 166 | shuffled_fac_data[i].pop() 167 | 168 | 169 | #Build training and testing datasets 170 | #Split each class data in the appropriate proportion for training 171 | reg_proportion_index = int(len(reg_labels)* SPLIT_FACTOR) 172 | reg_train_x = shuffled_reg_data[:reg_proportion_index] 173 | reg_train_y = reg_labels[:reg_proportion_index] 174 | 175 | fac_proportion_index = int(len(fac_labels)*SPLIT_FACTOR) 176 | fac_train_x = shuffled_fac_data[:fac_proportion_index] 177 | fac_train_y = fac_labels[:fac_proportion_index] 178 | 179 | #Create training sets by combining the randomly selected samples from each class 180 | train_x = reg_train_x 181 | train_y = reg_train_y 182 | 183 | #Make the split for the testing data 184 | reg_test_x = shuffled_reg_data[reg_proportion_index:] 185 | reg_test_y = reg_labels[reg_proportion_index:] 186 | fac_test_x = shuffled_fac_data[fac_proportion_index:] 187 | fac_test_y = fac_labels[fac_proportion_index:] 188 | 189 | #Create testing set by combining the holdout samples 190 | test_x = reg_test_x + fac_test_x 191 | test_y = reg_test_y + fac_test_y 192 | 193 | train_x_t.append(train_x) 194 | train_y_t.append(train_y) 195 | test_x_t.append(test_x) 196 | test_y_t.append(test_y) 197 | 198 | return train_x_t, train_y_t, test_x_t, test_y_t, len(reg_data2[0]) 199 | 200 | class Encoder(object): 201 | def __init__(self, inp, n_features, n_hidden, drop_input, drop_hidden, repr_size): 202 | # inp is the placeholder for the input, n_features is the number of features our data has (21 in this example) 203 | # n_hidden is the size of the first hidden layer and repr_size is the dimensionality of the representation 204 | self.inp = inp 205 | self.n_features = n_features 206 | self.n_hidden = n_hidden 207 | self.W1 = tf.Variable(tf.random_normal([n_features, self.n_hidden], stddev=0.35)) 208 | self.W2 = tf.Variable(tf.random_normal([self.n_hidden, repr_size], stddev=0.35)) 209 | 210 | 211 | self.b1 = tf.Variable(tf.random_normal([self.n_hidden], stddev=0.35)) 212 | self.b2 = tf.Variable(tf.random_normal([repr_size], stddev=0.35)) 213 | 214 | self.layer_0 = 
tf.nn.dropout(self.inp, drop_input) 215 | self.layer_1 = tf.nn.relu(tf.matmul(self.layer_0, self.W1) + self.b1) 216 | self.layer_1 = tf.nn.dropout(self.layer_1, drop_hidden) 217 | self.encoder_out = tf.matmul(self.layer_1, self.W2) + self.b2 218 | 219 | 220 | class Decoder(object): 221 | def __init__(self, inp, n_features, n_hidden, drop_input, drop_hidden, repr_size): 222 | self.inp = inp 223 | self.n_hidden = n_hidden 224 | self.W1 = tf.Variable(tf.random_normal([repr_size, self.n_hidden], stddev=0.35)) 225 | self.W2 = tf.Variable(tf.random_normal([self.n_hidden, n_features], stddev=0.35)) 226 | self.b1 = tf.Variable(tf.random_normal([self.n_hidden], stddev=0.35)) 227 | self.b2 = tf.Variable(tf.random_normal([n_features], stddev=0.35)) 228 | 229 | self.layer_0 = tf.nn.dropout(self.inp, drop_input) 230 | self.layer_1 = tf.nn.relu(tf.matmul(self.layer_0, self.W1) + self.b1) 231 | self.layer_1 = tf.nn.dropout(self.layer_1, drop_hidden) 232 | self.decoder_out = tf.matmul(self.layer_1, self.W2) + self.b2 233 | 234 | class Autoencoder(object): 235 | def __init__(self, n_features, batch_size, n_hidden, drop_input, drop_hidden, repr_size, learning_rate): 236 | # n_features is the number of features our data has (21 in this example) 237 | # repr_size the dimensionality of our representation 238 | # n_hidden_1 is the size of the layers closest to the in and output 239 | # n_hidden_2 is the size of the layers closest to the embedding layer 240 | # batch_size number of samples to run per batch 241 | 242 | self.n_features = n_features 243 | self.batch_size = batch_size 244 | self.n_hidden = n_hidden 245 | self.drop_input = drop_input 246 | self.hidden = drop_hidden 247 | self.repr_size = repr_size 248 | 249 | # Start session, placeholder has None in shape for batches 250 | self.sess = tf.Session() 251 | self.inp = tf.placeholder(tf.float32, [None, n_features]) 252 | 253 | # Make the encoder and the decoder 254 | self.encoder = Encoder(self.inp, n_features, n_hidden, drop_input, drop_hidden, repr_size) 255 | self.decoder = Decoder(self.encoder.encoder_out, n_features, n_hidden, drop_input, drop_hidden, repr_size) 256 | 257 | # Loss function mean squared error and AdamOptimizer 258 | self.loss = tf.reduce_mean(tf.square(self.decoder.decoder_out - self.inp), -1) 259 | self.mean_loss = tf.reduce_mean(self.loss) 260 | self.optimizer = tf.train.AdamOptimizer(learning_rate) 261 | self.train_op = self.optimizer.minimize(self.mean_loss) 262 | 263 | # Initialize all variables 264 | self.sess.run(tf.global_variables_initializer()) 265 | 266 | def run_epoch(self, data_list): 267 | # Train once over the passed data_list and return the mean reconstruction loss after the epoch 268 | for index in range(len(data_list) // self.batch_size): 269 | self.sess.run(self.train_op, feed_dict={self.inp: data_list[index * self.batch_size : (index+1) * self.batch_size]}) 270 | return self.sess.run(self.mean_loss, feed_dict={self.inp: data_list}) 271 | 272 | def representations(self, data_list): 273 | # Return a list of representations for the given list of samples 274 | return self.sess.run(self.encoder.encoder_out, feed_dict={self.inp: data_list}) 275 | 276 | def reconstruction_errors(self, data_list): 277 | # Get mean squared reconstruction errors of passed data_list 278 | return self.sess.run(self.loss, feed_dict={self.inp: data_list}) 279 | 280 | 281 | def runANN(data_folder,cfg): 282 | epochs = 1000 283 | #Gather the dataset 284 | #train_x, train_y are just regular samples 285 | train_x, train_y, test_x, test_y, num_input = 
gatherDataset_january(data_folder, cfg, 0.7) 286 | 287 | #std_scale = preprocessing.StandardScaler().fit(train_x) 288 | #train_x = std_scale.transform(train_x) 289 | #test_x = std_scale.transform(test_x) 290 | 291 | #n_features, batch_size, n_hidden, drop_input, drop_hidden, repr_size 292 | ae = Autoencoder(num_input, 128, 128, 0.8, 0.5, 32) 293 | for i in range(epochs): 294 | if(i%50==0): 295 | print "Epoch: " + str(i) 296 | ae.run_epoch(train_x) 297 | 298 | """ 299 | #Show compressed representation of samples (valid for repr_size=2,3) 300 | anomaly_repr = ae.representations(test_x[len(test_x)/2:]) 301 | normal_repr = ae.representations(test_x[:len(test_x)/2]) 302 | anom_x, anom_y = zip(*anomaly_repr) 303 | norm_x, norm_y = zip(*normal_repr) 304 | plt.scatter(anom_x, anom_y, color='red', alpha=0.7) 305 | plt.scatter(norm_x, norm_y, alpha=0.7) 306 | plt.show() 307 | """ 308 | 309 | #Reconstruct samples 310 | anomaly_errors = ae.reconstruction_errors(test_x[len(test_x)/2:]) 311 | normal_val_errors = ae.reconstruction_errors(test_x[:len(test_x)/2]) 312 | 313 | roc_y = [1 for _ in range(len(anomaly_errors))] + [0 for _ in range(len(normal_val_errors))] 314 | roc_score = np.concatenate([anomaly_errors, normal_val_errors]) 315 | 316 | 317 | # Compute ROC curve and ROC area for each class 318 | #number of thresholds = number of data samples - default drop_intermediate 319 | # does not show some low performing configs for creating smoother ROCs 320 | 321 | fpr, tpr, thresholds = roc_curve(roc_y, roc_score, drop_intermediate=True) 322 | roc_auc = auc(fpr, tpr) 323 | 324 | 325 | plt.figure() 326 | lw = 2 327 | plt.plot(fpr, tpr, color='darkorange',lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) 328 | plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') 329 | plt.xlim([0.0, 1.0]) 330 | plt.ylim([0.0, 1.05]) 331 | plt.xlabel('False Positive Rate') 332 | plt.ylabel('True Positive Rate') 333 | plt.title('Receiver operating characteristic example') 334 | plt.legend(loc="lower right") 335 | plt.show() 336 | 337 | 338 | def runANNSearch(data_folder,cfg): 339 | epochs = 100 340 | #Gather the dataset 341 | #train_x, train_y are just regular samples 342 | train_x_t, train_y_t, test_x_t, test_y_t, num_input = gatherDataset_10times(data_folder, cfg, 0.9) 343 | 344 | #std_scale = preprocessing.StandardScaler().fit(train_x) 345 | #train_x = std_scale.transform(train_x) 346 | #test_x = std_scale.transform(test_x) 347 | 348 | max_auc = 0 349 | max_batch_size = 0 350 | max_hidden = 0 351 | max_repr_size = 0 352 | 353 | auc_report = [] 354 | n_hidden_report = [] 355 | repr_size_report = [] 356 | batch_sizes_report = [] 357 | 358 | best_config = [] 359 | max_auc = 0 360 | 361 | learning_rates = [0.001] # [0.01, 0.001] # default is 0.001 362 | batch_sizes = [32]#[8, 16, 32, 64, 128, 256] 363 | n_hiddens = [8, 16, 32, 64, 128, 256]#np.logspace(2, 10, base=2, num=12) 364 | #drop_inputs = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1] 365 | #drop_hiddens = [0.5, 0.6, 0.7, 0.8, 0.9, 1] 366 | repr_sizes = [4, 8, 16, 32, 64, 128, 256] #np.logspace(2, 10, base=2, num=12) #num 20 367 | 368 | for learning_rate in learning_rates: 369 | for batch_size in batch_sizes: 370 | for n_hidden in n_hiddens: 371 | for repr_size in repr_sizes: 372 | if(repr_size <= n_hidden): 373 | #start = time.time() 374 | np.random.seed(1) 375 | graph_level_seed = 1 376 | operation_level_seed = 1 377 | tf.set_random_seed(graph_level_seed) 378 | random.seed(1) 379 | 380 | step_auc = [] 381 | mean_fpr = np.linspace(0, 1, 100) 382 | tprs = [] 383 | for 
n in range(0,10): 384 | #n_features, batch_size, n_hidden, drop_input, drop_hidden, repr_size 385 | ae = Autoencoder(num_input, batch_size, int(n_hidden), 1, 1, int(repr_size), learning_rate) 386 | 387 | train_x = train_x_t[n] 388 | train_y = train_y_t[n] 389 | test_x = test_x_t[n] 390 | test_y = test_y_t[n] 391 | 392 | for i in range(epochs): 393 | ae.run_epoch(train_x) 394 | 395 | #Reconstruct samples 396 | anomaly_errors = ae.reconstruction_errors(test_x[len(test_x)/2:]) 397 | normal_val_errors = ae.reconstruction_errors(test_x[:len(test_x)/2]) 398 | 399 | roc_y = [1 for _ in range(len(anomaly_errors))] + [0 for _ in range(len(normal_val_errors))] 400 | roc_score = np.concatenate([anomaly_errors, normal_val_errors]) 401 | 402 | 403 | # Compute ROC curve and ROC area for each class 404 | fpr, tpr, thresholds = roc_curve(roc_y, roc_score, drop_intermediate=True) 405 | tprs.append(interp(mean_fpr, fpr, tpr)) 406 | tprs[-1][0] = 0.0 407 | roc_auc = auc(fpr, tpr) 408 | #print "Fold %i auc: %f" % (n, roc_auc) 409 | step_auc.append(roc_auc) 410 | 411 | avg_auc = sum(step_auc)/float(len(step_auc)) 412 | 413 | auc_report.append(avg_auc) 414 | """ 415 | n_hidden_report.append(int(n_hidden)) 416 | repr_size_report.append(int(repr_size)) 417 | batch_sizes_report.append(batch_size) 418 | """ 419 | mean_tpr = np.mean(tprs, axis=0) 420 | mean_tpr[-1] = 1.0 421 | mean_auc = auc(mean_fpr, mean_tpr) 422 | 423 | if(mean_auc > max_auc): 424 | max_auc = mean_auc 425 | best_config = [mean_fpr, mean_tpr, n_hidden, repr_size] 426 | 427 | #end = time.time() 428 | #print(end - start) 429 | print ("%f - Batch Size:%i, Learning Rate:%f, n_hidden:%i, repr_size:%i" % (avg_auc, batch_size, learning_rate, int(n_hidden), int(repr_size))) 430 | 431 | 432 | fig = plt.figure() 433 | ax1 = fig.add_subplot(111) 434 | plt.xlim([0, 1]) 435 | plt.ylim([0, 1]) 436 | plt.xlabel('False Positive Rate', fontsize=26) 437 | plt.ylabel('True Positive Rate', fontsize=26) 438 | 439 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 440 | ax1.grid(color='black', linestyle='dotted') 441 | plt.setp(ax1.get_xticklabels(), fontsize=16) 442 | plt.setp(ax1.get_yticklabels(), fontsize=16) 443 | plt.plot(best_config[0], best_config[1], color='b', label=r'ROC (AUC = %0.2f)' % (max_auc), lw=2, alpha=.8) 444 | plt.legend(loc='lower right', fontsize='x-large') 445 | 446 | fig.savefig('Autoencoder/' + "Facet_Autoencoder_" + cfg[1] + ".pdf") # save the figure to file 447 | plt.close(fig) 448 | 449 | print "################\n# Summary" 450 | print "Max. AUC: %f, N_hidden: %i, Repr_Size: %i" % (max_auc, best_config[2],best_config[3]) 451 | print "Avg. 
AUC %f: " % (np.mean(auc_report,axis=0)) 452 | """ 453 | full_report = zip(auc_report, batch_sizes_report, n_hidden_report, repr_size_report) 454 | full_report.sort(key = lambda t: t[0]) 455 | 456 | f = open(cfg[1] + '_report.txt', 'w') 457 | 458 | for item in full_report: 459 | f.write("%f - Batch Size:%i, n_hidden:%i, repr_size:%i\n" % (item[0], item[1], item[2], item[3])) 460 | np.save(cfg[1] + '_report', np.array(full_report)) 461 | """ 462 | 463 | 464 | if __name__ == "__main__": 465 | 466 | cfgs = [ 467 | ["RegularTraffic_Christmas", 468 | "FacetTraffic_12.5_Christmas"], 469 | ["RegularTraffic_Christmas", 470 | "FacetTraffic_25_Christmas"], 471 | ["RegularTraffic_Christmas", 472 | "FacetTraffic_50_Christmas"]] 473 | 474 | 475 | print "Autoencoder - Packet Length Features - Set2" 476 | feature_set = 'PL_60' 477 | data_folder = 'FeatureSets/' + feature_set + '/' 478 | 479 | for cfg in cfgs: 480 | runANNSearch(data_folder,cfg) 481 | -------------------------------------------------------------------------------- /CovertCastAnalysis/X2_classifier.py: -------------------------------------------------------------------------------- 1 | import dpkt 2 | import os 3 | from matplotlib import pyplot as plt 4 | from matplotlib.pyplot import cm 5 | import numpy as np 6 | import socket 7 | import collections 8 | from itertools import product 9 | from scipy.stats import entropy, chisquare, norm, rv_continuous 10 | import random 11 | 12 | random.seed(a=1) 13 | 14 | auxFolder = 'auxFolder/' 15 | 16 | cfgs = [ 17 | ["YouTube_home_world_live", 18 | "CovertCast_home_world"] 19 | ] 20 | 21 | 22 | BIN_WIDTH = [20] 23 | #BIN_WIDTH = [50] 24 | 25 | def ComputeBiGramDistributions(sampleFolder, cfg, binWidth): 26 | freq_dists = [] 27 | 28 | for mode in cfg: 29 | #Compute frequency distribution for A and B 30 | freq_dist = [] 31 | for sample in os.listdir(sampleFolder + mode): 32 | 33 | f = open(auxFolder + os.path.dirname(sampleFolder) + "/" + mode + "/" + sample + '/bigrams_' + str(binWidth), 'r') 34 | 35 | bin_dict = {} 36 | bigrams=[] 37 | #Generate the set of all possible bigrams 38 | for i in product(range(0,1500, binWidth), repeat=2): 39 | bin_dict[str(i).replace(" ", "")] = 1 40 | 41 | 42 | lines = f.readlines() 43 | for line in lines: 44 | try: 45 | bigrams.append(line.rstrip('\n')) 46 | except IndexError: 47 | break #Reached last index, stop processing 48 | f.close() 49 | 50 | #Account for each bin elem 51 | for i in bigrams: 52 | bin_dict['('+str(i)+')']+=1 53 | 54 | #Order bin_key : num_packets 55 | od_dict = collections.OrderedDict(sorted(bin_dict.items())) 56 | bin_list = [] 57 | for i in od_dict: 58 | bin_list.append(float(od_dict[i])) 59 | 60 | #Build up the list of a distribution samples freq dist 61 | freq_dist.append(bin_list) 62 | #Build up the list of all freq dists for different sample folders 63 | freq_dists.append(freq_dist) 64 | 65 | return freq_dists 66 | 67 | 68 | def computeIntraVariance(freq_dists): 69 | varIntra = np.zeros(len(freq_dists[0][0])) 70 | 71 | for i in range(0, len(freq_dists[0][0])): 72 | somatory = 0 73 | 74 | for m in freq_dists: 75 | term = 0 76 | #Compute total n_grams in model 77 | total_ngrams_model = 0 78 | for v in m: 79 | total_ngrams_model += sum(v) 80 | 81 | #Compute probability of a given n_gram in model 82 | prob_ngram_model = 0 83 | for v in m: 84 | prob_ngram_model += v[i] 85 | prob_ngram_model = prob_ngram_model / float(total_ngrams_model) 86 | 87 | for v in m: 88 | n_gram_prob_v = v[i]/sum(v) 89 | term += (float(n_gram_prob_v) - prob_ngram_model)**2 
90 | 91 | somatory += 1/float(len(m)) * term 92 | 93 | varIntra[i] = 1/2.0 * somatory 94 | 95 | return varIntra 96 | 97 | 98 | def computeInterVariance(freq_dists): 99 | varInter = np.zeros(len(freq_dists[0][0])) 100 | 101 | total_videos = len(freq_dists[0]) + len(freq_dists[1]) 102 | 103 | for i in range(0, len(freq_dists[0][0])): 104 | somatory = 0 105 | 106 | ###For each model 107 | for n, m in enumerate(freq_dists): 108 | #Compute total n_grams in model 109 | total_ngrams_model = 0 110 | for v in m: 111 | total_ngrams_model += sum(v) 112 | 113 | #Compute total n_grams in other model 114 | total_ngrams_other_model = 0 115 | for v in freq_dists[(n+1)%2]: 116 | total_ngrams_other_model += sum(v) 117 | 118 | #Compute probability of a given n_gram in model 119 | prob_ngram_model = 0 120 | for v in m: 121 | prob_ngram_model += v[i] 122 | prob_ngram_model = prob_ngram_model / float(total_ngrams_model) 123 | 124 | #Compute probability of a given n_gram in the other model 125 | prob_ngram_other_model = 0 126 | for v in freq_dists[(n+1)%2]: 127 | prob_ngram_other_model += v[i] 128 | prob_ngram_other_model = prob_ngram_other_model / float(total_ngrams_other_model) 129 | 130 | ###For each video in model 131 | for v in m: 132 | n_gram_prob_v = v[i]/sum(v) 133 | somatory += (float(n_gram_prob_v) - prob_ngram_model)**2 134 | 135 | varInter[i] = 1.0/total_videos * somatory 136 | 137 | return varInter 138 | 139 | 140 | def optimizeBigrams(freq_dists): 141 | 142 | varIntra = computeIntraVariance(freq_dists) 143 | varInter = computeInterVariance(freq_dists) 144 | 145 | DIS = np.zeros(len(varIntra)) 146 | DIS = varInter/varIntra 147 | 148 | indexes_to_remove = [] 149 | 150 | for n, i in enumerate(DIS): 151 | if(i < 1): 152 | indexes_to_remove.append(n) 153 | 154 | return indexes_to_remove 155 | 156 | 157 | def buildModels(freq_dists): 158 | ##################################### 159 | # Build models 160 | ##################################### 161 | model_chat = np.zeros(len(freq_dists[0][0])) 162 | model_censored = np.zeros(len(freq_dists[0][0])) 163 | 164 | total_ngrams_chat_set = 0 165 | for dist in freq_dists[0]: 166 | total_ngrams_chat_set += sum(dist) 167 | 168 | total_ngrams_censored_set = 0 169 | for dist in freq_dists[1]: 170 | total_ngrams_censored_set += sum(dist) 171 | 172 | 173 | for i in range(0, len(model_chat)): 174 | somatory = 0 175 | for v in freq_dists[0]: 176 | n_gram_prob = v[i]/sum(v) 177 | v_total_grams = sum(v) 178 | somatory += (v_total_grams * n_gram_prob) 179 | model_chat[i] = (1/total_ngrams_chat_set) * somatory 180 | 181 | 182 | for i in range(0, len(model_censored)): 183 | somatory = 0 184 | for v in freq_dists[1]: 185 | n_gram_prob = v[i]/float(sum(v)) 186 | v_total_grams = sum(v) 187 | somatory += (v_total_grams * n_gram_prob) 188 | model_censored[i] = (1/float(total_ngrams_censored_set)) * somatory 189 | 190 | return model_chat, model_censored 191 | 192 | #Reproduces Facet Fixed threshold evalution 193 | def Prepare_X_Fixed(fig_folder, cfg,binWidth,freq_dists): 194 | optimization = True 195 | 196 | #Transform original freq_dists to include only the better bi-grams 197 | chat_samples = freq_dists[0] 198 | censored_samples = freq_dists[1] 199 | 200 | filtered_freq_dists = [] 201 | filtered_chat_samples = [] 202 | filtered_censored_samples = [] 203 | 204 | if(optimization): 205 | #Optimize bigram choice, build updated frequency distributions 206 | indexes_to_remove = optimizeBigrams(freq_dists) 207 | 208 | for sample in chat_samples: 209 | 
filtered_chat_samples.append(np.delete(sample, indexes_to_remove)) 210 | 211 | for sample in censored_samples: 212 | filtered_censored_samples.append(np.delete(sample, indexes_to_remove)) 213 | else: 214 | #Ignore optimization procedure, carry on with original frequency distributions 215 | filtered_chat_samples = chat_samples 216 | filtered_censored_samples = censored_samples 217 | 218 | #2x Cross validation 219 | filtered_freq_dists1 = [] 220 | filtered_freq_dists2 = [] 221 | 222 | #To Remove 223 | #x = random.sample(filtered_chat_samples, len(filtered_chat_samples)) 224 | #x2 = random.sample(filtered_censored_samples, len(filtered_censored_samples)) 225 | 226 | filtered_freq_dists1.append(filtered_chat_samples[:len(filtered_chat_samples)/2]) 227 | filtered_freq_dists1.append(filtered_censored_samples[:len(filtered_censored_samples)/2]) 228 | 229 | filtered_freq_dists2.append(filtered_chat_samples[len(filtered_chat_samples)/2:]) 230 | filtered_freq_dists2.append(filtered_censored_samples[len(filtered_censored_samples)/2:]) 231 | 232 | model_chat1, model_censored1 = buildModels(filtered_freq_dists1) 233 | acc1, tnr1, fnr1, tpr1, fpr1, ppv1, npv1 = X_Classify_Fixed(cfg,binWidth,filtered_freq_dists2, model_chat1, model_censored1) 234 | print "1st Fold" 235 | print "Acc = " + str(acc1) 236 | print "TPR = " + str(tpr1) 237 | print "TNR = " + str(tnr1) 238 | print "FPR = " + str(fpr1) 239 | print "FNR = " + str(fnr1) 240 | print "PPV = " + str(ppv1) 241 | print "NPV = " + str(npv1) 242 | 243 | model_chat2, model_censored2 = buildModels(filtered_freq_dists2) 244 | acc2, tnr2, fnr2, tpr2, fpr2, ppv2, npv2 = X_Classify_Fixed(cfg,binWidth,filtered_freq_dists1, model_chat2, model_censored2) 245 | print "\n2nd Fold" 246 | print "Acc = " + str(acc2) 247 | print "TPR = " + str(tpr2) 248 | print "TNR = " + str(tnr2) 249 | print "FPR = " + str(fpr2) 250 | print "FNR = " + str(fnr2) 251 | print "PPV = " + str(ppv2) 252 | print "NPV = " + str(npv2) 253 | 254 | print "\n###################" 255 | print "Average" 256 | print "Acc = " + str((acc1 + acc2)/2.0) 257 | print "TPR = " + str((tpr1 + tpr2)/2.0) 258 | print "TNR = " + str((tnr1 + tnr2)/2.0) 259 | print "FPR = " + str((fpr1 + fpr2)/2.0) 260 | print "FNR = " + str((fnr1 + fnr2)/2.0) 261 | print "PPV = " + str((ppv1 + ppv2)/2.0) 262 | print "NPV = " + str((npv1 + npv2)/2.0) 263 | 264 | 265 | ###################################################################################### 266 | def X_Classify_Fixed(cfg, binWidth, freq_dists, model_chat, model_censored): 267 | ########################## 268 | #Classify samples 269 | ########################## 270 | FPositives = 0 271 | FNegatives = 0 272 | TPositives = 0 273 | TNegatives = 0 274 | 275 | #True negative is being classified as facet when it is facet 276 | for v in freq_dists[0]: 277 | chat_score = chisquare(v, model_chat) 278 | censored_score = chisquare(v, model_censored) 279 | 280 | if(chat_score < censored_score): 281 | TPositives += 1 282 | elif(censored_score < chat_score): 283 | FNegatives += 1 284 | 285 | for v in freq_dists[1]: 286 | chat_score = chisquare(v, model_chat) 287 | censored_score = chisquare(v, model_censored) 288 | 289 | if(censored_score < chat_score): 290 | TNegatives += 1 291 | elif(chat_score < censored_score): 292 | FPositives += 1 293 | 294 | 295 | accuracy = (TPositives + TNegatives)/float(len(freq_dists[0]) + len(freq_dists[1])) 296 | TNR = TNegatives/(TNegatives+float(FPositives)) 297 | FNR = FNegatives/(TPositives+float(FNegatives)) 298 | TPR = 
TPositives/(TPositives+float(FNegatives)) 299 | FPR = FPositives/(FPositives+float(TNegatives)) 300 | PPV = TPositives/(TPositives+float(FPositives)) 301 | NPV = TNegatives/(TNegatives+float(FNegatives)) 302 | 303 | return accuracy, TNR, FNR, TPR, FPR, PPV, NPV 304 | 305 | 306 | #Reproduces Facet Changing deltas evaluation 307 | def Prepare_X_RatioReproduction(fig_folder, cfg,binWidth,freq_dists): 308 | optimization = True 309 | 310 | 311 | #Transform original freq_dists to include only the better bi-grams 312 | chat_samples = freq_dists[0] 313 | censored_samples = freq_dists[1] 314 | 315 | filtered_freq_dists = [] 316 | filtered_chat_samples = [] 317 | filtered_censored_samples = [] 318 | 319 | if(optimization): 320 | #Optimize bigram choice, build updated frequency distributions 321 | indexes_to_remove = optimizeBigrams(freq_dists) 322 | 323 | for sample in chat_samples: 324 | filtered_chat_samples.append(np.delete(sample, indexes_to_remove)) 325 | 326 | for sample in censored_samples: 327 | filtered_censored_samples.append(np.delete(sample, indexes_to_remove)) 328 | else: 329 | #Ignore optimization procedure, carry on with original frequency distributions 330 | filtered_chat_samples = chat_samples 331 | filtered_censored_samples = censored_samples 332 | 333 | #2x Cross validation 334 | filtered_freq_dists1 = [] 335 | filtered_freq_dists2 = [] 336 | 337 | #To remove 338 | #x = random.sample(filtered_chat_samples, len(filtered_chat_samples)) 339 | #x2 = random.sample(filtered_censored_samples, len(filtered_censored_samples)) 340 | 341 | filtered_freq_dists1.append(filtered_chat_samples[:len(filtered_chat_samples)/2]) 342 | filtered_freq_dists1.append(filtered_censored_samples[:len(filtered_censored_samples)/2]) 343 | 344 | filtered_freq_dists2.append(filtered_chat_samples[len(filtered_chat_samples)/2:]) 345 | filtered_freq_dists2.append(filtered_censored_samples[len(filtered_censored_samples)/2:]) 346 | 347 | model_chat1, model_censored1 = buildModels(filtered_freq_dists1) 348 | max_acc, max_delta, max_tpr, max_fpr, val90, val80, val70, specificity, sensitivity = X_Classify_RatioReproduction(cfg,binWidth,filtered_freq_dists2, model_chat1, model_censored1) 349 | print "1st Fold" 350 | print "TPR90 = " + str(val90) 351 | print "TPR80 = " + str(val80) 352 | print "TPR70 = " + str(val70) 353 | print "Max acc: " + str(max_acc) + " Max TPR:" + str(max_tpr) + " Max FPR:" + str(max_fpr) + " delta:" + str(max_delta) 354 | 355 | model_chat2, model_censored2 = buildModels(filtered_freq_dists2) 356 | max_acc2, max_delta2, max_tpr2, max_fpr2, val902, val802, val702, specificity2, sensitivity2 = X_Classify_RatioReproduction(cfg,binWidth,filtered_freq_dists1, model_chat2, model_censored2) 357 | print "2nd Fold" 358 | print "TPR90 = " + str(val902) 359 | print "TPR80 = " + str(val802) 360 | print "TPR70 = " + str(val702) 361 | print "Max acc: " + str(max_acc2) + " Max TPR:" + str(max_tpr2) + " Max FPR:" + str(max_fpr2) + " delta:" + str(max_delta2) 362 | 363 | print "###################" 364 | print "Average FPR" 365 | print "TPR90 = " + str((val902+val90)/2.0) 366 | print "TPR80 = " + str((val802+val80)/2.0) 367 | print "TPR70 = " + str((val702+val70)/2.0) 368 | print "Max acc: " + str((max_acc+max_acc2)/2.0) + " Max TPR:" + str((max_tpr+max_tpr2)/2.0) + " Max FPR:" + str((max_fpr+max_fpr2)/2.0) + " delta:" + str((max_delta + max_delta2)/2.0) 369 | 370 | fig = plt.figure() 371 | ax1 = fig.add_subplot(111) 372 | 373 | 374 | Specificity = (specificity + specificity2)/2.0 375 | Sensitivity = (sensitivity + 
sensitivity2)/2.0 376 | 377 | """ 378 | np.set_printoptions(threshold=np.inf) 379 | print specificity 380 | print specificity2 381 | """ 382 | 383 | #ROC Curve 384 | ax1.plot(1 - specificity, sensitivity, color='red', lw=2, alpha=0.7, label = 'k-Fold ROC') 385 | ax1.plot(1 - specificity2, sensitivity2, color='red', lw=2, alpha=0.7) 386 | ax1.plot(1 - Specificity, Sensitivity, 'k.-', color='black', label = 'Mean ROC') 387 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 388 | ax1.grid(color='black', linestyle='dotted') 389 | 390 | plt.title('Receiver Operating Characteristic (ROC)') 391 | plt.xlabel('False Positive Rate', fontsize='x-large') 392 | plt.ylabel('True Positive Rate', fontsize='x-large') 393 | plt.legend(loc='lower right', fontsize='large') 394 | 395 | plt.setp(ax1.get_xticklabels(), fontsize=14) 396 | plt.setp(ax1.get_yticklabels(), fontsize=14) 397 | 398 | fig.savefig(fig_folder + "ROC_" + str(optimization) + "_" + cfg[1] + "_" + str(binWidth)+".pdf") # save the figure to file 399 | plt.close(fig) 400 | 401 | def X_Classify_RatioReproduction(cfg, binWidth,freq_dists, model_chat, model_censored): 402 | ########################## 403 | #Classify samples 404 | ########################## 405 | deltas = np.arange(0.001, 5, 0.001) 406 | FalsePositives = [] 407 | FalseNegatives = [] 408 | TruePositives = [] 409 | TrueNegatives = [] 410 | 411 | Sensitivity = [] 412 | Specificity = [] 413 | FalsePositiveRate = [] 414 | FalseNegativeRate =[] 415 | 416 | holding90 = True 417 | holding80 = True 418 | holding70 = True 419 | 420 | thresh90 = 0 421 | thresh80 = 0 422 | thresh70 = 0 423 | 424 | val90 = 0 425 | val80 = 0 426 | val70 = 0 427 | 428 | max_acc = 0 429 | max_delta = 0 430 | max_tpr = 0 431 | max_fpr = 0 432 | 433 | for delta in deltas: 434 | FPositives = 0 435 | FNegatives = 0 436 | TPositives = 0 437 | TNegatives = 0 438 | 439 | chat_ratios = [] 440 | censored_ratios = [] 441 | 442 | #Positive example is chat 443 | #True positive is being classified as facet when it is facet 444 | for v in freq_dists[0]: 445 | chat_score, p_value = chisquare(v, model_chat) 446 | censored_score, p_value2 = chisquare(v, model_censored) 447 | 448 | 449 | ratio = chat_score / float(censored_score) 450 | chat_ratios.append(ratio) 451 | if(ratio < delta): 452 | TNegatives += 1 453 | elif(ratio > delta): 454 | FPositives += 1 455 | 456 | for v in freq_dists[1]: 457 | chat_score, p_value = chisquare(v, model_chat) 458 | censored_score, p_value2 = chisquare(v, model_censored) 459 | 460 | ratio = chat_score / float(censored_score) 461 | censored_ratios.append(ratio) 462 | if(ratio > delta): 463 | TPositives += 1 464 | elif(ratio < delta): 465 | FNegatives += 1 466 | 467 | 468 | accuracy = (TPositives + TNegatives)/float(len(freq_dists[0]) + len(freq_dists[1])) 469 | TNR = TNegatives/(TNegatives+float(FPositives)) 470 | FNR = FNegatives/(TPositives+float(FNegatives)) 471 | TPR = TPositives/(TPositives+float(FNegatives)) 472 | FPR = FPositives/(FPositives+float(TNegatives)) 473 | 474 | if(accuracy > max_acc): 475 | max_acc = accuracy 476 | max_tpr = TPR 477 | max_fpr = FPR 478 | max_delta = delta 479 | 480 | FalsePositives.append(FPositives) 481 | FalseNegatives.append(FNegatives) 482 | TruePositives.append(TPositives) 483 | TrueNegatives.append(TNegatives) 484 | Sensitivity.append(TPositives/(TPositives+float(FNegatives))) 485 | Specificity.append(TNegatives/(TNegatives+float(FPositives))) 486 | FalsePositiveRate.append(FPR) 487 | FalseNegativeRate.append(FNR) 488 | 489 | 
if(holding90): 490 | if(FNR >= 0.1): 491 | holding90 = False 492 | thresh90 = delta 493 | val90 = FPR 494 | 495 | if(holding80): 496 | if(FNR >= 0.2): 497 | holding80 = False 498 | thresh80 = delta 499 | val80 = FPR 500 | 501 | if(holding70): 502 | if(FNR >= 0.3): 503 | holding70 = False 504 | thresh70 = delta 505 | val70 = FPR 506 | 507 | return max_acc, max_delta, max_tpr, max_fpr, val90, val80, val70, np.array(Specificity), np.array(Sensitivity) 508 | 509 | 510 | 511 | 512 | 513 | if __name__ == "__main__": 514 | 515 | sampleFolder = "TrafficCaptures/" 516 | 517 | if not os.path.exists('X2'): 518 | os.makedirs('X2') 519 | if not os.path.exists('X2/' + os.path.dirname(sampleFolder)): 520 | os.makedirs('X2/' + os.path.dirname(sampleFolder)) 521 | 522 | fig_folder = 'X2/' + os.path.dirname(sampleFolder) + '/' 523 | 524 | 525 | print "###########################" 526 | print os.path.dirname(sampleFolder) 527 | print "###########################" 528 | for cfg in cfgs: 529 | random.seed(a=1) # re-seed 530 | print "=====================================" 531 | print "X classifier - " + cfg[0] + " vs " + cfg[1] 532 | for binWidth in BIN_WIDTH: 533 | print "---------------------" 534 | print "Bin Width: " + str(binWidth) 535 | print "---------------------" 536 | #Compute bigram distributions and shuffle the samples 537 | freq_dists = ComputeBiGramDistributions(sampleFolder, cfg, binWidth) 538 | x = random.sample(freq_dists[0], len(freq_dists[0])) 539 | x2 = random.sample(freq_dists[1], len(freq_dists[1])) 540 | freqs = [] 541 | freqs.append(x) 542 | freqs.append(x2) 543 | 544 | #For reproducing results of Facet paper (70%,80%,90% blockage) 545 | #Prepare_X_RatioReproduction(fig_folder, cfg,binWidth, freqs) 546 | 547 | #For getting fixed classification rates to compare with classifiers without a notion of internal thereshold 548 | Prepare_X_Fixed(fig_folder, cfg,binWidth, freqs) 549 | -------------------------------------------------------------------------------- /DeltaShaperAnalysis/X2_classifier.py: -------------------------------------------------------------------------------- 1 | import dpkt 2 | import os 3 | from matplotlib import pyplot as plt 4 | from matplotlib.pyplot import cm 5 | import numpy as np 6 | import socket 7 | import collections 8 | from itertools import product 9 | from scipy.stats import entropy, chisquare, norm, rv_continuous 10 | import random 11 | 12 | 13 | 14 | auxFolder = 'auxFolder/' 15 | 16 | cfgs = [ 17 | ["RegularTraffic", 18 | "DeltaShaperTraffic_320"], 19 | ["RegularTraffic", 20 | "DeltaShaperTraffic_160"]] 21 | 22 | 23 | BIN_WIDTH = [20] 24 | 25 | def ComputeBiGramDistributions(sampleFolder, cfg, binWidth): 26 | freq_dists = [] 27 | 28 | for mode in cfg: 29 | #Compute frequency distribution for A and B 30 | freq_dist = [] 31 | for sample in os.listdir(sampleFolder + mode): 32 | 33 | f = open(auxFolder + os.path.dirname(sampleFolder) + "/" + mode + "/" + sample + '/bigrams_' + str(binWidth), 'r') 34 | 35 | bin_dict = {} 36 | bigrams=[] 37 | #Generate the set of all possible bigrams 38 | for i in product(range(0,1500, binWidth), repeat=2): 39 | bin_dict[str(i).replace(" ", "")] = 1 40 | 41 | 42 | lines = f.readlines() 43 | for line in lines: 44 | try: 45 | bigrams.append(line.rstrip('\n')) 46 | except IndexError: 47 | break #Reached last index, stop processing 48 | f.close() 49 | 50 | #Account for each bin elem 51 | for i in bigrams: 52 | bin_dict['('+str(i)+')']+=1 53 | 54 | #Order bin_key : num_packets 55 | od_dict = 
collections.OrderedDict(sorted(bin_dict.items())) 56 | bin_list = [] 57 | for i in od_dict: 58 | bin_list.append(float(od_dict[i])) 59 | 60 | #Build up the list of a distribution samples freq dist 61 | freq_dist.append(bin_list) 62 | #Build up the list of all freq dists for different sample folders 63 | freq_dists.append(freq_dist) 64 | 65 | return freq_dists 66 | 67 | 68 | def computeIntraVariance(freq_dists): 69 | varIntra = np.zeros(len(freq_dists[0][0])) 70 | 71 | for i in range(0, len(freq_dists[0][0])): 72 | somatory = 0 73 | 74 | for m in freq_dists: 75 | term = 0 76 | #Compute total n_grams in model 77 | total_ngrams_model = 0 78 | for v in m: 79 | total_ngrams_model += sum(v) 80 | 81 | #Compute probability of a given n_gram in model 82 | prob_ngram_model = 0 83 | for v in m: 84 | prob_ngram_model += v[i] 85 | prob_ngram_model = prob_ngram_model / float(total_ngrams_model) 86 | 87 | for v in m: 88 | n_gram_prob_v = v[i]/sum(v) 89 | term += (float(n_gram_prob_v) - prob_ngram_model)**2 90 | 91 | somatory += 1/float(len(m)) * term 92 | 93 | varIntra[i] = 1/2.0 * somatory 94 | 95 | return varIntra 96 | 97 | 98 | def computeInterVariance(freq_dists): 99 | varInter = np.zeros(len(freq_dists[0][0])) 100 | 101 | total_videos = len(freq_dists[0]) + len(freq_dists[1]) 102 | 103 | for i in range(0, len(freq_dists[0][0])): 104 | somatory = 0 105 | 106 | ###For each model 107 | for n, m in enumerate(freq_dists): 108 | #Compute total n_grams in model 109 | total_ngrams_model = 0 110 | for v in m: 111 | total_ngrams_model += sum(v) 112 | 113 | #Compute total n_grams in other model 114 | total_ngrams_other_model = 0 115 | for v in freq_dists[(n+1)%2]: 116 | total_ngrams_other_model += sum(v) 117 | 118 | #Compute probability of a given n_gram in model 119 | prob_ngram_model = 0 120 | for v in m: 121 | prob_ngram_model += v[i] 122 | prob_ngram_model = prob_ngram_model / float(total_ngrams_model) 123 | 124 | #Compute probability of a given n_gram in the other model 125 | prob_ngram_other_model = 0 126 | for v in freq_dists[(n+1)%2]: 127 | prob_ngram_other_model += v[i] 128 | prob_ngram_other_model = prob_ngram_other_model / float(total_ngrams_other_model) 129 | 130 | ###For each video in model 131 | for v in m: 132 | n_gram_prob_v = v[i]/sum(v) 133 | somatory += (float(n_gram_prob_v) - prob_ngram_model)**2 134 | 135 | varInter[i] = 1.0/total_videos * somatory 136 | 137 | return varInter 138 | 139 | 140 | def optimizeBigrams(freq_dists): 141 | 142 | varIntra = computeIntraVariance(freq_dists) 143 | varInter = computeInterVariance(freq_dists) 144 | 145 | DIS = np.zeros(len(varIntra)) 146 | DIS = varInter/varIntra 147 | 148 | indexes_to_remove = [] 149 | 150 | for n, i in enumerate(DIS): 151 | if(i < 1): 152 | indexes_to_remove.append(n) 153 | 154 | return indexes_to_remove 155 | 156 | 157 | def buildModels(freq_dists): 158 | ##################################### 159 | # Build models 160 | ##################################### 161 | model_chat = np.zeros(len(freq_dists[0][0])) 162 | model_censored = np.zeros(len(freq_dists[0][0])) 163 | 164 | total_ngrams_chat_set = 0 165 | for dist in freq_dists[0]: 166 | total_ngrams_chat_set += sum(dist) 167 | 168 | total_ngrams_censored_set = 0 169 | for dist in freq_dists[1]: 170 | total_ngrams_censored_set += sum(dist) 171 | 172 | 173 | for i in range(0, len(model_chat)): 174 | somatory = 0 175 | for v in freq_dists[0]: 176 | n_gram_prob = v[i]/sum(v) 177 | v_total_grams = sum(v) 178 | somatory += (v_total_grams * n_gram_prob) 179 | model_chat[i] = 
(1/total_ngrams_chat_set) * somatory 180 | 181 | 182 | for i in range(0, len(model_censored)): 183 | somatory = 0 184 | for v in freq_dists[1]: 185 | n_gram_prob = v[i]/float(sum(v)) 186 | v_total_grams = sum(v) 187 | somatory += (v_total_grams * n_gram_prob) 188 | model_censored[i] = (1/float(total_ngrams_censored_set)) * somatory 189 | 190 | return model_chat, model_censored 191 | 192 | #Reproduces Facet Fixed threshold evalution 193 | def Prepare_X_Fixed(fig_folder, cfg,binWidth,freq_dists): 194 | optimization = True 195 | 196 | #Transform original freq_dists to include only the better bi-grams 197 | chat_samples = freq_dists[0] 198 | censored_samples = freq_dists[1] 199 | 200 | filtered_freq_dists = [] 201 | filtered_chat_samples = [] 202 | filtered_censored_samples = [] 203 | 204 | if(optimization): 205 | #Optimize bigram choice, build updated frequency distributions 206 | indexes_to_remove = optimizeBigrams(freq_dists) 207 | 208 | for sample in chat_samples: 209 | filtered_chat_samples.append(np.delete(sample, indexes_to_remove)) 210 | 211 | for sample in censored_samples: 212 | filtered_censored_samples.append(np.delete(sample, indexes_to_remove)) 213 | else: 214 | #Ignore optimization procedure, carry on with original frequency distributions 215 | filtered_chat_samples = chat_samples 216 | filtered_censored_samples = censored_samples 217 | 218 | 219 | #2x Cross validation 220 | filtered_freq_dists1 = [] 221 | filtered_freq_dists2 = [] 222 | 223 | filtered_freq_dists1.append(filtered_chat_samples[:len(filtered_chat_samples)/2]) 224 | filtered_freq_dists1.append(filtered_censored_samples[:len(filtered_censored_samples)/2]) 225 | 226 | filtered_freq_dists2.append(filtered_chat_samples[len(filtered_chat_samples)/2:]) 227 | filtered_freq_dists2.append(filtered_censored_samples[len(filtered_censored_samples)/2:]) 228 | 229 | model_chat1, model_censored1 = buildModels(filtered_freq_dists1) 230 | acc1, tnr1, fnr1, tpr1, fpr1, ppv1, npv1 = X_Classify_Fixed(cfg,binWidth,filtered_freq_dists2, model_chat1, model_censored1) 231 | print "1st Fold" 232 | print "Acc = " + str(acc1) 233 | print "TPR = " + str(tpr1) 234 | print "TNR = " + str(tnr1) 235 | print "FPR = " + str(fpr1) 236 | print "FNR = " + str(fnr1) 237 | print "PPV = " + str(ppv1) 238 | print "NPV = " + str(npv1) 239 | 240 | model_chat2, model_censored2 = buildModels(filtered_freq_dists2) 241 | acc2, tnr2, fnr2, tpr2, fpr2, ppv2, npv2 = X_Classify_Fixed(cfg,binWidth,filtered_freq_dists1, model_chat2, model_censored2) 242 | print "\n2nd Fold" 243 | print "Acc = " + str(acc2) 244 | print "TPR = " + str(tpr2) 245 | print "TNR = " + str(tnr2) 246 | print "FPR = " + str(fpr2) 247 | print "FNR = " + str(fnr2) 248 | print "PPV = " + str(ppv2) 249 | print "NPV = " + str(npv2) 250 | 251 | print "\n###################" 252 | print "Average" 253 | print "Acc = " + str((acc1 + acc2)/2.0) 254 | print "TPR = " + str((tpr1 + tpr2)/2.0) 255 | print "TNR = " + str((tnr1 + tnr2)/2.0) 256 | print "FPR = " + str((fpr1 + fpr2)/2.0) 257 | print "FNR = " + str((fnr1 + fnr2)/2.0) 258 | print "PPV = " + str((ppv1 + ppv2)/2.0) 259 | print "NPV = " + str((npv1 + npv2)/2.0) 260 | 261 | 262 | ###################################################################################### 263 | def X_Classify_Fixed(cfg, binWidth, freq_dists, model_chat, model_censored): 264 | ########################## 265 | #Classify samples 266 | ########################## 267 | FPositives = 0 268 | FNegatives = 0 269 | TPositives = 0 270 | TNegatives = 0 271 | 272 | #True negative is 
being classified as facet when it is facet 273 | for v in freq_dists[0]: 274 | chat_score = chisquare(v, model_chat) 275 | censored_score = chisquare(v, model_censored) 276 | 277 | if(chat_score < censored_score): 278 | TPositives += 1 279 | elif(censored_score < chat_score): 280 | FNegatives += 1 281 | 282 | for v in freq_dists[1]: 283 | chat_score = chisquare(v, model_chat) 284 | censored_score = chisquare(v, model_censored) 285 | 286 | if(censored_score < chat_score): 287 | TNegatives += 1 288 | elif(chat_score < censored_score): 289 | FPositives += 1 290 | 291 | 292 | accuracy = (TPositives + TNegatives)/float(len(freq_dists[0]) + len(freq_dists[1])) 293 | TNR = TNegatives/(TNegatives+float(FPositives)) 294 | FNR = FNegatives/(TPositives+float(FNegatives)) 295 | TPR = TPositives/(TPositives+float(FNegatives)) 296 | FPR = FPositives/(FPositives+float(TNegatives)) 297 | PPV = TPositives/(TPositives+float(FPositives)) 298 | NPV = TNegatives/(TNegatives+float(FNegatives)) 299 | 300 | return accuracy, TNR, FNR, TPR, FPR, PPV, NPV 301 | 302 | 303 | #Reproduces Facet Changing deltas evaluation 304 | def Prepare_X_RatioReproduction(fig_folder, cfg,binWidth,freq_dists): 305 | optimization = True 306 | 307 | 308 | #Transform original freq_dists to include only the better bi-grams 309 | chat_samples = freq_dists[0] 310 | censored_samples = freq_dists[1] 311 | 312 | filtered_freq_dists = [] 313 | filtered_chat_samples = [] 314 | filtered_censored_samples = [] 315 | 316 | if(optimization): 317 | #Optimize bigram choice, build updated frequency distributions 318 | indexes_to_remove = optimizeBigrams(freq_dists) 319 | np.save(fig_folder + "RemovedIndexes_" + cfg[1], np.array(indexes_to_remove)) 320 | 321 | for sample in chat_samples: 322 | filtered_chat_samples.append(np.delete(sample, indexes_to_remove)) 323 | 324 | for sample in censored_samples: 325 | filtered_censored_samples.append(np.delete(sample, indexes_to_remove)) 326 | else: 327 | #Ignore optimization procedure, carry on with original frequency distributions 328 | filtered_chat_samples = chat_samples 329 | filtered_censored_samples = censored_samples 330 | print "Finished optimization" 331 | #2x Cross validation 332 | filtered_freq_dists1 = [] 333 | filtered_freq_dists2 = [] 334 | 335 | filtered_freq_dists1.append(filtered_chat_samples[:len(filtered_chat_samples)/2]) 336 | filtered_freq_dists1.append(filtered_censored_samples[:len(filtered_censored_samples)/2]) 337 | 338 | filtered_freq_dists2.append(filtered_chat_samples[len(filtered_chat_samples)/2:]) 339 | filtered_freq_dists2.append(filtered_censored_samples[len(filtered_censored_samples)/2:]) 340 | 341 | model_chat1, model_censored1 = buildModels(filtered_freq_dists1) 342 | max_acc, max_delta, max_tpr, max_fpr, val90, val80, val70, specificity, sensitivity = X_Classify_RatioReproduction(cfg,binWidth,filtered_freq_dists2, model_chat1, model_censored1) 343 | print "1st Fold" 344 | print "TPR90 = " + str(val90) 345 | print "TPR80 = " + str(val80) 346 | print "TPR70 = " + str(val70) 347 | print "Max acc: " + str(max_acc) + " Max TPR:" + str(max_tpr) + " Max FPR:" + str(max_fpr) + " delta:" + str(max_delta) 348 | 349 | model_chat2, model_censored2 = buildModels(filtered_freq_dists2) 350 | max_acc2, max_delta2, max_tpr2, max_fpr2, val902, val802, val702, specificity2, sensitivity2 = X_Classify_RatioReproduction(cfg,binWidth,filtered_freq_dists1, model_chat2, model_censored2) 351 | print "2nd Fold" 352 | print "TPR90 = " + str(val902) 353 | print "TPR80 = " + str(val802) 354 | print "TPR70 = 
" + str(val702) 355 | print "Max acc: " + str(max_acc2) + " Max TPR:" + str(max_tpr2) + " Max FPR:" + str(max_fpr2) + " delta:" + str(max_delta2) 356 | 357 | print "###################" 358 | print "Average FPR" 359 | print "TPR90 = " + str((val902+val90)/2.0) 360 | print "TPR80 = " + str((val802+val80)/2.0) 361 | print "TPR70 = " + str((val702+val70)/2.0) 362 | print "Max acc: " + str((max_acc+max_acc2)/2.0) + " Max TPR:" + str((max_tpr+max_tpr2)/2.0) + " Max FPR:" + str((max_fpr+max_fpr2)/2.0) + " delta:" + str((max_delta + max_delta2)/2.0) 363 | 364 | fig = plt.figure() 365 | ax1 = fig.add_subplot(111) 366 | 367 | 368 | Specificity = (specificity + specificity2)/2.0 369 | Sensitivity = (sensitivity + sensitivity2)/2.0 370 | 371 | np.save(fig_folder + "ROC_" + str(optimization) + "_" + cfg[1] + "_" + str(binWidth)+"_Sensitivity", np.array(Sensitivity)) 372 | np.save(fig_folder + "ROC_" + str(optimization) + "_" + cfg[1] + "_" + str(binWidth)+"_Specificity", np.array(Specificity)) 373 | """ 374 | np.set_printoptions(threshold=np.inf) 375 | print specificity 376 | print specificity2 377 | """ 378 | 379 | print "AUC" 380 | auc = np.trapz(Sensitivity, 1 - Specificity) 381 | print auc 382 | #ROC Curve 383 | ax1.plot(1 - specificity, sensitivity, color='red', lw=2, alpha=0.7, label = 'K-Fold ROC') 384 | ax1.plot(1 - specificity2, sensitivity2, color='red', lw=2, alpha=0.7) 385 | ax1.plot(1 - Specificity, Sensitivity, 'k.-', color='black', label = 'Mean ROC (AUC = %0.2f)' % (auc)) 386 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 387 | ax1.grid(color='black', linestyle='dotted') 388 | 389 | #plt.title('Receiver Operating Characteristic (ROC)') 390 | plt.xlabel('False Positive Rate', fontsize='xx-large') #one size down 391 | plt.ylabel('True Positive Rate', fontsize='xx-large') 392 | plt.legend(loc='lower right', fontsize='x-large') 393 | 394 | plt.setp(ax1.get_xticklabels(), fontsize=16) #14 395 | plt.setp(ax1.get_yticklabels(), fontsize=16) 396 | 397 | fig.savefig(fig_folder + "ROC_" + str(optimization) + "_" + cfg[1] + "_" + str(binWidth)+".pdf") # save the figure to file 398 | plt.close(fig) 399 | 400 | def X_Classify_RatioReproduction(cfg, binWidth,freq_dists, model_chat, model_censored): 401 | ########################## 402 | #Classify samples 403 | ########################## 404 | deltas = np.arange(0.001, 5, 0.001) 405 | FalsePositives = [] 406 | FalseNegatives = [] 407 | TruePositives = [] 408 | TrueNegatives = [] 409 | 410 | Sensitivity = [] 411 | Specificity = [] 412 | FalsePositiveRate = [] 413 | FalseNegativeRate =[] 414 | 415 | holding90 = True 416 | holding80 = True 417 | holding70 = True 418 | 419 | thresh90 = 0 420 | thresh80 = 0 421 | thresh70 = 0 422 | 423 | val90 = 0 424 | val80 = 0 425 | val70 = 0 426 | 427 | max_acc = 0 428 | max_delta = 0 429 | max_tpr = 0 430 | max_fpr = 0 431 | 432 | for delta in deltas: 433 | FPositives = 0 434 | FNegatives = 0 435 | TPositives = 0 436 | TNegatives = 0 437 | 438 | chat_ratios = [] 439 | censored_ratios = [] 440 | 441 | #Positive example is chat 442 | #True positive is being classified as facet when it is facet 443 | for v in freq_dists[0]: 444 | chat_score, p_value = chisquare(v, model_chat) 445 | censored_score, p_value2 = chisquare(v, model_censored) 446 | 447 | 448 | ratio = chat_score / float(censored_score) 449 | chat_ratios.append(ratio) 450 | if(ratio < delta): 451 | TNegatives += 1 452 | elif(ratio > delta): 453 | FPositives += 1 454 | 455 | for v in freq_dists[1]: 456 | chat_score, p_value = 
chisquare(v, model_chat) 457 | censored_score, p_value2 = chisquare(v, model_censored) 458 | 459 | ratio = chat_score / float(censored_score) 460 | censored_ratios.append(ratio) 461 | if(ratio > delta): 462 | TPositives += 1 463 | elif(ratio < delta): 464 | FNegatives += 1 465 | 466 | 467 | accuracy = (TPositives + TNegatives)/float(len(freq_dists[0]) + len(freq_dists[1])) 468 | TNR = TNegatives/(TNegatives+float(FPositives)) 469 | FNR = FNegatives/(TPositives+float(FNegatives)) 470 | TPR = TPositives/(TPositives+float(FNegatives)) 471 | FPR = FPositives/(FPositives+float(TNegatives)) 472 | 473 | if(accuracy > max_acc): 474 | max_acc = accuracy 475 | max_tpr = TPR 476 | max_fpr = FPR 477 | max_delta = delta 478 | 479 | FalsePositives.append(FPositives) 480 | FalseNegatives.append(FNegatives) 481 | TruePositives.append(TPositives) 482 | TrueNegatives.append(TNegatives) 483 | Sensitivity.append(TPositives/(TPositives+float(FNegatives))) 484 | Specificity.append(TNegatives/(TNegatives+float(FPositives))) 485 | FalsePositiveRate.append(FPR) 486 | FalseNegativeRate.append(FNR) 487 | 488 | if(holding90): 489 | if(FNR >= 0.1): 490 | holding90 = False 491 | thresh90 = delta 492 | val90 = FPR 493 | 494 | if(holding80): 495 | if(FNR >= 0.2): 496 | holding80 = False 497 | thresh80 = delta 498 | val80 = FPR 499 | 500 | if(holding70): 501 | if(FNR >= 0.3): 502 | holding70 = False 503 | thresh70 = delta 504 | val70 = FPR 505 | 506 | return max_acc, max_delta, max_tpr, max_fpr, val90, val80, val70, np.array(Specificity), np.array(Sensitivity) 507 | 508 | 509 | 510 | if __name__ == "__main__": 511 | 512 | sampleFolder = "TrafficCaptures/480Resolution/" 513 | 514 | if not os.path.exists('X2'): 515 | os.makedirs('X2') 516 | if not os.path.exists('X2/' + os.path.dirname(sampleFolder)): 517 | os.makedirs('X2/' + os.path.dirname(sampleFolder)) 518 | 519 | fig_folder = 'X2/' + os.path.dirname(sampleFolder) + '/' 520 | 521 | 522 | print "###########################" 523 | print os.path.dirname(sampleFolder) 524 | print "###########################" 525 | for cfg in cfgs: 526 | random.seed(a=1) # re-seed 527 | print "=====================================" 528 | print "X classifier - " + cfg[0] + " vs " + cfg[1] 529 | for binWidth in BIN_WIDTH: 530 | print "---------------------" 531 | print "Bin Width: " + str(binWidth) 532 | print "---------------------" 533 | #Compute bigram distributions and shuffle the samples 534 | freq_dists = ComputeBiGramDistributions(sampleFolder, cfg, binWidth) 535 | x = random.sample(freq_dists[0], len(freq_dists[0])) 536 | x2 = random.sample(freq_dists[1], len(freq_dists[1])) 537 | freqs = [] 538 | freqs.append(x) 539 | freqs.append(x2) 540 | 541 | print "Finished sample processing" 542 | #For reproducing results of Facet paper (70%,80%,90% blockage) 543 | Prepare_X_RatioReproduction(fig_folder, cfg,binWidth, freqs) 544 | 545 | #For getting fixed classification rates to compare with classifiers without a notion of internal thereshold 546 | #Prepare_X_Fixed(fig_folder, cfg,binWidth, freqs) 547 | -------------------------------------------------------------------------------- /DeltaShaperAnalysis/IsolationForest.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import dpkt 3 | import os 4 | import csv 5 | import numpy as np 6 | import random 7 | import math 8 | from sklearn.ensemble import IsolationForest 9 | from sklearn.svm import OneClassSVM 10 | from sklearn.neighbors import LocalOutlierFactor 11 | 12 | from copy import 
deepcopy 13 | from scipy import interp 14 | import matplotlib.pyplot as plt 15 | from sklearn import preprocessing 16 | from sklearn.decomposition import PCA 17 | 18 | from sklearn.model_selection import GridSearchCV 19 | from sklearn.metrics import classification_report 20 | from sklearn.metrics import accuracy_score 21 | from sklearn.metrics import precision_score 22 | from sklearn.metrics import recall_score 23 | from sklearn.metrics import roc_auc_score 24 | from sklearn.metrics import roc_curve 25 | from sklearn.metrics import auc 26 | 27 | plt.rcParams['font.family'] = 'Helvetica' 28 | 29 | random.seed(42) 30 | rng = np.random.RandomState(42) 31 | 32 | def gatherHoldoutData(data_folder, cfg): 33 | 34 | SPLIT_FACTOR = 0.7 35 | #Load Datasets 36 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 37 | reader = csv.reader(f, delimiter=',') 38 | reg = list(reader) 39 | 40 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 41 | reader = csv.reader(f, delimiter=',') 42 | fac = list(reader) 43 | 44 | 45 | #Convert data to floats (and labels to integers) 46 | reg_data = [] 47 | for i in reg[1:]: 48 | int_array = [] 49 | for pl in i[:-1]: 50 | int_array.append(float(pl)) 51 | int_array.append(1) #0, inliers 52 | reg_data.append(int_array) 53 | 54 | fac_data = [] 55 | for i in fac[1:]: 56 | int_array = [] 57 | for pl in i[:-1]: 58 | int_array.append(float(pl)) 59 | int_array.append(-1) #1, outliers 60 | fac_data.append(int_array) 61 | 62 | 63 | #Shuffle both datasets 64 | shuffled_reg_data = random.sample(reg_data, len(reg_data)) 65 | shuffled_fac_data = random.sample(fac_data, len(fac_data)) 66 | 67 | #Build label tensors 68 | reg_labels = [] 69 | for i in shuffled_reg_data: 70 | reg_labels.append(int(i[len(reg_data[0])-1])) 71 | 72 | fac_labels = [] 73 | for i in shuffled_fac_data: 74 | fac_labels.append(int(i[len(reg_data[0])-1])) 75 | 76 | #Take label out of data tensors 77 | for i in range(0, len(shuffled_reg_data)): 78 | shuffled_reg_data[i].pop() 79 | 80 | for i in range(0, len(shuffled_fac_data)): 81 | shuffled_fac_data[i].pop() 82 | 83 | 84 | #Build training and testing datasets 85 | #Split each class data in the appropriate proportion for training 86 | reg_proportion_index = int(len(reg_labels)* SPLIT_FACTOR) 87 | reg_train_x = shuffled_reg_data[:reg_proportion_index] 88 | reg_train_y = reg_labels[:reg_proportion_index] 89 | 90 | fac_proportion_index = int(len(fac_labels)*SPLIT_FACTOR) 91 | fac_train_x = shuffled_fac_data[:fac_proportion_index] 92 | fac_train_y = fac_labels[:fac_proportion_index] 93 | 94 | #Create training sets by combining the randomly selected samples from each class 95 | train_x = reg_train_x + fac_train_x 96 | train_y = reg_train_y + fac_train_y 97 | 98 | #Make the split for the testing data 99 | reg_test_x = shuffled_reg_data[reg_proportion_index:] 100 | reg_test_y = reg_labels[reg_proportion_index:] 101 | 102 | fac_test_x = shuffled_fac_data[fac_proportion_index:] 103 | fac_test_y = fac_labels[fac_proportion_index:] 104 | 105 | #Create testing set by combining the holdout samples 106 | test_x = reg_test_x + fac_test_x 107 | test_y = reg_test_y + fac_test_y 108 | 109 | return train_x, train_y, test_x, test_y 110 | 111 | def gatherHoldoutData_10times(data_folder, cfg, split_factor): 112 | random.seed(1) 113 | SPLIT_FACTOR = split_factor 114 | #Load Datasets 115 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 116 | reader = csv.reader(f, delimiter=',') 117 | reg = list(reader) 118 | 119 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 120 | 
reader = csv.reader(f, delimiter=',') 121 | fac = list(reader) 122 | print "###########################################" 123 | print "Configuration " + cfg[1] 124 | print "###########################################" 125 | 126 | 127 | #Convert data to floats (and labels to integers) 128 | reg_data = [] 129 | for i in reg[1:]: 130 | int_array = [] 131 | for pl in i[:-1]: 132 | int_array.append(float(pl)) 133 | int_array.append(-1) #0, inliers 134 | reg_data.append(int_array) 135 | 136 | fac_data = [] 137 | for i in fac[1:]: 138 | int_array = [] 139 | for pl in i[:-1]: 140 | int_array.append(float(pl)) 141 | int_array.append(1) #1, outliers 142 | fac_data.append(int_array) 143 | 144 | train_x_t = [] 145 | train_y_t = [] 146 | test_x_t = [] 147 | test_y_t = [] 148 | 149 | for k in range(0,10): 150 | reg_data2 = deepcopy(reg_data) 151 | fac_data2 = deepcopy(fac_data) 152 | 153 | 154 | #Shuffle both datasets 155 | shuffled_reg_data = random.sample(reg_data2, len(reg_data2)) 156 | shuffled_fac_data = random.sample(fac_data2, len(fac_data2)) 157 | 158 | #Build label tensors 159 | reg_labels = [] 160 | for i in shuffled_reg_data: 161 | reg_labels.append(int(i[len(reg_data2[0])-1])) 162 | 163 | fac_labels = [] 164 | for i in shuffled_fac_data: 165 | fac_labels.append(int(i[len(reg_data2[0])-1])) 166 | 167 | #Take label out of data tensors 168 | for i in range(0, len(shuffled_reg_data)): 169 | shuffled_reg_data[i].pop() 170 | 171 | for i in range(0, len(shuffled_fac_data)): 172 | shuffled_fac_data[i].pop() 173 | 174 | 175 | #Build training and testing datasets 176 | #Split each class data in the appropriate proportion for training 177 | reg_proportion_index = int(len(reg_labels)* SPLIT_FACTOR) 178 | reg_train_x = shuffled_reg_data[:reg_proportion_index] 179 | reg_train_y = reg_labels[:reg_proportion_index] 180 | 181 | fac_proportion_index = int(len(fac_labels)*SPLIT_FACTOR) 182 | fac_train_x = shuffled_fac_data[:fac_proportion_index] 183 | fac_train_y = fac_labels[:fac_proportion_index] 184 | 185 | #Create training sets by combining the randomly selected samples from each class 186 | train_x = reg_train_x + fac_train_x 187 | train_y = reg_train_y + fac_train_y 188 | 189 | #Make the split for the testing data 190 | reg_test_x = shuffled_reg_data[reg_proportion_index:] 191 | reg_test_y = reg_labels[reg_proportion_index:] 192 | fac_test_x = shuffled_fac_data[fac_proportion_index:] 193 | fac_test_y = fac_labels[fac_proportion_index:] 194 | 195 | #Create testing set by combining the holdout samples 196 | test_x = reg_test_x + fac_test_x 197 | test_y = reg_test_y + fac_test_y 198 | 199 | train_x_t.append(train_x) 200 | train_y_t.append(train_y) 201 | test_x_t.append(test_x) 202 | test_y_t.append(test_y) 203 | 204 | 205 | return train_x_t, train_y_t, test_x_t, test_y_t 206 | 207 | def gatherAllData(data_folder, cfg): 208 | #Load Datasets 209 | f = open(data_folder + cfg[0] + "_dataset.csv", 'r') 210 | reader = csv.reader(f, delimiter=',') 211 | reg = list(reader) 212 | 213 | f = open(data_folder + cfg[1] + "_dataset.csv", 'r') 214 | reader = csv.reader(f, delimiter=',') 215 | fac = list(reader) 216 | print "###########################################" 217 | print "Configuration " + cfg[1] 218 | print "###########################################" 219 | 220 | #Convert data to floats (and labels to integers) 221 | reg_data = [] 222 | for i in reg[1:]: 223 | int_array = [] 224 | for pl in i[:-1]: 225 | int_array.append(float(pl)) 226 | int_array.append(0) 227 | reg_data.append(int_array) 228 | 229 | fac_data = 
[] 230 | for i in fac[1:]: 231 | int_array = [] 232 | for pl in i[:-1]: 233 | int_array.append(float(pl)) 234 | int_array.append(1) 235 | fac_data.append(int_array) 236 | 237 | 238 | #Shuffle both datasets 239 | shuffled_reg_data = random.sample(reg_data, len(reg_data)) 240 | shuffled_fac_data = random.sample(fac_data, len(fac_data)) 241 | 242 | #Build label tensors 243 | reg_labels = [] 244 | for i in shuffled_reg_data: 245 | reg_labels.append(int(i[len(reg_data[0])-1])) 246 | 247 | fac_labels = [] 248 | for i in shuffled_fac_data: 249 | fac_labels.append(int(i[len(reg_data[0])-1])) 250 | 251 | #Take label out of data tensors 252 | for i in range(0, len(shuffled_reg_data)): 253 | shuffled_reg_data[i].pop() 254 | 255 | for i in range(0, len(shuffled_fac_data)): 256 | shuffled_fac_data[i].pop() 257 | 258 | #Create training sets by combining the randomly selected samples from each class 259 | train_x = shuffled_reg_data + shuffled_fac_data 260 | train_y = reg_labels + fac_labels 261 | 262 | #Shuffle positive/negative samples for CV purposes 263 | x_shuf = [] 264 | y_shuf = [] 265 | index_shuf = range(len(train_x)) 266 | random.shuffle(index_shuf) 267 | for i in index_shuf: 268 | x_shuf.append(train_x[i]) 269 | y_shuf.append(train_y[i]) 270 | 271 | return x_shuf, y_shuf 272 | 273 | def runIsolationSearch(data_folder, cfg, cnt_factor): #sweep the number of trees and keep the best average holdout accuracy 274 | 275 | 276 | max_acc = 0 277 | max_tree = 0 278 | 279 | for n, t in enumerate(range(10,500,10)): 280 | print t 281 | acc = 0 282 | tnr = 0 283 | fnr = 0 284 | tpr = 0 285 | fpr = 0 286 | ppv = 0 287 | npv = 0 288 | for i in range(0,3): 289 | #Gather the dataset 290 | train_x, train_y, test_x, test_y = gatherHoldoutData(data_folder, cfg) 291 | 292 | clf = IsolationForest(n_estimators=int(t), random_state=rng, max_features=1.0, contamination=cnt_factor) 293 | 294 | # fit the model 295 | cnt_train = int(math.ceil(cnt_factor * (len(train_x)/2))) 296 | clf.fit(train_x[:(len(train_x)/2) + cnt_train]) 297 | 298 | #make predictions on testing data 299 | cnt_test = int(math.ceil(cnt_factor * (len(test_x)/2))) 300 | #y_true, y_pred = test_y[:(len(test_x)/2) + cnt_test], clf.predict(test_x[:(len(test_x)/2) + cnt_test]) 301 | 302 | y_true, y_pred = test_y, clf.predict(test_x) 303 | 304 | #print(roc_auc_score(y_true, -clf.decision_function(test_x[:(len(test_x)/2) + cnt_test]))) 305 | 306 | eps = 0.0000000001 307 | FPositives = 0 308 | FNegatives = 0 309 | TPositives = 0 310 | TNegatives = 0 311 | 312 | for n, lbl in enumerate(y_pred): 313 | if(lbl == -1 and y_true[n] == -1): 314 | TNegatives += 1 315 | elif(lbl == 1 and y_true[n] == -1): 316 | FPositives += 1 317 | elif(lbl == -1 and y_true[n] == 1): 318 | FNegatives += 1 319 | elif(lbl == 1 and y_true[n] == 1): 320 | TPositives += 1 321 | 322 | accuracy = (TPositives + TNegatives)/float((len(test_x))) 323 | TNR = TNegatives/(TNegatives+float(FPositives)+eps) 324 | FNR = FNegatives/(TPositives+float(FNegatives)) 325 | TPR = TPositives/(TPositives+float(FNegatives)) 326 | FPR = FPositives/(FPositives+float(TNegatives)+eps) 327 | PPV = TPositives/(TPositives+float(FPositives)) 328 | NPV = TNegatives/(TNegatives+float(FNegatives)+eps) 329 | 330 | acc+=accuracy 331 | tnr+=TNR 332 | fnr+=FNR 333 | tpr+=TPR 334 | fpr+=FPR 335 | ppv+=PPV 336 | npv+=NPV 337 | 338 | ac = acc/3 339 | if(int(t)%100 == 0): 340 | print str(t) + " trees = " + str(ac) 341 | if(ac > max_acc): 342 | max_acc = ac 343 | max_tree = int(t) 344 | print max_acc 345 | print max_tree 346 | 347 | def runIsolationRounds(data_folder, cfg, cnt_factor): 348 | 349 | acc = 0 350 | 
tnr = 0 351 | fnr = 0 352 | tpr = 0 353 | fpr = 0 354 | ppv = 0 355 | npv = 0 356 | 357 | for i in range(0,10): 358 | #Gather the dataset 359 | train_x, train_y, test_x, test_y = gatherHoldoutData(data_folder, cfg) 360 | 361 | clf = IsolationForest(n_estimators=100, random_state=rng, max_features=1.0, contamination=cnt_factor) 362 | 363 | # fit the model 364 | cnt_train = int(math.ceil(cnt_factor * (len(train_x)/2))) 365 | clf.fit(train_x[:(len(train_x)/2) + cnt_train]) 366 | 367 | #make predictions on testing data 368 | cnt_test = int(math.ceil(cnt_factor * (len(test_x)/2))) 369 | #y_true, y_pred = test_y[:(len(test_x)/2) + cnt_test], clf.predict(test_x[:(len(test_x)/2) + cnt_test]) 370 | 371 | y_true, y_pred = test_y, clf.predict(test_x) 372 | 373 | #print(roc_auc_score(y_true, -clf.decision_function(test_x[:(len(test_x)/2) + cnt_test]))) 374 | 375 | eps = 0.0000000001 376 | FPositives = 0 377 | FNegatives = 0 378 | TPositives = 0 379 | TNegatives = 0 380 | 381 | for n, lbl in enumerate(y_pred): 382 | if(lbl == -1 and y_true[n] == -1): 383 | TNegatives += 1 384 | elif(lbl == 1 and y_true[n] == -1): 385 | FPositives += 1 386 | elif(lbl == -1 and y_true[n] == 1): 387 | FNegatives += 1 388 | elif(lbl == 1 and y_true[n] == 1): 389 | TPositives += 1 390 | 391 | accuracy = (TPositives + TNegatives)/float((len(test_x))) 392 | TNR = TNegatives/(TNegatives+float(FPositives)+eps) 393 | FNR = FNegatives/(TPositives+float(FNegatives)) 394 | TPR = TPositives/(TPositives+float(FNegatives)) 395 | FPR = FPositives/(FPositives+float(TNegatives)+eps) 396 | PPV = TPositives/(TPositives+float(FPositives)) 397 | NPV = TNegatives/(TNegatives+float(FNegatives)+eps) 398 | 399 | acc+=accuracy 400 | tnr+=TNR 401 | fnr+=FNR 402 | tpr+=TPR 403 | fpr+=FPR 404 | ppv+=PPV 405 | npv+=NPV 406 | 407 | 408 | print "Acc = " + str(acc/10) 409 | print "TPR = " + str(tpr/10) 410 | print "TNR = " + str(tnr/10) 411 | print "FPR = " + str(fpr/10) 412 | print "FNR = " + str(fnr/10) 413 | print "PPV = " + str(ppv/10) 414 | print "NPV = " + str(npv/10) 415 | 416 | def runIsolation(data_folder, cfg, cnt_factor): 417 | rng = np.random.RandomState(42) 418 | #Gather the dataset 419 | train_x, train_y, test_x, test_y = gatherHoldoutData(data_folder, cfg) 420 | 421 | clf = IsolationForest(n_estimators=100,random_state=rng, bootstrap=True, max_features=1.0, contamination=cnt_factor) 422 | 423 | # fit the model 424 | cnt_train = int(math.ceil(cnt_factor * (len(train_x)/2))) 425 | clf.fit(train_x[:(len(train_x)/2) + cnt_train]) 426 | 427 | #make predictions on testing data 428 | cnt_test = int(math.ceil(cnt_factor * (len(test_x)/2))) 429 | y_true, y_pred = test_y[:(len(test_x)/2) + cnt_test], clf.predict(test_x[:(len(test_x)/2) + cnt_test]) 430 | 431 | #y_true, y_pred = test_y, clf.predict(test_x) 432 | 433 | #print(roc_auc_score(y_true, -clf.decision_function(test_x[:(len(test_x)/2) + cnt_test]))) 434 | 435 | eps = 0.0000000001 436 | FPositives = 0 437 | FNegatives = 0 438 | TPositives = 0 439 | TNegatives = 0 440 | 441 | for n, lbl in enumerate(y_pred): 442 | if(lbl == -1 and y_true[n] == -1): 443 | TNegatives += 1 444 | elif(lbl == 1 and y_true[n] == -1): 445 | FPositives += 1 446 | elif(lbl == -1 and y_true[n] == 1): 447 | FNegatives += 1 448 | elif(lbl == 1 and y_true[n] == 1): 449 | TPositives += 1 450 | 451 | accuracy = (TPositives + TNegatives)/float((len(test_x)/2) + cnt_test) 452 | TNR = TNegatives/(TNegatives+float(FPositives)+eps) 453 | FNR = FNegatives/(TPositives+float(FNegatives)) 454 | TPR = 
TPositives/(TPositives+float(FNegatives)) 455 | FPR = FPositives/(FPositives+float(TNegatives)+eps) 456 | PPV = TPositives/(TPositives+float(FPositives)) 457 | NPV = TNegatives/(TNegatives+float(FNegatives)+eps) 458 | print "Acc = " + str(accuracy) 459 | print "TPR = " + str(TPR) 460 | print "TNR = " + str(TNR) 461 | print "FPR = " + str(FPR) 462 | print "FNR = " + str(FNR) 463 | print "PPV = " + str(PPV) 464 | print "NPV = " + str(NPV) 465 | 466 | def runOptimizedIso_CV(data_folder, cfg, feature_set): 467 | train_X, train_Y, test_X, test_Y = gatherHoldoutData_10times(data_folder, cfg, 0.9) 468 | 469 | estimators = [50, 100, 200] #np.linspace(0.1, 1, 10) 470 | samples=[64, 128, 256, 512] 471 | cnt_factors = [0] 472 | 473 | auc_report = [] 474 | best_config = [] 475 | max_auc = 0 476 | for estimator in estimators: 477 | for s in samples: 478 | mean_fpr = np.linspace(0, 1, 100) 479 | tprs = [] 480 | for n in range(0,10): 481 | train_x = train_X[n] 482 | train_y = train_Y[n] 483 | test_x = test_X[n] 484 | test_y = test_Y[n] 485 | 486 | rng = np.random.RandomState(2) 487 | clf = IsolationForest(n_estimators=estimator, max_samples=s, random_state=rng, bootstrap=True, max_features=1.0, contamination=0.5) 488 | clf.fit(train_x) 489 | 490 | #make predictions on testing data 491 | y_true, y_pred = test_y, clf.predict(test_x) 492 | #IsolationForest predicts +1 for inliers and -1 for outliers; flip the labels so that covert traffic (+1) is the positive class 493 | for n, l in enumerate(y_pred): 494 | if(l==1): 495 | y_pred[n] = -1 496 | elif(l==-1): 497 | y_pred[n] = 1 498 | 499 | fpr, tpr, thresholds = roc_curve(y_true, y_pred, drop_intermediate=True,pos_label=1) 500 | #print y_true 501 | #print y_pred 502 | tprs.append(interp(mean_fpr, fpr, tpr)) 503 | tprs[-1][0] = 0.0 504 | 505 | roc_auc = auc(fpr, tpr) 506 | #print "Fold %i auc: %f" % (n, roc_auc) 507 | 508 | mean_tpr = np.mean(tprs, axis=0) 509 | mean_tpr[-1] = 1.0 510 | mean_auc = auc(mean_fpr, mean_tpr) 511 | auc_report.append(mean_auc) 512 | 513 | if(mean_auc > max_auc): 514 | max_auc = mean_auc 515 | best_config = [mean_fpr, mean_tpr, estimator,s] 516 | print ("%f - estimator:%i, max-samples: %i" % (mean_auc, estimator, s)) 517 | 518 | print "################\n# Summary" 519 | print "Max. AUC: %f, Estimator: %i, Samples: %i" % (max_auc, best_config[2],best_config[3]) 520 | print "Avg. 
AUC: %f" % (np.mean(auc_report,axis=0)) 521 | #Figure properties 522 | 523 | fig = plt.figure() 524 | ax1 = fig.add_subplot(111) 525 | plt.xlim([0, 1]) 526 | plt.ylim([0, 1]) 527 | plt.xlabel('False Positive Rate', fontsize=26) 528 | plt.ylabel('True Positive Rate', fontsize=26) 529 | 530 | ax1.plot([0, 1], [0, 1], 'k--', lw=2, color='orange', label = 'Random Guess') 531 | ax1.grid(color='black', linestyle='dotted') 532 | plt.setp(ax1.get_xticklabels(), fontsize=16) 533 | plt.setp(ax1.get_yticklabels(), fontsize=16) 534 | plt.plot(best_config[0], best_config[1], color='b', label=r'ROC (AUC = %0.2f)' % (max_auc), lw=2, alpha=.8) 535 | plt.legend(loc='lower right', fontsize='x-large') 536 | 537 | fig.savefig('Isolation/' + feature_set + '/' + "DeltaShaper_Isolation_" + cfg[1] + ".pdf") # save the figure under the per-feature-set folder 538 | plt.close(fig) 539 | 540 | 541 | if __name__ == "__main__": 542 | 543 | cfgs = [ 544 | ["RegularTraffic", 545 | "DeltaShaperTraffic_320"], 546 | ["RegularTraffic", 547 | "DeltaShaperTraffic_160"]] 548 | 549 | if not os.path.exists('Isolation'): 550 | os.makedirs('Isolation') 551 | 552 | print "Isolation Forest - Summary Statistic Features - Set1" 553 | feature_set = 'Stats_60' #'Stats_60' / 'PL_60' 554 | data_folder = 'FeatureSets/' + feature_set + '/' 555 | if not os.path.exists('Isolation/' + feature_set): 556 | os.makedirs('Isolation/' + feature_set) 557 | 558 | 559 | for cfg in cfgs: 560 | runOptimizedIso_CV(data_folder, cfg, feature_set) 561 | print "#####################################\n" 562 | 563 | print "Isolation Forest - Packet Length Features - Set2" 564 | feature_set = 'PL_60' #'Stats_60' / 'PL_60' 565 | data_folder = 'FeatureSets/' + feature_set + '/' 566 | if not os.path.exists('Isolation/' + feature_set): 567 | os.makedirs('Isolation/' + feature_set) 568 | 569 | for cfg in cfgs: 570 | runOptimizedIso_CV(data_folder, cfg, feature_set) 571 | 572 | --------------------------------------------------------------------------------
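Note: the X2 classifier above decides per sample by comparing chi-square fit statistics against a "chat" model and a "censored" (covert) model and thresholding their ratio. The following is a minimal, self-contained sketch of that decision rule, assuming the scipy.stats.chisquare version pinned in requirements.txt; the decide() helper and the toy bin counts are illustrative only and are not part of the repository.

# Illustrative sketch of the chi-square ratio test (not a repository file).
import numpy as np
from scipy.stats import chisquare

def decide(sample, model_chat, model_censored, delta=1.0):
    # chisquare returns (statistic, p-value); only the statistic is used here.
    chat_stat, _ = chisquare(sample, model_chat)
    censored_stat, _ = chisquare(sample, model_censored)
    ratio = chat_stat / float(censored_stat)
    # A ratio above delta means the sample fits the censored model better,
    # so it is flagged as covert traffic.
    return "covert" if ratio > delta else "regular"

# Toy expected bin counts over four packet-length bins (totals match on purpose).
model_chat = np.array([10.0, 20.0, 30.0, 40.0])
model_censored = np.array([40.0, 30.0, 20.0, 10.0])
sample = np.array([38.0, 31.0, 19.0, 12.0])
print decide(sample, model_chat, model_censored)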